# dashboard-nanobot/dashboard-edge/app/services/runtime_service.py
#
# 512 lines
# 21 KiB
# Python

import json
import os
import shlex
import shutil
import csv
from datetime import datetime, timezone
import psutil
from fastapi import HTTPException
from app.core.settings import EDGE_BOTS_WORKSPACE_ROOT, EDGE_NODE_ID, EDGE_NODE_NAME
from app.runtime.base import EdgeRuntimeBackend
from app.runtime.factory import build_edge_runtime_backends, preferred_edge_runtime_kind
from app.schemas.edge import (
EdgeCommandRequest,
EdgeLogsResponse,
EdgeMonitorEnsureResponse,
EdgeMonitorPacket,
EdgeMonitorPacketsResponse,
EdgeNodeHeartbeatResponse,
EdgeNodeResourcesResponse,
EdgeNodeSelfResponse,
EdgeStatusResponse,
NODE_PROTOCOL_VERSION,
)
from app.schemas.runtime import EdgeStartBotRequest
class EdgeRuntimeService:
    """Dispatch bot lifecycle, monitoring and node-status requests to runtime backends.

    The service keeps a registry of backends keyed by runtime kind, persists
    the runtime kind a bot was last started with in a ``runtime-target.json``
    file inside the bot's ``.nanobot`` directory, and buffers the most recent
    monitor packets of every bot in memory.
    """

    # Upper bound of monitor packets kept in memory per bot.
    _MAX_RECENT_PACKETS = 200
    # Number of trailing log lines scanned when backfilling monitor packets.
    _BACKFILL_LOG_TAIL = 500

    def __init__(self) -> None:
        """Build the backend registry and the empty in-memory packet buffers."""
        # Keys are normalized (stripped, lower-cased); non-string keys returned
        # by the factory are ignored.
        self._runtime_backends: dict[str, EdgeRuntimeBackend] = {
            str(kind).strip().lower(): backend
            for kind, backend in build_edge_runtime_backends().items()
            if isinstance(kind, str)
        }
        # bot_id -> most recent packet rows, oldest first.
        self._recent_packets: dict[str, list[dict]] = {}
        # bot_id -> last sequence number handed out.
        self._packet_counters: dict[str, int] = {}
        # Bots whose packet buffer was already seeded from their logs.
        self._backfilled_bots: set[str] = set()

    def _runtime_kind(self) -> str:
        """Return the node's preferred runtime kind among the registered backends."""
        return preferred_edge_runtime_kind(self._runtime_backends)

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as an ISO-8601 string with a ``Z`` suffix."""
        return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    @staticmethod
    def _merge_flag_caps(target: dict[str, bool], section: object, *, lowercase: bool = False) -> None:
        """OR the boolean flags of one capability ``section`` into ``target``.

        Keys are stripped (and lower-cased when ``lowercase`` is set); blank
        keys are skipped.  Only a literal ``True`` enables a flag, and a flag
        that is already enabled in ``target`` stays enabled.
        """
        for key, value in dict(section or {}).items():
            name = str(key or "").strip()
            if lowercase:
                name = name.lower()
            if not name:
                continue
            target[name] = bool(target.get(name) or value is True)

    def capabilities(self) -> dict:
        """Return the merged capability document of all registered backends.

        The ``runtime``, ``workspace`` and ``monitor`` sections are OR-ed
        across backends.  ``process`` entries are copied verbatim, so a later
        backend overwrites an earlier one on key collision.  Sections that end
        up empty are omitted; ``protocol`` is always present.
        """
        runtime_caps: dict[str, bool] = {}
        workspace_caps: dict[str, bool] = {}
        monitor_caps: dict[str, bool] = {}
        process_caps: dict[str, object] = {}
        for backend in self._runtime_backends.values():
            reported = backend.capabilities() if hasattr(backend, "capabilities") else None
            # A backend reporting ``None`` is treated like one reporting nothing.
            current = dict(reported or {})
            self._merge_flag_caps(runtime_caps, current.get("runtime"), lowercase=True)
            self._merge_flag_caps(workspace_caps, current.get("workspace"))
            self._merge_flag_caps(monitor_caps, current.get("monitor"))
            for key, value in dict(current.get("process") or {}).items():
                name = str(key or "").strip()
                if name:
                    process_caps[name] = value

        caps: dict = {"protocol": {"version": NODE_PROTOCOL_VERSION}}
        sections = (
            ("runtime", runtime_caps),
            ("workspace", workspace_caps),
            ("monitor", monitor_caps),
            ("process", process_caps),
        )
        for section_name, values in sections:
            if values:
                caps[section_name] = values
        return caps

    async def start_bot(self, *, bot_id: str, payload: EdgeStartBotRequest) -> EdgeStatusResponse:
        """Persist the bot's runtime target and start it on the selected backend.

        Raises:
            HTTPException: 501 when no backend exists for the resolved kind,
                500 when the backend reports that the start failed.
        """
        runtime_kind = self._resolve_runtime_kind(bot_id, preferred=payload.runtime_kind)
        backend = self._backend_for_bot(bot_id, preferred=runtime_kind)
        workspace_root = str(payload.workspace_root or "").strip() or None
        # The target is written before starting, so later calls resolve the
        # same backend for this bot.
        self._write_runtime_target(
            bot_id=bot_id,
            runtime_kind=runtime_kind,
            workspace_root=workspace_root,
        )
        success = backend.start_bot(
            bot_id=bot_id,
            image_tag=str(payload.image_tag or "").strip(),
            env_vars=dict(payload.env_vars or {}),
            workspace_root=workspace_root,
            native_command=str(payload.native_command or "").strip() or None,
            native_workdir=str(payload.native_workdir or "").strip() or None,
            cpu_cores=float(payload.cpu_cores),
            memory_mb=int(payload.memory_mb),
            storage_gb=int(payload.storage_gb),
            on_state_change=self._record_monitor_packet,
        )
        if not success:
            detail = backend.get_last_delivery_error(bot_id) or f"Failed to start bot {bot_id} on dashboard-edge"
            raise HTTPException(status_code=500, detail=detail)
        return EdgeStatusResponse(status="started")

    def stop_bot(self, *, bot_id: str) -> EdgeStatusResponse:
        """Ask every backend to stop the bot, trying the resolved backend first.

        Stopping is best-effort: a backend that raises is skipped and the
        response is always ``stopped``.
        """
        resolved_kind = self._resolve_runtime_kind(bot_id)
        ordered_kinds: list[str] = [resolved_kind] if resolved_kind else []
        ordered_kinds.extend(kind for kind in self._runtime_backends if kind not in ordered_kinds)
        for kind in ordered_kinds:
            # The resolved kind is not guaranteed to be a registered backend.
            backend = self._runtime_backends.get(kind)
            if backend is None:
                continue
            try:
                backend.stop_bot(bot_id)
            except Exception:
                # Deliberate best-effort: keep trying the remaining backends.
                continue
        return EdgeStatusResponse(status="stopped")

    def send_command(self, *, bot_id: str, payload: EdgeCommandRequest) -> EdgeStatusResponse:
        """Deliver a command (with optional media) to the bot.

        Raises:
            HTTPException: 502 when the backend reports a failed delivery.
        """
        backend = self._backend_for_bot(bot_id)
        delivered = backend.send_command(bot_id, payload.command, media=list(payload.media or []))
        if not delivered:
            detail = backend.get_last_delivery_error(bot_id) or "command delivery failed"
            raise HTTPException(status_code=502, detail=detail)
        return EdgeStatusResponse(status="ok")

    def ensure_monitor(self, *, bot_id: str) -> EdgeMonitorEnsureResponse:
        """Ask the bot's backend to ensure monitoring, recording packets in this service."""
        backend = self._backend_for_bot(bot_id)
        ensured = backend.ensure_monitor(bot_id, self._record_monitor_packet)
        return EdgeMonitorEnsureResponse(ensured=bool(ensured))

    def get_recent_logs(self, *, bot_id: str, tail: int) -> EdgeLogsResponse:
        """Return the backend's recent logs for the bot, passing ``tail`` through."""
        backend = self._backend_for_bot(bot_id)
        return EdgeLogsResponse(bot_id=bot_id, logs=backend.get_recent_logs(bot_id, tail=tail))

    def get_monitor_packets(self, *, bot_id: str, after_seq: int = 0, limit: int = 200) -> EdgeMonitorPacketsResponse:
        """Return buffered monitor packets with a sequence number above ``after_seq``.

        Packets are returned in ascending sequence order; a positive ``limit``
        keeps only the first ``limit`` of them.  The first call for a bot
        seeds the buffer from the bot's recent logs.
        """
        self._backfill_monitor_packets(bot_id=bot_id)
        threshold = max(0, int(after_seq or 0))
        rows = [
            dict(row)
            for row in self._recent_packets.get(bot_id, [])
            if int(row.get("seq") or 0) > threshold
        ]
        rows.sort(key=lambda row: int(row.get("seq") or 0))
        if limit > 0:
            rows = rows[: int(limit)]
        latest_seq = int(self._packet_counters.get(bot_id, 0) or 0)
        packets = [
            EdgeMonitorPacket.model_validate(
                {
                    "protocol_version": NODE_PROTOCOL_VERSION,
                    "node_id": EDGE_NODE_ID,
                    "bot_id": bot_id,
                    # Values stored in the row take precedence over the defaults above.
                    **row,
                }
            )
            for row in rows
        ]
        return EdgeMonitorPacketsResponse(
            protocol_version=NODE_PROTOCOL_VERSION,
            node_id=EDGE_NODE_ID,
            bot_id=bot_id,
            latest_seq=latest_seq,
            packets=packets,
        )

    def get_runtime_status(self, *, bot_id: str) -> EdgeStatusResponse:
        """Return the status the bot's backend reports for it."""
        backend = self._backend_for_bot(bot_id)
        return EdgeStatusResponse(status=backend.get_bot_status(bot_id))

    def get_resource_snapshot(self, *, bot_id: str) -> dict:
        """Return the backend's resource snapshot, adding ``runtime_kind`` when absent."""
        backend = self._backend_for_bot(bot_id)
        snapshot = dict(backend.get_bot_resource_snapshot(bot_id) or {})
        snapshot.setdefault("runtime_kind", self._resolve_runtime_kind(bot_id))
        return snapshot

    def get_node_identity(self) -> EdgeNodeSelfResponse:
        """Describe this node: identity, capabilities and a fresh resource summary."""
        resources = self.get_node_resource_summary()
        return EdgeNodeSelfResponse(
            protocol_version=resources.protocol_version,
            node_id=EDGE_NODE_ID,
            display_name=EDGE_NODE_NAME,
            service="dashboard-edge",
            transport_kind="edge",
            runtime_kind=self._runtime_kind(),
            core_adapter="nanobot",
            capabilities=self.capabilities(),
            resources=dict(resources.resources or {}),
            reported_at=resources.reported_at,
        )

    def get_node_resource_summary(self) -> EdgeNodeResourcesResponse:
        """Collect CPU, memory and workspace-disk figures for this node.

        Every probe is best-effort: a failing ``psutil`` call yields zeros.
        When the disk probe fails, the used workspace size is computed by
        walking the workspace directory instead.
        """
        try:
            cpu_percent = float(psutil.cpu_percent(interval=None) or 0.0)
        except Exception:
            cpu_percent = 0.0

        try:
            memory = psutil.virtual_memory()
            memory_total = int(getattr(memory, "total", 0) or 0)
            memory_used = int(getattr(memory, "used", 0) or 0)
        except Exception:
            memory_total = 0
            memory_used = 0

        try:
            disk = psutil.disk_usage(EDGE_BOTS_WORKSPACE_ROOT)
            workspace_limit = int(getattr(disk, "total", 0) or 0)
            workspace_used = int(getattr(disk, "used", 0) or 0)
        except Exception:
            workspace_limit = 0
            workspace_used = self._calc_workspace_used_bytes()

        try:
            cpu_cores = float(psutil.cpu_count(logical=True) or 0)
        except Exception:
            cpu_cores = 0.0

        return EdgeNodeResourcesResponse(
            protocol_version=NODE_PROTOCOL_VERSION,
            node_id=EDGE_NODE_ID,
            display_name=EDGE_NODE_NAME,
            transport_kind="edge",
            runtime_kind=self._runtime_kind(),
            core_adapter="nanobot",
            resources={
                "configured_cpu_cores": round(cpu_cores, 2),
                "configured_memory_bytes": memory_total,
                "configured_storage_bytes": workspace_limit,
                "live_cpu_percent": round(cpu_percent, 2),
                "live_memory_used_bytes": memory_used,
                "live_memory_limit_bytes": memory_total,
                "workspace_used_bytes": workspace_used,
                "workspace_limit_bytes": workspace_limit,
            },
            reported_at=self._utc_now_iso(),
        )

    def heartbeat(self) -> EdgeNodeHeartbeatResponse:
        """Return a heartbeat carrying capabilities and a fresh resource summary."""
        node_resources = self.get_node_resource_summary()
        return EdgeNodeHeartbeatResponse(
            protocol_version=NODE_PROTOCOL_VERSION,
            node_id=EDGE_NODE_ID,
            display_name=EDGE_NODE_NAME,
            service="dashboard-edge",
            transport_kind="edge",
            runtime_kind=self._runtime_kind(),
            core_adapter="nanobot",
            capabilities=self.capabilities(),
            resources=dict(node_resources.resources or {}),
            reported_at=self._utc_now_iso(),
        )

    def native_preflight(self, *, native_command: str | None = None, native_workdir: str | None = None) -> dict:
        """Check whether a native launch command and working directory are usable.

        When no command is given, the command advertised in the ``process``
        capabilities of the ``native`` backend is checked instead.  A missing
        ``native_workdir`` counts as existing.

        Returns:
            A dict with ``ok``, ``command``, ``workdir``, ``command_available``,
            ``workdir_exists`` and a human-readable ``detail``.
        """
        raw_command = str(native_command or "").strip()
        command_parts: list[str] = []
        parse_error = ""
        if raw_command:
            command_parts, parse_error = self._parse_native_command(raw_command)
        if not raw_command and not command_parts:
            backend = self._runtime_backends.get("native")
            process_caps: dict = {}
            if backend is not None:
                process_caps = dict((backend.capabilities() or {}).get("process") or {})
            command_parts = [
                str(item or "").strip()
                for item in list(process_caps.get("command") or [])
                if str(item or "").strip()
            ]

        # Only the executable (first token) is looked up on PATH.
        command_available = bool(command_parts and shutil.which(command_parts[0]))

        configured_workdir = str(native_workdir or "").strip()
        if configured_workdir:
            workdir = os.path.abspath(configured_workdir)
            workdir_exists = os.path.isdir(workdir)
        else:
            workdir = ""
            workdir_exists = True

        detail_parts: list[str] = []
        if not command_available:
            detail_parts.append("native command not available")
        if not workdir_exists:
            detail_parts.append("native workdir does not exist")
        if parse_error:
            detail_parts.append(parse_error)
        if not detail_parts:
            detail_parts.append("native launcher ready")

        return {
            "ok": bool(command_available and workdir_exists),
            "command": command_parts,
            "workdir": workdir,
            "command_available": command_available,
            "workdir_exists": workdir_exists,
            "detail": "; ".join(detail_parts),
        }

    @staticmethod
    def _parse_native_command(raw_command: str) -> tuple[list[str], str]:
        """Split a native command string into argv tokens.

        Three notations are tried in order: a JSON list (``["a", "b"]``), a
        comma-separated list of quoted tokens (``"a", "b"`` or ``'a', 'b'``),
        and finally shell syntax via ``shlex``.

        Returns:
            ``(tokens, error)``; ``error`` is empty on success and ``tokens``
            is empty whenever ``error`` is set.  Blank input yields
            ``([], "")``.
        """
        text = str(raw_command or "").strip()
        if not text:
            return [], ""

        if text.startswith("[") and text.endswith("]"):
            try:
                payload = json.loads(text)
            except (ValueError, RecursionError):
                return [], "native command JSON is invalid"
            if isinstance(payload, list):
                rows = [str(item or "").strip() for item in payload if str(item or "").strip()]
                if rows:
                    return rows, ""
                return [], "native command JSON list is empty"

        if "," in text and any(mark in text for mark in ['"', "'"]):
            # csv only honours one quote character; fall back to the single
            # quote when no double quote is present so 'a', 'b' is unquoted too.
            quote_char = '"' if '"' in text else "'"
            try:
                fields = next(csv.reader([text], skipinitialspace=True, quotechar=quote_char))
            except (csv.Error, StopIteration):
                fields = []
            rows = [str(item or "").strip() for item in fields if str(item or "").strip()]
            if rows:
                return rows, ""

        try:
            tokens = shlex.split(text)
        except ValueError:
            # Raised by shlex for unbalanced quotes or a dangling escape.
            return [], "native command format is invalid"
        rows = [str(item or "").strip() for item in tokens if str(item or "").strip()]
        if rows:
            return rows, ""
        return [], "native command is empty"

    def _record_monitor_packet(self, bot_id: str, packet: dict) -> None:
        """Append a packet to the bot's buffer under the next sequence number.

        The buffer is trimmed to the newest ``_MAX_RECENT_PACKETS`` rows.
        """
        rows = self._recent_packets.setdefault(bot_id, [])
        next_seq = int(self._packet_counters.get(bot_id, 0) or 0) + 1
        self._packet_counters[bot_id] = next_seq
        rows.append(
            {
                "protocol_version": NODE_PROTOCOL_VERSION,
                "node_id": EDGE_NODE_ID,
                "bot_id": bot_id,
                "seq": next_seq,
                "captured_at": self._utc_now_iso(),
                "packet": dict(packet or {}),
            }
        )
        overflow = len(rows) - self._MAX_RECENT_PACKETS
        if overflow > 0:
            del rows[:overflow]

    def _backfill_monitor_packets(self, bot_id: str) -> None:
        """Seed the bot's packet buffer from its recent logs, at most once per bot."""
        if bot_id in self._backfilled_bots:
            return
        # The bot is marked before the attempt, so a failing backfill is not
        # retried on later calls.
        self._backfilled_bots.add(bot_id)
        backend = self._backend_for_bot(bot_id)
        for line in backend.get_recent_logs(bot_id, tail=self._BACKFILL_LOG_TAIL):
            packet = backend.parse_monitor_packet(line)
            if packet:
                self._record_monitor_packet(bot_id, packet)

    def _backend_for_bot(self, bot_id: str, preferred: str | None = None) -> EdgeRuntimeBackend:
        """Return the backend serving the bot.

        Raises:
            HTTPException: 501 when the resolved runtime kind has no backend.
        """
        runtime_kind = self._resolve_runtime_kind(bot_id, preferred=preferred)
        backend = self._runtime_backends.get(runtime_kind)
        if backend is None:
            raise HTTPException(status_code=501, detail=f"dashboard-edge runtime is not available: {runtime_kind}")
        return backend

    def _resolve_runtime_kind(self, bot_id: str, preferred: str | None = None) -> str:
        """Pick the runtime kind for a bot.

        Candidates are tried in order: the caller's ``preferred`` kind, the
        kind persisted in the bot's runtime target, the first backend that
        reports the bot as ``RUNNING``, and finally the node's preferred kind.
        The first two only win when a backend is registered for them.
        """
        normalized_preferred = self._normalize_runtime_kind(preferred, allow_empty=True)
        if normalized_preferred and normalized_preferred in self._runtime_backends:
            return normalized_preferred

        persisted = self._normalize_runtime_kind(self._read_runtime_target(bot_id), allow_empty=True)
        if persisted and persisted in self._runtime_backends:
            return persisted

        for runtime_kind, backend in self._runtime_backends.items():
            try:
                status = str(backend.get_bot_status(bot_id) or "").strip().upper()
            except Exception:
                # A backend that cannot report a status is simply not a match.
                continue
            if status == "RUNNING":
                return runtime_kind
        return self._runtime_kind()

    @staticmethod
    def _normalize_runtime_kind(value: str | None, *, allow_empty: bool = False) -> str:
        """Normalize a runtime kind to ``docker`` or ``native``.

        Unknown values become ``docker``.  A blank value becomes ``""`` when
        ``allow_empty`` is set and ``docker`` otherwise.
        """
        text = str(value or "").strip().lower()
        if allow_empty and not text:
            return ""
        return text if text in {"docker", "native"} else "docker"

    @staticmethod
    def _runtime_target_path(bot_id: str) -> str:
        """Return the runtime-target file path under the node's workspace root."""
        return os.path.join(EDGE_BOTS_WORKSPACE_ROOT, str(bot_id or "").strip(), ".nanobot", "runtime-target.json")

    @staticmethod
    def _config_path(bot_id: str) -> str:
        """Return the bot's ``config.json`` path under the node's workspace root."""
        return os.path.join(EDGE_BOTS_WORKSPACE_ROOT, str(bot_id or "").strip(), ".nanobot", "config.json")

    @staticmethod
    def _external_target_path(workspace_root: str, bot_id: str) -> str:
        """Return the runtime-target file path under an explicit workspace root."""
        return os.path.join(workspace_root, str(bot_id or "").strip(), ".nanobot", "runtime-target.json")

    def _read_runtime_target(self, bot_id: str) -> str:
        """Return the persisted runtime kind (lower-cased), or ``""`` when none is stored."""
        payload = self._read_runtime_target_payload(bot_id)
        if isinstance(payload, dict):
            return str(payload.get("runtime_kind") or "").strip().lower()
        return ""

    def _read_runtime_target_payload(self, bot_id: str) -> dict:
        """Return the first readable runtime-target JSON object, or ``{}``."""
        for path in self._runtime_target_paths_for_read(bot_id):
            if not os.path.isfile(path):
                continue
            try:
                with open(path, "r", encoding="utf-8") as fh:
                    payload = json.load(fh)
            except Exception:
                # Best-effort: an unreadable or malformed file is skipped.
                continue
            if isinstance(payload, dict):
                return payload
        return {}

    def _write_runtime_target(self, *, bot_id: str, runtime_kind: str, workspace_root: str | None = None) -> None:
        """Persist the bot's runtime kind (and workspace root) to its target file.

        ``workspace_root=None`` keeps a previously stored root; a blank string
        removes it.  When the file is written to an external workspace, a
        stale copy under the node's own workspace root is removed.
        """
        payload = dict(self._read_runtime_target_payload(bot_id))
        payload["runtime_kind"] = self._normalize_runtime_kind(runtime_kind)
        if workspace_root is not None:
            normalized_root = str(workspace_root or "").strip()
            if normalized_root:
                payload["workspace_root"] = os.path.abspath(os.path.expanduser(normalized_root))
            else:
                payload.pop("workspace_root", None)

        paths = self._runtime_target_paths(bot_id=bot_id, payload=payload)
        for path in paths:
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh, ensure_ascii=False, indent=2)

        primary = self._runtime_target_path(bot_id)
        if primary not in paths and os.path.isfile(primary):
            try:
                os.remove(primary)
            except OSError:
                # Cleanup is best-effort; the external file is already written.
                pass

    def _runtime_target_paths(self, *, bot_id: str, payload: dict) -> list[str]:
        """Return where the runtime target must be written for this payload.

        That is the external workspace path when the payload names a workspace
        root different from the node's own, otherwise the primary path.
        """
        primary = self._runtime_target_path(bot_id)
        workspace_root = str(payload.get("workspace_root") or "").strip()
        if workspace_root:
            external = self._external_target_path(
                os.path.abspath(os.path.expanduser(workspace_root)),
                bot_id,
            )
            if os.path.abspath(external) != os.path.abspath(primary):
                return [external]
        return [primary]

    def _runtime_target_paths_for_read(self, bot_id: str) -> list[str]:
        """Return candidate runtime-target paths, external workspace first."""
        primary = self._runtime_target_path(bot_id)
        candidates: list[str] = [primary]
        workspace_root = self._workspace_root_from_config(bot_id)
        if workspace_root:
            external = self._external_target_path(workspace_root, bot_id)
            if os.path.abspath(external) != os.path.abspath(primary):
                candidates.insert(0, external)
        return candidates

    def _workspace_root_from_config(self, bot_id: str) -> str:
        """Derive the bot's workspace root from ``agents.defaults.workspace`` in its config.

        The configured workspace is expected to contain
        ``/<bot_id>/.nanobot/workspace``; everything before the last
        occurrence of that marker is the root.  Returns ``""`` when the config
        is missing, malformed or does not match.
        """
        path = self._config_path(bot_id)
        if not os.path.isfile(path):
            return ""
        try:
            with open(path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            if not isinstance(payload, dict):
                return ""
            agents = payload.get("agents")
            if not isinstance(agents, dict):
                return ""
            defaults = agents.get("defaults")
            if not isinstance(defaults, dict):
                return ""
            workspace = str(defaults.get("workspace") or "").strip()
            if not workspace:
                return ""
            normalized_workspace = os.path.abspath(os.path.expanduser(workspace))
            marker = f"{os.sep}{str(bot_id or '').strip()}{os.sep}.nanobot{os.sep}workspace"
            if marker in normalized_workspace:
                return normalized_workspace.rsplit(marker, 1)[0]
        except Exception:
            # Best-effort: any problem reading the config means "no external root".
            return ""
        return ""

    @staticmethod
    def _calc_workspace_used_bytes() -> int:
        """Sum the sizes of all files below the node's workspace root.

        Files whose size cannot be read are skipped.
        """
        total = 0
        for root, _, files in os.walk(EDGE_BOTS_WORKSPACE_ROOT):
            for filename in files:
                try:
                    total += int(os.path.getsize(os.path.join(root, filename)))
                except OSError:
                    continue
        return total
# Module-level service instance, constructed when this module is imported.
edge_runtime_service = EdgeRuntimeService()