"""Native (non-Docker) edge runtime backend.

Runs each bot as a local subprocess launched from a configurable command
line, tails its stdout into a per-bot log file, probes a per-bot dashboard
TCP endpoint for readiness, and reaps orphaned gateway processes that match
the bot's config path.
"""

import codecs
import csv
import hashlib
import json
import signal
import socket
import os
import re
import shlex
import shutil
import subprocess
import threading
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional

import httpx
import psutil

from app.core.settings import (
    EDGE_BOTS_WORKSPACE_ROOT,
    EDGE_NATIVE_COMMAND,
    EDGE_NATIVE_DASHBOARD_URL,
    EDGE_NATIVE_WORKDIR,
)
from app.runtime.base import EdgeRuntimeBackend


@dataclass
class _NativeProcessRecord:
    """Book-keeping for one running native bot process."""

    process: subprocess.Popen[str]
    command: List[str]  # fully-resolved launch argv
    cwd: str
    log_path: str
    log_handle: Any  # open append-mode text handle for log_path
    dashboard_url: str
    dashboard_host: str
    dashboard_port: int
    cpu_cores: Optional[float]  # advisory limit only; not enforced natively
    memory_mb: Optional[int]
    storage_gb: Optional[int]
    # Signals the stdout drain thread to stop forwarding lines.
    stop_event: threading.Event = field(default_factory=threading.Event)
    stdout_thread: Optional[threading.Thread] = None
    last_error: str = ""


class EdgeNativeRuntimeBackend(EdgeRuntimeBackend):
    """Edge runtime that launches bots as host-native subprocesses."""

    runtime_kind = "native"

    def __init__(self) -> None:
        # Default launch command comes from settings; availability is
        # checked by resolving its executable on PATH.
        self._command = shlex.split(EDGE_NATIVE_COMMAND)
        self._native_available = bool(self._command and shutil.which(self._command[0]))
        self._last_errors: Dict[str, str] = {}
        self._records: Dict[str, _NativeProcessRecord] = {}
        self._lock = threading.RLock()

    def capabilities(self) -> Dict[str, Any]:
        """Describe what this backend supports (no Docker, native only)."""
        available = bool(self._native_available)
        return {
            "protocol": {"version": "1"},
            "runtime": {"docker": False, "native": available},
            "workspace": {
                "tree": True,
                "read_file": True,
                "write_markdown": True,
                "upload_files": True,
                "serve_file": True,
            },
            "monitor": {"logs": available, "ensure": available},
            "process": {"command": list(self._command), "available": available},
        }

    def has_image(self, tag: str) -> bool:
        # Native runtime has no image store; always report absent.
        return False

    def start_bot(
        self,
        bot_id: str,
        image_tag: Optional[str] = None,
        env_vars: Optional[Dict[str, str]] = None,
        workspace_root: Optional[str] = None,
        native_command: Optional[str] = None,
        native_workdir: Optional[str] = None,
        cpu_cores: Optional[float] = None,
        memory_mb: Optional[int] = None,
        storage_gb: Optional[int] = None,
        on_state_change: Optional[Callable[[str, dict], None]] = None,
    ) -> bool:
        """Launch the bot's native process and wait for its dashboard.

        Returns True on success (or if the bot is already running); on any
        failure records a human-readable reason via ``_set_last_error`` and
        returns False.  ``image_tag`` is accepted for interface parity with
        the Docker backend but unused here.
        """
        bot_id = str(bot_id or "").strip()
        if not bot_id:
            return False
        effective_env = dict(env_vars or {})
        launch_command = self._resolve_launch_command(
            native_command=native_command, env_vars=effective_env
        )
        if not self._is_launch_command_available(launch_command):
            self._set_last_error(
                bot_id,
                f"native command not available: {self._render_command(launch_command) or 'nanobot gateway'}",
            )
            return False
        # NOTE(review): the remainder of the start sequence (including the
        # dashboard readiness wait) runs while holding the backend lock —
        # confirm this serialization of concurrent starts/stops is intended.
        with self._lock:
            existing = self._records.get(bot_id)
            if existing and existing.process.poll() is None:
                # Already running: optionally (re)attach the monitor thread.
                if on_state_change:
                    self.ensure_monitor(bot_id, on_state_change)
                return True
            if existing:
                # Stale record for a dead process: release its resources.
                self._cleanup_record(bot_id, existing)
            state_root = self._bot_root(bot_id)
            workspace_dir = self._workspace_dir(bot_id=bot_id, workspace_root=workspace_root)
            config_path = self._config_path(bot_id, workspace_root=workspace_root)
            runtime_dir = os.path.join(os.path.dirname(config_path), "runtime")
            os.makedirs(runtime_dir, exist_ok=True)
            os.makedirs(workspace_dir, exist_ok=True)
            log_path = os.path.join(runtime_dir, "native.log")
            cwd = self._resolve_workdir(
                state_root, native_workdir=native_workdir, env_vars=effective_env
            )
            dashboard_host, dashboard_port, dashboard_url = self._resolve_dashboard_endpoint(
                bot_id, effective_env
            )
            # Child environment: inherit ours, overlay caller-provided vars,
            # then fill defaults only where the caller did not set them.
            env = os.environ.copy()
            env.update({str(k): str(v) for k, v in effective_env.items() if str(k).strip()})
            env.setdefault("PYTHONUNBUFFERED", "1")
            env.setdefault("EDGE_RUNTIME_KIND", "native")
            env.setdefault("EDGE_NODE_MODE", "native")
            env.setdefault("NANOBOT_BOT_ID", bot_id)
            env.setdefault("DASHBOARD_HOST", dashboard_host)
            env.setdefault("DASHBOARD_PORT", str(dashboard_port))
            env.setdefault("DASHBOARD_URL", dashboard_url)
            env.setdefault("NANOBOT_CONFIG", config_path)
            env.setdefault("NANOBOT_WORKSPACE", workspace_dir)
            if not os.path.isfile(config_path):
                self._set_last_error(bot_id, f"native config not found: {config_path}")
                return False
            # Kill any leftover gateway processes bound to this config before
            # starting a fresh one.
            self._terminate_orphan_processes(bot_id=bot_id, config_path=config_path)
            log_handle = open(log_path, "a", encoding="utf-8")
            command = self._build_launch_command(
                base_command=launch_command,
                config_path=config_path,
                workspace_dir=workspace_dir,
            )
            log_handle.write(
                f"[{self._now()}] native bootstrap command={shlex.join(command)} cwd={cwd} config={config_path} workspace={workspace_dir} dashboard={dashboard_url}\n"
            )
            log_handle.flush()
            try:
                process = subprocess.Popen(
                    command,
                    cwd=cwd,
                    env=env,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,  # merge stderr into the drained stream
                    text=True,
                    bufsize=1,  # line-buffered so the drain thread sees lines promptly
                    start_new_session=True,  # detach from our process group
                )
            except FileNotFoundError as exc:
                log_handle.write(f"[{self._now()}] native bootstrap failed: {exc}\n")
                log_handle.flush()
                log_handle.close()
                self._set_last_error(bot_id, f"native command not found: {exc}")
                return False
            except Exception as exc:
                log_handle.write(f"[{self._now()}] native bootstrap failed: {exc}\n")
                log_handle.flush()
                log_handle.close()
                self._set_last_error(bot_id, f"native start failed: {exc}")
                return False
            record = _NativeProcessRecord(
                process=process,
                command=command,
                cwd=cwd,
                log_path=log_path,
                log_handle=log_handle,
                dashboard_url=dashboard_url,
                dashboard_host=dashboard_host,
                dashboard_port=dashboard_port,
                cpu_cores=cpu_cores,
                memory_mb=memory_mb,
                storage_gb=storage_gb,
            )
            self._records[bot_id] = record
            # Drain stdout on a daemon thread: mirrors output into the log
            # file and forwards parsed monitor packets to the callback.
            record.stdout_thread = threading.Thread(
                target=self._drain_stdout,
                args=(bot_id, record, on_state_change),
                daemon=True,
            )
            record.stdout_thread.start()
            if not self._wait_for_dashboard_ready(record):
                # Dashboard never accepted a connection: tear everything down.
                self._set_last_error(bot_id, f"native dashboard did not become ready: {dashboard_url}")
                try:
                    if process.poll() is None:
                        process.terminate()
                        process.wait(timeout=5)
                except Exception:
                    pass
                self._cleanup_record(bot_id, record)
                self._records.pop(bot_id, None)
                return False
            self._set_last_error(bot_id, "")
            return True

    def ensure_monitor(self, bot_id: str, on_state_change: Callable[[str, dict], None]) -> bool:
        """(Re)attach a stdout drain thread for a running bot.

        Returns False when the bot is unknown or its process has exited;
        True when a live monitor thread exists (pre-existing or new).
        """
        record = self._records.get(bot_id)
        if record is None or record.process.poll() is not None:
            return False
        thread = record.stdout_thread
        if thread is not None and thread.is_alive():
            return True
        record.stdout_thread = threading.Thread(
            target=self._drain_stdout,
            args=(bot_id, record, on_state_change),
            daemon=True,
        )
        record.stdout_thread.start()
        return True

    def stop_bot(self, bot_id: str) -> bool:
        """Stop the bot's process (terminate, then kill) and reap orphans.

        Returns True if either the tracked process or at least one orphan
        was stopped.
        """
        bot_id = str(bot_id or "").strip()
        # NOTE(review): lock scoped to the record removal only — confirm the
        # shutdown itself does not need to hold the backend lock.
        with self._lock:
            record = self._records.pop(bot_id, None)
        stopped = False
        if record is not None:
            try:
                if record.process.poll() is None:
                    record.stop_event.set()
                    record.process.terminate()
                    try:
                        record.process.wait(timeout=8)
                    except Exception:
                        # Graceful shutdown timed out: force-kill.
                        record.process.kill()
                        record.process.wait(timeout=5)
                self._cleanup_record(bot_id, record)
                stopped = True
            except Exception as exc:
                self._set_last_error(bot_id, f"native stop failed: {exc}")
                self._cleanup_record(bot_id, record)
                return False
        # Also sweep processes we did not launch but that match this config.
        orphan_stopped = self._terminate_orphan_processes(
            bot_id=bot_id, config_path=self._config_path(bot_id)
        )
        return bool(stopped or orphan_stopped)

    def get_bot_status(self, bot_id: str) -> str:
        """Return "RUNNING" or "STOPPED" for the bot.

        An untracked bot still counts as RUNNING if a matching orphan
        gateway process exists on the host.
        """
        normalized_bot_id = str(bot_id or "").strip()
        record = self._records.get(normalized_bot_id)
        if record is None:
            return "RUNNING" if self._has_orphan_process(normalized_bot_id) else "STOPPED"
        try:
            return "RUNNING" if record.process.poll() is None else "STOPPED"
        except Exception:
            return "STOPPED"

    def get_bot_resource_snapshot(self, bot_id: str) -> Dict[str, Any]:
        """Build a Docker-shaped resource snapshot from psutil data.

        Network and block-I/O counters are always 0 (not measured natively);
        workspace disk usage stands in for container RW bytes.
        """
        bot_id = str(bot_id or "").strip()
        record = self._records.get(bot_id)
        snapshot: Dict[str, Any] = {
            "docker_status": self.get_bot_status(bot_id),
            "limits": {
                "cpu_cores": self._normalize_cpu_limit(record.cpu_cores if record else None),
                "memory_bytes": self._normalize_memory_limit(record.memory_mb if record else None),
                "storage_bytes": self._normalize_storage_limit(record.storage_gb if record else None),
                "nano_cpus": 0,
                "storage_opt_raw": "",
            },
            "usage": {
                "cpu_percent": 0.0,
                "memory_bytes": 0,
                "memory_limit_bytes": 0,
                "memory_percent": 0.0,
                "network_rx_bytes": 0,
                "network_tx_bytes": 0,
                "blk_read_bytes": 0,
                "blk_write_bytes": 0,
                "pids": 0,
                "container_rw_bytes": 0,
            },
        }
        if record is None or record.process.poll() is not None:
            return snapshot
        try:
            proc = psutil.Process(record.process.pid)
            cpu_percent = float(proc.cpu_percent(interval=None) or 0.0)
            memory_info = proc.memory_info()
            memory_bytes = int(getattr(memory_info, "rss", 0) or 0)
            # No per-process memory cap natively: report total system RAM.
            memory_limit = int(psutil.virtual_memory().total or 0)
            memory_percent = float(proc.memory_percent() or 0.0)
            children = proc.children(recursive=True)
            workspace_used = self._calc_workspace_used_bytes(bot_id)
            snapshot["usage"].update(
                {
                    "cpu_percent": round(cpu_percent, 2),
                    "memory_bytes": memory_bytes,
                    "memory_limit_bytes": memory_limit,
                    "memory_percent": round(memory_percent, 2),
                    "network_rx_bytes": 0,
                    "network_tx_bytes": 0,
                    "blk_read_bytes": 0,
                    "blk_write_bytes": 0,
                    "pids": 1 + len(children),
                    "container_rw_bytes": workspace_used,
                }
            )
        except Exception:
            # Process vanished or access denied: still report disk usage.
            workspace_used = self._calc_workspace_used_bytes(bot_id)
            snapshot["usage"]["container_rw_bytes"] = workspace_used
        return snapshot

    def get_recent_logs(self, bot_id: str, tail: int = 300) -> List[str]:
        """Return the last ``tail`` non-blank lines of the bot's native log."""
        log_path = self._log_path(str(bot_id or "").strip())
        if not os.path.isfile(log_path):
            return []
        try:
            with open(log_path, "r", encoding="utf-8", errors="ignore") as fh:
                rows = [line.rstrip("\n") for line in fh.readlines() if line.strip()]
            if tail > 0:
                return rows[-int(tail) :]
            return rows
        except Exception:
            return []

    def send_command(self, bot_id: str, command: str, media: Optional[List[str]] = None) -> bool:
        """POST a chat message to the bot's dashboard endpoint.

        Returns True only on HTTP 200; any other status or transport error
        is recorded via ``_set_last_error`` and returns False.
        """
        bot_id = str(bot_id or "").strip()
        record = self._records.get(bot_id)
        if record is None or record.process.poll() is not None:
            self._set_last_error(bot_id, "native process is not running")
            return False
        try:
            payload = {"message": command, "media": list(media or [])}
            # trust_env=False: ignore proxy env vars for this local call.
            with httpx.Client(timeout=5.0, trust_env=False) as client:
                resp = client.post(record.dashboard_url, json=payload)
                if resp.status_code == 200:
                    self._set_last_error(bot_id, "")
                    return True
                self._set_last_error(
                    bot_id, f"native dashboard returned {resp.status_code}: {resp.text[:300]}"
                )
                return False
        except Exception as exc:
            self._set_last_error(bot_id, f"native dashboard request failed: {exc}")
            return False

    def get_last_delivery_error(self, bot_id: str) -> str:
        """Return the most recent error message recorded for the bot."""
        bot_id = str(bot_id or "").strip()
        record = self._records.get(bot_id)
        if record is None:
            return str(self._last_errors.get(bot_id) or "").strip()
        return str(record.last_error or self._last_errors.get(bot_id) or "").strip()

    def parse_monitor_packet(self, line: str) -> Optional[Dict[str, Any]]:
        """Public wrapper around the log-line parser."""
        return self._parse_log_line(str(line or "").strip())

    def _drain_stdout(
        self,
        bot_id: str,
        record: _NativeProcessRecord,
        callback: Optional[Callable[[str, dict], None]] = None,
    ) -> None:
        """Pump the child's stdout into the log file and the callback.

        Runs on a daemon thread until the stream closes or ``stop_event``
        is set; always closes the stream and flushes the log on exit.
        """
        stream = record.process.stdout
        if stream is None:
            return
        try:
            for raw_line in iter(stream.readline, ""):
                if record.stop_event.is_set():
                    break
                line = str(raw_line or "").rstrip("\r\n")
                if not line:
                    continue
                try:
                    record.log_handle.write(f"{line}\n")
                    record.log_handle.flush()
                except Exception:
                    pass  # best-effort logging; never kill the drain loop
                if callback:
                    parsed = self._parse_log_line(line)
                    if parsed:
                        callback(bot_id, parsed)
                    # NOTE(review): every line is also forwarded as RAW_LOG,
                    # in addition to any parsed packet — confirm intended.
                    callback(bot_id, {"type": "RAW_LOG", "text": line})
        finally:
            try:
                stream.close()
            except Exception:
                pass
            try:
                record.log_handle.flush()
            except Exception:
                pass

    def _cleanup_record(self, bot_id: str, record: _NativeProcessRecord) -> None:
        """Stop the drain thread (via event) and close the log handle."""
        try:
            record.stop_event.set()
        except Exception:
            pass
        try:
            if record.log_handle and not record.log_handle.closed:
                record.log_handle.flush()
                record.log_handle.close()
        except Exception:
            pass

    def _set_last_error(self, bot_id: str, message: str) -> None:
        """Record ``message`` for the bot; empty string clears the error."""
        normalized_bot_id = str(bot_id or "").strip()
        self._last_errors[normalized_bot_id] = str(message or "").strip()
        record = self._records.get(normalized_bot_id)
        if record is None:
            return
        record.last_error = self._last_errors[normalized_bot_id]

    def _resolve_workdir(
        self,
        bot_root: str,
        *,
        native_workdir: Optional[str] = None,
        env_vars: Optional[Dict[str, str]] = None,
    ) -> str:
        """Pick the child's cwd: explicit arg > env var > setting > bot root."""
        configured = str(
            native_workdir
            or (env_vars or {}).get("EDGE_NATIVE_WORKDIR")
            or EDGE_NATIVE_WORKDIR
            or ""
        ).strip()
        if configured:
            return os.path.abspath(configured)
        return os.path.abspath(bot_root)

    def _resolve_dashboard_endpoint(self, bot_id: str, env_vars: Dict[str, str]) -> tuple[str, int, str]:
        """Resolve (host, port, url) for the bot's dashboard.

        Caller env vars win over process env vars; the port falls back to a
        deterministic per-bot default and is clamped to the valid TCP range.
        """
        host = str(
            env_vars.get("DASHBOARD_HOST")
            or os.getenv("EDGE_NATIVE_DASHBOARD_HOST")
            or "127.0.0.1"
        ).strip() or "127.0.0.1"
        raw_port = str(
            env_vars.get("DASHBOARD_PORT") or os.getenv("EDGE_NATIVE_DASHBOARD_PORT") or ""
        ).strip()
        try:
            port = int(raw_port) if raw_port else self._default_dashboard_port(bot_id)
        except Exception:
            port = self._default_dashboard_port(bot_id)
        port = max(1, min(port, 65535))
        url = str(
            env_vars.get("DASHBOARD_URL")
            or os.getenv("EDGE_NATIVE_DASHBOARD_URL")
            or f"http://{host}:{port}/chat"
        ).strip()
        if not url:
            url = f"http://{host}:{port}/chat"
        return host, port, url

    def _build_launch_command(self, *, base_command: List[str], config_path: str, workspace_dir: str) -> List[str]:
        """Append --config/--workspace flags unless already present."""
        command = list(base_command)
        has_config_flag = any(part in {"--config", "-c"} for part in command)
        has_workspace_flag = any(part in {"--workspace", "-w"} for part in command)
        if not has_config_flag:
            command.extend(["--config", config_path])
        if not has_workspace_flag:
            command.extend(["--workspace", workspace_dir])
        return command

    def _resolve_launch_command(self, *, native_command: Optional[str], env_vars: Dict[str, str]) -> List[str]:
        """Choose the launch argv: explicit arg > env override > default."""
        explicit = str(native_command or "").strip()
        if explicit:
            return self._parse_launcher_command(explicit)
        configured = str(env_vars.get("EDGE_NATIVE_COMMAND") or "").strip()
        if configured:
            rows = self._parse_launcher_command(configured)
            if rows:
                return rows
        return list(self._command)

    @staticmethod
    def _parse_launcher_command(raw_command: str) -> List[str]:
        """Parse a command string into argv.

        Accepts, in order of preference: a JSON list, a quoted CSV line,
        or a plain shell-quoted string.  Returns [] when unparseable.
        """
        text = str(raw_command or "").strip()
        if not text:
            return []
        if text.startswith("[") and text.endswith("]"):
            try:
                payload = json.loads(text)
                if isinstance(payload, list):
                    rows = [str(item or "").strip() for item in payload if str(item or "").strip()]
                    if rows:
                        return rows
            except Exception:
                pass
        if "," in text and any(mark in text for mark in ['"', "'"]):
            try:
                rows = [
                    str(item or "").strip()
                    for item in next(csv.reader([text], skipinitialspace=True))
                    if str(item or "").strip()
                ]
                if rows:
                    return rows
            except Exception:
                pass
        try:
            return [str(item or "").strip() for item in shlex.split(text) if str(item or "").strip()]
        except Exception:
            return []

    @staticmethod
    def _is_launch_command_available(command: List[str]) -> bool:
        """True when the command's executable resolves on PATH."""
        if not command:
            return False
        return bool(shutil.which(command[0]))

    @staticmethod
    def _render_command(command: List[str]) -> str:
        """Join argv into a display string, dropping empty parts."""
        return " ".join(str(part or "").strip() for part in command if str(part or "").strip())

    def _log_path(self, bot_id: str) -> str:
        """Path of the bot's native log, next to its config."""
        config_path = self._config_path(bot_id)
        return os.path.join(os.path.dirname(config_path), "runtime", "native.log")

    def _config_path(self, bot_id: str, workspace_root: Optional[str] = None) -> str:
        """Locate the bot's config.json.

        Tries, in order: an explicit workspace root, the root recorded in
        runtime-target.json, and finally the default bot root.  The first
        two are used only when the config file actually exists there.
        """
        configured_root = str(workspace_root or "").strip()
        if configured_root:
            external_config = os.path.abspath(
                os.path.join(
                    os.path.abspath(os.path.expanduser(configured_root)),
                    bot_id,
                    ".nanobot",
                    "config.json",
                )
            )
            if os.path.isfile(external_config):
                return external_config
        inferred_root = self._workspace_root_from_runtime_target(bot_id)
        if inferred_root:
            inferred_config = os.path.abspath(
                os.path.join(inferred_root, bot_id, ".nanobot", "config.json")
            )
            if os.path.isfile(inferred_config):
                return inferred_config
        return os.path.join(self._bot_root(bot_id), ".nanobot", "config.json")

    def _bot_root(self, bot_id: str) -> str:
        """Default per-bot state directory under the workspace root setting."""
        return os.path.abspath(os.path.join(EDGE_BOTS_WORKSPACE_ROOT, bot_id))

    def _workspace_dir(self, *, bot_id: str, workspace_root: Optional[str] = None) -> str:
        """Resolve the bot's workspace dir: explicit root > config > default."""
        configured_root = str(workspace_root or "").strip()
        if configured_root:
            normalized_root = os.path.abspath(os.path.expanduser(configured_root))
            return os.path.abspath(os.path.join(normalized_root, bot_id, ".nanobot", "workspace"))
        config_workspace = self._workspace_dir_from_config(bot_id)
        if config_workspace:
            return config_workspace
        return os.path.abspath(os.path.join(self._bot_root(bot_id), ".nanobot", "workspace"))

    def _workspace_dir_from_config(self, bot_id: str) -> Optional[str]:
        """Read agents.defaults.workspace from config.json, if present."""
        config_path = self._config_path(bot_id)
        if not os.path.isfile(config_path):
            return None
        try:
            with open(config_path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            if not isinstance(payload, dict):
                return None
            agents = payload.get("agents") if isinstance(payload.get("agents"), dict) else {}
            defaults = agents.get("defaults") if isinstance(agents.get("defaults"), dict) else {}
            workspace = str(defaults.get("workspace") or "").strip()
            if not workspace:
                return None
            return os.path.abspath(os.path.expanduser(workspace))
        except Exception:
            return None

    def _workspace_root_from_runtime_target(self, bot_id: str) -> str:
        """Read workspace_root from runtime-target.json; "" when absent."""
        path = os.path.join(self._bot_root(bot_id), ".nanobot", "runtime-target.json")
        if not os.path.isfile(path):
            return ""
        try:
            with open(path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            if not isinstance(payload, dict):
                return ""
            raw_root = str(payload.get("workspace_root") or "").strip()
            if not raw_root:
                return ""
            return os.path.abspath(os.path.expanduser(raw_root))
        except Exception:
            return ""

    def _has_orphan_process(self, bot_id: str) -> bool:
        """True when at least one untracked gateway matches this bot's config."""
        return bool(self._find_orphan_processes(bot_id=bot_id, config_path=self._config_path(bot_id)))

    def _find_orphan_processes(self, *, bot_id: str, config_path: str) -> List[psutil.Process]:
        """Scan host processes for nanobot gateways using this config path."""
        matches: List[psutil.Process] = []
        normalized_config_path = os.path.abspath(config_path)
        for proc in psutil.process_iter(["pid", "cmdline"]):
            try:
                cmdline = [str(part or "") for part in (proc.info.get("cmdline") or [])]
                if not cmdline:
                    continue
                joined = " ".join(cmdline)
                # Match only gateway invocations of the nanobot CLI module.
                if "nanobot.cli.commands" not in joined or " gateway" not in joined:
                    continue
                if normalized_config_path not in joined:
                    continue
                matches.append(proc)
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
            except Exception:
                continue
        return matches

    def _terminate_orphan_processes(self, *, bot_id: str, config_path: str) -> int:
        """SIGTERM (then SIGKILL) matching orphans; return count stopped."""
        stopped = 0
        for proc in self._find_orphan_processes(bot_id=bot_id, config_path=config_path):
            try:
                os.kill(int(proc.pid), signal.SIGTERM)
                try:
                    proc.wait(timeout=5)
                except psutil.TimeoutExpired:
                    # Did not exit gracefully: escalate to SIGKILL.
                    os.kill(int(proc.pid), signal.SIGKILL)
                    proc.wait(timeout=3)
                stopped += 1
            except (psutil.NoSuchProcess, ProcessLookupError):
                continue  # already gone
            except Exception as exc:
                self._set_last_error(bot_id, f"failed to cleanup orphan native process: {exc}")
        return stopped

    @staticmethod
    def _wait_for_dashboard_ready(record: _NativeProcessRecord, timeout_seconds: float = 8.0) -> bool:
        """Poll the dashboard TCP port until it accepts or the deadline hits.

        Returns False early if the process exits while waiting.
        """
        deadline = time.monotonic() + max(1.0, float(timeout_seconds or 8.0))
        while time.monotonic() < deadline:
            if record.process.poll() is not None:
                return False
            try:
                with socket.create_connection(
                    (record.dashboard_host, record.dashboard_port), timeout=0.5
                ):
                    return True
            except OSError:
                time.sleep(0.2)
                continue
        return False

    @staticmethod
    def _default_dashboard_port(bot_id: str) -> int:
        """Deterministic per-bot port in [19000, 20999], hashed from the id."""
        digest = hashlib.sha1(str(bot_id or "").strip().encode("utf-8")).hexdigest()
        return 19000 + (int(digest[:6], 16) % 2000)

    @staticmethod
    def _normalize_cpu_limit(value: Optional[float]) -> Optional[float]:
        """Round a CPU-core limit to 2 decimals; None/bad input -> None."""
        if value is None:
            return None
        try:
            return round(float(value), 2)
        except Exception:
            return None

    @staticmethod
    def _normalize_memory_limit(value: Optional[int]) -> Optional[int]:
        """Convert a MiB limit to bytes (clamped at 0); None/bad -> None."""
        if value is None:
            return None
        try:
            return max(0, int(value)) * 1024 * 1024
        except Exception:
            return None

    @staticmethod
    def _normalize_storage_limit(value: Optional[int]) -> Optional[int]:
        """Convert a GiB limit to bytes (clamped at 0); None/bad -> None."""
        if value is None:
            return None
        try:
            return max(0, int(value)) * 1024 * 1024 * 1024
        except Exception:
            return None

    def _calc_workspace_used_bytes(self, bot_id: str) -> int:
        """Sum file sizes under the bot's workspace directory."""
        total = 0
        root = self._workspace_dir(bot_id=bot_id)
        for current_root, _, files in os.walk(root):
            for filename in files:
                path = os.path.join(current_root, filename)
                try:
                    total += int(os.path.getsize(path))
                except Exception:
                    continue  # file removed/raced; skip it
        return total

    @staticmethod
    def _now() -> str:
        """UTC timestamp in ISO-8601 with a 'Z' suffix."""
        return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    @staticmethod
    def _parse_monitor_packet_json(line: str) -> Optional[Dict[str, Any]]:
        """Extract and normalize a dashboard JSON packet from a log line.

        Packets are delimited by __DASHBOARD_DATA_START__/__END__ markers.
        Returns a normalized event dict (AGENT_STATE / ASSISTANT_MESSAGE /
        BUS_EVENT) or None when the line carries no usable packet.
        """
        if "__DASHBOARD_DATA_START__" not in line or "__DASHBOARD_DATA_END__" not in line:
            return None
        try:
            raw_json = line.split("__DASHBOARD_DATA_START__", 1)[1].split("__DASHBOARD_DATA_END__", 1)[0].strip()
            data = json.loads(raw_json)
            event_type = str(data.get("type", "")).upper()
            content = str(data.get("content") or data.get("text") or "").strip()
            # Normalize media paths to forward slashes.
            media = [str(v).strip().replace("\\", "/") for v in (data.get("media") or []) if str(v).strip()]
            is_progress = bool(data.get("is_progress", False))
            is_tool = bool(data.get("is_tool", False))
            usage = data.get("usage") if isinstance(data.get("usage"), dict) else None
            request_id = str(data.get("request_id") or "").strip() or None
            provider = str(data.get("provider") or "").strip() or None
            model = str(data.get("model") or "").strip() or None
            if event_type == "AGENT_STATE":
                payload = data.get("payload") or {}
                state = str(payload.get("state") or data.get("state") or ("TOOL_CALL" if is_tool else "THINKING"))
                action_msg = str(payload.get("action_msg") or payload.get("msg") or content)
                return {
                    "type": "AGENT_STATE",
                    "channel": "dashboard",
                    "payload": {"state": state, "action_msg": action_msg},
                    "request_id": request_id,
                }
            if event_type == "ASSISTANT_MESSAGE":
                if content or media:
                    return {
                        "type": "ASSISTANT_MESSAGE",
                        "channel": "dashboard",
                        "text": content,
                        "media": media,
                        "usage": usage,
                        "request_id": request_id,
                        "provider": provider,
                        "model": model,
                    }
                return None  # empty assistant message carries no signal
            if event_type == "BUS_EVENT" or is_progress:
                return {
                    "type": "BUS_EVENT",
                    "channel": "dashboard",
                    "content": content,
                    "media": media,
                    "is_progress": is_progress,
                    "is_tool": is_tool,
                    "usage": usage,
                    "request_id": request_id,
                    "provider": provider,
                    "model": model,
                }
            # Unknown type but has content/media: treat as assistant message.
            if content or media:
                return {
                    "type": "ASSISTANT_MESSAGE",
                    "channel": "dashboard",
                    "text": content,
                    "media": media,
                    "usage": usage,
                    "request_id": request_id,
                    "provider": provider,
                    "model": model,
                }
        except Exception:
            return None
        return None

    @classmethod
    def _parse_log_line(cls, line: str) -> Optional[Dict[str, Any]]:
        """Classify a raw log line into a monitor event dict (or None).

        Tries the embedded dashboard packet first, then a set of textual
        patterns (processing/response/tool-call), then a generic error scan.
        """
        if "__DASHBOARD_DATA_START__" in line:
            packet = cls._parse_monitor_packet_json(line)
            if packet:
                return packet
        process_match = re.search(r"Processing message from ([\w\-]+):[^:]+:\s*(.+)$", line)
        if process_match:
            channel = process_match.group(1).strip().lower()
            action_msg = process_match.group(2).strip()
            return {
                "type": "AGENT_STATE",
                "channel": channel,
                "payload": {
                    "state": "THINKING",
                    "action_msg": action_msg[:4000],  # cap payload size
                },
            }
        response_match = re.search(r"Response to ([\w\-]+):[^:]+:\s*(.+)$", line)
        if response_match:
            channel = response_match.group(1).strip().lower()
            action_msg = response_match.group(2).strip()
            return {
                "type": "AGENT_STATE",
                "channel": channel,
                "payload": {
                    "state": "SUCCESS",
                    "action_msg": action_msg[:4000],
                },
            }
        tool_call_match = re.search(r"tool call:\s*(.+)$", line, re.IGNORECASE)
        if tool_call_match:
            return {
                "type": "AGENT_STATE",
                "payload": {
                    "state": "TOOL_CALL",
                    "action_msg": tool_call_match.group(1).strip()[:4000],
                },
            }
        lower = line.lower()
        if "error" in lower or "traceback" in lower:
            return {
                "type": "AGENT_STATE",
                "payload": {"state": "ERROR", "action_msg": "执行异常,请检查日志"},
            }
        return None