v0.1.4-p1

2026-03-12 01:20:57 +08:00 · 2026-03-12 01:20:57 +08:00 · 6795fedbfe
parent 590eae9f0c
commit 6795fedbfe
15 changed files with 853 additions and 21 deletions
--- a/.env.prod.example
+++ b/.env.prod.example
@ -43,3 +43,15 @@ PANEL_ACCESS_PASSWORD=change_me_panel_password
 # Max upload size for backend validation (MB)
 UPLOAD_MAX_MB=200
 # Local speech-to-text (Whisper via whisper.cpp model file)
 STT_ENABLED=true
 STT_MODEL=ggml-small-q8_0.bin
 STT_MODEL_DIR=${HOST_DATA_ROOT}/model
 STT_DEVICE=cpu
 STT_MAX_AUDIO_SECONDS=20
 STT_DEFAULT_LANGUAGE=zh
 STT_FORCE_SIMPLIFIED=true
 STT_AUDIO_PREPROCESS=true
 STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
 STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文，英文单词、缩写、品牌名和数字保持原文，不要翻译。
--- a/README.md
+++ b/README.md
@ -104,6 +104,8 @@ graph TD
   - 配置绝对路径：
     - `HOST_DATA_ROOT`
     - `HOST_BOTS_WORKSPACE_ROOT`
   - 如启用本地语音识别，请将 Whisper `.bin` 模型文件放到 `${HOST_DATA_ROOT}/model/`
     并让 `STT_MODEL` 指向完整文件名，例如 `ggml-small-q8_0.bin`
   - 中国网络建议配置加速项：
     - `PIP_INDEX_URL`、`PIP_TRUSTED_HOST`
     - `NPM_REGISTRY`
@ -120,3 +122,4 @@ graph TD
 - 必须挂载 `/var/run/docker.sock`，否则后端无法操作 Bot 镜像与容器。
 - `HOST_BOTS_WORKSPACE_ROOT` 必须是宿主机绝对路径，并且在 `docker-compose.prod.yml` 中以“同路径”挂载到后端容器。
  原因：后端通过 Docker API 创建 Bot 容器时，使用的是宿主机可见的 bind 路径。
 - 语音识别当前基于 `pywhispercpp==1.3.1` + Whisper `.bin` 模型文件，不使用 `faster-whisper`。
--- a/backend/.env.example
+++ b/backend/.env.example
@ -27,6 +27,18 @@ PANEL_ACCESS_PASSWORD=
 # Max upload size for backend validation (MB)
 UPLOAD_MAX_MB=100
 # Local speech-to-text (Whisper via whisper.cpp model file)
 STT_ENABLED=true
 STT_MODEL=ggml-small-q8_0.bin
 STT_MODEL_DIR=../data/model
 STT_DEVICE=cpu
 STT_MAX_AUDIO_SECONDS=20
 STT_DEFAULT_LANGUAGE=zh
 STT_FORCE_SIMPLIFIED=true
 STT_AUDIO_PREPROCESS=true
 STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
 STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文，英文单词、缩写、品牌名和数字保持原文，不要翻译。
 # Local backend server options (for `python3 main.py`)
 APP_HOST=0.0.0.0
 APP_PORT=8000
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@ -13,6 +13,9 @@ ARG PIP_TRUSTED_HOST=
 COPY backend/requirements.txt ./requirements.txt
 RUN if [ -n "${PIP_INDEX_URL}" ]; then pip config set global.index-url "${PIP_INDEX_URL}"; fi \
    && if [ -n "${PIP_TRUSTED_HOST}" ]; then pip config set global.trusted-host "${PIP_TRUSTED_HOST}"; fi \
    && apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --upgrade pip \
    && pip install -r requirements.txt
--- a/backend/core/settings.py
+++ b/backend/core/settings.py
@ -48,6 +48,7 @@ def _normalize_dir_path(path_value: str) -> str:
    raw = str(path_value or "").strip()
    if not raw:
        return raw
    raw = os.path.expandvars(os.path.expanduser(raw))
    p = Path(raw)
    if p.is_absolute():
        return str(p)
@ -117,6 +118,26 @@ DATABASE_ENGINE: Final[str] = _database_engine(DATABASE_URL)
 DATABASE_URL_DISPLAY: Final[str] = _mask_database_url(DATABASE_URL)
 DATABASE_ECHO: Final[bool] = _env_bool("DATABASE_ECHO", True)
 UPLOAD_MAX_MB: Final[int] = _env_int("UPLOAD_MAX_MB", 100, 1, 2048)
 # --- Local speech-to-text (Whisper model file loaded through whisper.cpp) ---
 STT_ENABLED: Final[bool] = _env_bool("STT_ENABLED", True)
 # File name (or path) of the model; defaults to the small quantised model.
 STT_MODEL: Final[str] = str(os.getenv("STT_MODEL") or "ggml-small-q8_0.bin").strip()
 # Model directory: use the configured STT_MODEL_DIR unless it is set but
 # missing on disk while <DATA_ROOT>/model exists, in which case fall back to
 # the default. Written as a single declaration because a Final name may only
 # be declared once per module (PEP 591).
 _DEFAULT_STT_MODEL_DIR: Final[Path] = (Path(DATA_ROOT) / "model").resolve()
 _configured_stt_model_dir = _normalize_dir_path(os.getenv("STT_MODEL_DIR", str(_DEFAULT_STT_MODEL_DIR)))
 STT_MODEL_DIR: Final[str] = (
    str(_DEFAULT_STT_MODEL_DIR)
    if (
        _configured_stt_model_dir
        and not Path(_configured_stt_model_dir).exists()
        and _DEFAULT_STT_MODEL_DIR.exists()
    )
    else _configured_stt_model_dir
 )
 STT_DEVICE: Final[str] = str(os.getenv("STT_DEVICE") or "cpu").strip().lower() or "cpu"
 # NOTE(review): the numeric arguments look like (default, min, max) - confirm
 # against _env_int.
 STT_MAX_AUDIO_SECONDS: Final[int] = _env_int("STT_MAX_AUDIO_SECONDS", 20, 5, 600)
 STT_DEFAULT_LANGUAGE: Final[str] = str(os.getenv("STT_DEFAULT_LANGUAGE") or "zh").strip().lower() or "zh"
 STT_FORCE_SIMPLIFIED: Final[bool] = _env_bool("STT_FORCE_SIMPLIFIED", True)
 STT_AUDIO_PREPROCESS: Final[bool] = _env_bool("STT_AUDIO_PREPROCESS", True)
 # ffmpeg "-af" filter chain applied when preprocessing is enabled.
 STT_AUDIO_FILTER: Final[str] = str(
    os.getenv("STT_AUDIO_FILTER") or "highpass=f=120,lowpass=f=7600,afftdn=nf=-20"
 ).strip()
 # Prompt text handed to the transcriber as its initial prompt.
 STT_INITIAL_PROMPT: Final[str] = str(
    os.getenv("STT_INITIAL_PROMPT")
    or "以下内容可能包含简体中文和英文术语。请优先输出简体中文，英文单词、缩写、品牌名和数字保持原文，不要翻译。"
 ).strip()
 REDIS_ENABLED: Final[bool] = _env_bool("REDIS_ENABLED", False)
 REDIS_URL: Final[str] = str(os.getenv("REDIS_URL") or "").strip()
--- a/backend/core/speech_service.py
+++ b/backend/core/speech_service.py
@ -0,0 +1,259 @@
 from __future__ import annotations
 import os
 import shutil
 import subprocess
 import tempfile
 import threading
 from pathlib import Path
 from typing import Any, Dict, Optional
 from core.settings import (
    STT_AUDIO_FILTER,
    STT_AUDIO_PREPROCESS,
    STT_DEVICE,
    STT_ENABLED,
    STT_FORCE_SIMPLIFIED,
    STT_INITIAL_PROMPT,
    STT_MAX_AUDIO_SECONDS,
    STT_MODEL,
    STT_MODEL_DIR,
 )
 class SpeechServiceError(RuntimeError):
    """Base class for every error raised by the speech-to-text service."""
 class SpeechDisabledError(SpeechServiceError):
    """Raised when transcription is requested while speech-to-text is disabled."""
 class SpeechDurationError(SpeechServiceError):
    """Raised when the audio is longer than the configured maximum duration."""
 class WhisperSpeechService:
    def __init__(self) -> None:
        self._model: Any = None
        self._model_source: str = ""
        self._backend: str = ""
        self._model_lock = threading.Lock()
    def _resolve_model_source(self) -> str:
        """Return the absolute path of the whisper.cpp model file to load.

        ``STT_MODEL`` is treated as a path when it contains a path separator;
        otherwise it must be an exact ``.bin`` file name located directly in
        ``STT_MODEL_DIR``. No alias or automatic model detection is attempted.

        Raises:
            SpeechServiceError: if ``STT_MODEL`` is empty or does not end in
                ``.bin``, if ``STT_MODEL_DIR`` is empty or not a directory, or
                if the model file does not exist.
        """
        model = str(STT_MODEL or "").strip()
        model_dir = str(STT_MODEL_DIR or "").strip()
        if not model:
            raise SpeechServiceError(
                "STT_MODEL is empty. Please set the full model file name, e.g. ggml-small-q8_0.bin."
            )
        # If STT_MODEL itself is an absolute/relative path, use it directly.
        if any(sep in model for sep in ("/", "\\")):
            direct = Path(model).expanduser()
            if not direct.exists() or not direct.is_file():
                raise SpeechServiceError(f"STT model file not found: {direct}")
            if direct.suffix.lower() != ".bin":
                raise SpeechServiceError(
                    "STT_MODEL must point to a whisper.cpp ggml .bin model file."
                )
            return str(direct.resolve())
        # Strict mode: only exact filename, no alias/auto detection.
        if Path(model).suffix.lower() != ".bin":
            raise SpeechServiceError(
                "STT_MODEL must be the exact model file name (with .bin), e.g. ggml-small-q8_0.bin."
            )
        if not model_dir:
            raise SpeechServiceError("STT_MODEL_DIR is empty.")
        root = Path(model_dir).expanduser()
        if not root.exists() or not root.is_dir():
            raise SpeechServiceError(f"STT_MODEL_DIR does not exist: {root}")
        candidate = root / model
        if not candidate.exists() or not candidate.is_file():
            raise SpeechServiceError(
                f"STT model file not found under STT_MODEL_DIR: {candidate}"
            )
        return str(candidate.resolve())
    def _load_model(self) -> Any:
        """Return the pywhispercpp model for the configured file, building it once.

        The cache is keyed on the resolved model path, which is recomputed on
        every call.

        Raises:
            SpeechServiceError: if the model path cannot be resolved or
                ``pywhispercpp`` cannot be imported.
        """
        source = self._resolve_model_source()

        def _is_cached() -> bool:
            return self._model is not None and self._model_source == source

        # Checked once without the lock and again while holding it, so callers
        # that arrive during construction reuse the model built by the first.
        if _is_cached():
            return self._model
        with self._model_lock:
            if not _is_cached():
                try:
                    from pywhispercpp.model import Model  # type: ignore
                except Exception as exc:
                    raise SpeechServiceError(
                        "pywhispercpp is not installed in the active backend environment. "
                        "Run pip install -r backend/requirements.txt or rebuild the backend image."
                    ) from exc
                self._model = Model(
                    source,
                    print_realtime=False,
                    print_progress=False,
                )
                self._backend = "pywhispercpp"
                self._model_source = source
            return self._model
    @staticmethod
    def _preprocess_audio(file_path: str) -> str:
        """Convert the input to 16 kHz mono 16-bit PCM WAV with ffmpeg, best effort.

        Returns:
            The path of a new temporary ``.wav`` file when the conversion
            succeeds (the caller must delete it), otherwise the stripped input
            path unchanged. The input is returned as-is when preprocessing is
            disabled, the file is missing, ffmpeg is not on PATH, or ffmpeg
            fails, times out, or produces an empty file.
        """
        target = str(file_path or "").strip()
        if not STT_AUDIO_PREPROCESS or not target or not os.path.isfile(target):
            return target
        if shutil.which("ffmpeg") is None:
            return target
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav", prefix=".speech_clean_")
        tmp_path = tmp.name
        tmp.close()
        cmd = ["ffmpeg", "-y", "-i", target, "-vn", "-ac", "1", "-ar", "16000"]
        audio_filter = str(STT_AUDIO_FILTER or "").strip()
        if audio_filter:
            cmd.extend(["-af", audio_filter])
        cmd.extend(["-c:a", "pcm_s16le", tmp_path])
        # Bound the conversion so a stalled ffmpeg cannot pin the worker thread
        # forever; subprocess.run kills the child before raising TimeoutExpired.
        timeout_seconds = max(30.0, float(STT_MAX_AUDIO_SECONDS) * 3.0)
        converted = False
        try:
            completed = subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,  # never let ffmpeg read the server's stdin
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
                timeout=timeout_seconds,
            )
            converted = (
                completed.returncode == 0
                and os.path.isfile(tmp_path)
                and os.path.getsize(tmp_path) > 0
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            # Preprocessing is optional: fall back to the original file.
            converted = False
        if converted:
            return tmp_path
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing more can be done about a leftover temp file here.
            pass
        return target
    @staticmethod
    def _probe_audio_duration_seconds(file_path: str) -> Optional[float]:
        try:
            import av  # type: ignore
            with av.open(file_path) as container:
                if container.duration is not None:
                    # container.duration is in av.time_base units.
                    return max(0.0, float(container.duration / av.time_base))
                for stream in container.streams:
                    if stream.type != "audio":
                        continue
                    if stream.duration is not None and stream.time_base is not None:
                        return max(0.0, float(stream.duration * stream.time_base))
        except Exception:
            return None
        return None
    @staticmethod
    def _normalize_text(text: str) -> str:
        content = str(text or "").strip()
        if not content or not STT_FORCE_SIMPLIFIED:
            return content
        try:
            from opencc_purepy import OpenCC  # type: ignore
            return str(OpenCC("t2s").convert(content) or "").strip() or content
        except Exception:
            return content
    @staticmethod
    def _filter_supported_transcribe_kwargs(model: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        try:
            available = set(model.get_params().keys())
        except Exception:
            return kwargs
        return {key: value for key, value in kwargs.items() if key in available}
    def transcribe_file(self, file_path: str, language: Optional[str] = None) -> Dict[str, Any]:
        """Transcribe one audio file and return the text plus metadata.

        Args:
            file_path: Path of the audio file on disk.
            language: Language code to force; empty, "auto", "null" or "none"
                leaves the choice to the model.

        Returns:
            A dict with the keys ``text``, ``language``, ``duration_seconds``,
            ``max_audio_seconds``, ``model``, ``device`` and ``backend``.

        Raises:
            SpeechDisabledError: if STT_ENABLED is false.
            SpeechDurationError: if the probed duration or a segment end time
                exceeds STT_MAX_AUDIO_SECONDS (plus 0.3 s of tolerance).
            SpeechServiceError: if the file is missing, transcription fails,
                or no text was produced.
        """
        # NOTE(review): the shared model is used here without a lock - confirm
        # that pywhispercpp tolerates concurrent transcribe() calls.
        if not STT_ENABLED:
            raise SpeechDisabledError("Speech-to-text is disabled")
        target = str(file_path or "").strip()
        if not (target and os.path.isfile(target)):
            raise SpeechServiceError("Audio file not found")
        limit_seconds = float(STT_MAX_AUDIO_SECONDS) + 0.3
        too_long = f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds"
        duration_seconds = self._probe_audio_duration_seconds(target)
        if duration_seconds is not None and duration_seconds > limit_seconds:
            raise SpeechDurationError(too_long)
        prepared_target = self._preprocess_audio(target)
        try:
            model = self._load_model()
            requested = str(language or "").strip().lower()
            normalized_lang: Optional[str] = (
                requested if requested and requested not in {"auto", "null", "none"} else None
            )
            options: Dict[str, Any] = {
                "print_realtime": False,
                "print_progress": False,
                "no_context": True,
                "suppress_non_speech_tokens": True,
            }
            if normalized_lang:
                options["language"] = normalized_lang
            initial_prompt = str(STT_INITIAL_PROMPT or "").strip()
            if initial_prompt:
                options["initial_prompt"] = initial_prompt
            options = self._filter_supported_transcribe_kwargs(model, options)
            try:
                segments = model.transcribe(prepared_target, **options)
            except Exception as exc:
                raise SpeechServiceError(
                    f"pywhispercpp transcription failed: {exc}. "
                    "If input is not wav, install ffmpeg in runtime image."
                ) from exc
            pieces = []
            last_end = 0.0
            for segment in segments:
                piece = str(getattr(segment, "text", "") or "").strip()
                if piece:
                    pieces.append(piece)
                try:
                    # t1 is divided by 100 to get seconds (presumably 10 ms
                    # ticks - confirm against pywhispercpp).
                    last_end = max(last_end, float(getattr(segment, "t1", 0.0) or 0.0) / 100.0)
                except Exception:
                    pass
                if last_end > limit_seconds:
                    raise SpeechDurationError(too_long)
            text = self._normalize_text(" ".join(pieces).strip())
            if not text:
                raise SpeechServiceError("No speech detected")
            # Fall back to the last segment end when probing gave no duration.
            if duration_seconds is None and last_end > 0:
                duration_seconds = last_end
            return {
                "text": text,
                # Only a caller-forced language is reported; nothing is detected.
                "language": normalized_lang or None,
                "duration_seconds": duration_seconds,
                "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
                "model": STT_MODEL,
                "device": STT_DEVICE,
                "backend": self._backend or "unknown",
            }
        finally:
            # Remove the temporary WAV created by preprocessing, if any.
            if prepared_target != target and os.path.exists(prepared_target):
                try:
                    os.remove(prepared_target)
                except Exception:
                    pass
--- a/backend/main.py
+++ b/backend/main.py
@ -1,5 +1,6 @@
 import asyncio
 import json
 import logging
 import mimetypes
 import os
 import re
@ -12,7 +13,7 @@ from urllib.parse import unquote
 import httpx
 from pydantic import BaseModel
-from fastapi import Depends, FastAPI, File, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
+from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
 from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from sqlmodel import Session, select
@ -21,6 +22,12 @@ from core.config_manager import BotConfigManager
 from core.cache import cache
 from core.database import engine, get_session, init_database
 from core.docker_manager import BotDockerManager
 from core.speech_service import (
    SpeechDisabledError,
    SpeechDurationError,
    SpeechServiceError,
    WhisperSpeechService,
 )
 from core.settings import (
    BOTS_WORKSPACE_ROOT,
    DATA_ROOT,
@ -37,11 +44,17 @@ from core.settings import (
    REDIS_ENABLED,
    REDIS_PREFIX,
    REDIS_URL,
    STT_DEVICE,
    STT_DEFAULT_LANGUAGE,
    STT_ENABLED,
    STT_MAX_AUDIO_SECONDS,
    STT_MODEL,
    UPLOAD_MAX_MB,
 )
 from models.bot import BotInstance, BotMessage, NanobotImage
 app = FastAPI(title="Dashboard Nanobot API")
 logger = logging.getLogger("dashboard.backend")
 app.add_middleware(
    CORSMiddleware,
@ -55,6 +68,7 @@ os.makedirs(DATA_ROOT, exist_ok=True)
 docker_manager = BotDockerManager(host_data_root=BOTS_WORKSPACE_ROOT)
 config_manager = BotConfigManager(host_data_root=BOTS_WORKSPACE_ROOT)
 speech_service = WhisperSpeechService()
 BOT_ID_PATTERN = re.compile(r"^[A-Za-z0-9_]+$")
@ -501,6 +515,13 @@ def get_system_defaults():
        "limits": {
            "upload_max_mb": UPLOAD_MAX_MB,
        },
        "speech": {
            "enabled": STT_ENABLED,
            "model": STT_MODEL,
            "device": STT_DEVICE,
            "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
            "default_language": STT_DEFAULT_LANGUAGE,
        },
    }
@ -3117,6 +3138,102 @@ async def upload_workspace_files(
    return {"bot_id": bot_id, "files": rows}
@app.post("/api/bots/{bot_id}/speech/transcribe")
async def transcribe_bot_speech(
    bot_id: str,
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    session: Session = Depends(get_session),
):
    """Transcribe an uploaded audio clip for a bot and return the text.

    The upload is streamed to a temporary file under DATA_ROOT, transcribed in
    a worker thread, and the temporary file is always removed afterwards.

    Responses:
        404 if the bot does not exist.
        400 if speech is disabled, the payload is empty, no speech was
            detected, or the speech service reports an error.
        413 if the upload exceeds UPLOAD_MAX_MB or the audio is longer than
            STT_MAX_AUDIO_SECONDS.
        500 on any unexpected error.
    """
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")
    if not STT_ENABLED:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not file:
        raise HTTPException(status_code=400, detail="no audio file uploaded")
    original_name = str(file.filename or "audio.webm").strip() or "audio.webm"
    safe_name = os.path.basename(original_name).replace("\\", "_").replace("/", "_")
    ext = os.path.splitext(safe_name)[1].strip().lower() or ".webm"
    if len(ext) > 12:
        ext = ".webm"
    # Cap what is written to disk: the client controls the payload size.
    max_bytes = int(UPLOAD_MAX_MB) * 1024 * 1024
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix=".speech_", dir=DATA_ROOT) as tmp:
            tmp_path = tmp.name
            received = 0
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                received += len(chunk)
                if received > max_bytes:
                    raise HTTPException(
                        status_code=413,
                        detail=f"Audio file exceeds {UPLOAD_MAX_MB} MB",
                    )
                tmp.write(chunk)
        if not tmp_path or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
            raise HTTPException(status_code=400, detail="audio payload is empty")
        resolved_language = str(language or "").strip() or STT_DEFAULT_LANGUAGE
        # Transcription is blocking, so keep it off the event loop.
        result = await asyncio.to_thread(speech_service.transcribe_file, tmp_path, resolved_language)
        text = str(result.get("text") or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No speech detected")
        return {
            "bot_id": bot_id,
            "text": text,
            "duration_seconds": result.get("duration_seconds"),
            "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
            "model": STT_MODEL,
            "device": STT_DEVICE,
            "language": result.get("language") or resolved_language,
        }
    # The two specific speech errors must precede their base class below.
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            safe_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except SpeechDurationError as exc:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            safe_name,
            language,
            STT_MAX_AUDIO_SECONDS,
        )
        raise HTTPException(
            status_code=413,
            detail=f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds",
        ) from exc
    except SpeechServiceError as exc:
        logger.exception(
            "speech transcribe failed bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except HTTPException:
        # Already a well-formed API error raised above; pass it through.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}") from exc
    finally:
        # Best-effort cleanup of the upload handle and the temporary file.
        try:
            await file.close()
        except Exception:
            pass
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception:
                pass
@app.websocket("/ws/monitor/{bot_id}")
 async def websocket_endpoint(websocket: WebSocket, bot_id: str):
    with Session(engine) as session:
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -15,3 +15,5 @@ watchfiles==0.21.0
 urllib3==1.26.18
 requests==2.31.0
 redis==5.0.8
 opencc-purepy==1.1.0
 pywhispercpp==1.3.1
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@ -24,6 +24,16 @@ services:
      REDIS_PREFIX: ${REDIS_PREFIX:-dashboard_nanobot}
      REDIS_DEFAULT_TTL: ${REDIS_DEFAULT_TTL:-60}
      PANEL_ACCESS_PASSWORD: ${PANEL_ACCESS_PASSWORD:-}
      STT_ENABLED: ${STT_ENABLED:-true}
      STT_MODEL: ${STT_MODEL:-ggml-small-q8_0.bin}
      STT_MODEL_DIR: ${STT_MODEL_DIR:-${HOST_DATA_ROOT}/model}
      STT_DEVICE: ${STT_DEVICE:-cpu}
      STT_MAX_AUDIO_SECONDS: ${STT_MAX_AUDIO_SECONDS:-20}
      STT_DEFAULT_LANGUAGE: ${STT_DEFAULT_LANGUAGE:-zh}
      STT_FORCE_SIMPLIFIED: ${STT_FORCE_SIMPLIFIED:-true}
      STT_AUDIO_PREPROCESS: ${STT_AUDIO_PREPROCESS:-true}
      STT_AUDIO_FILTER: ${STT_AUDIO_FILTER:-highpass=f=120,lowpass=f=7600,afftdn=nf=-20}
      STT_INITIAL_PROMPT: ${STT_INITIAL_PROMPT:-以下内容可能包含简体中文和英文术语。请优先输出简体中文，英文单词、缩写、品牌名和数字保持原文，不要翻译。}
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
      - ${HOST_DATA_ROOT}:${HOST_DATA_ROOT}
--- a/frontend/src/i18n/dashboard.en.ts
+++ b/frontend/src/i18n/dashboard.en.ts
@ -28,7 +28,19 @@ export const dashboardEn = {
  copyPromptFail: 'Failed to copy prompt.',
  editPromptDone: 'Inserted into composer.',
  voiceInput: 'Voice input',
-  voiceUnavailable: 'Voice input is not available yet.',
+  textInput: 'Text input',
  voiceUnavailable: 'Speech recognition is disabled.',
  voiceUnsupported: 'Your browser does not support audio recording.',
  voicePermissionDenied: 'Microphone permission denied. Please allow access in browser settings.',
  voiceRecordFail: 'Audio recording failed. Please retry.',
  voiceReady: 'Click the mic to start recording',
  voiceRecording: 'Recording...',
  voiceTranscribing: 'Transcribing...',
  voiceStart: 'Start recording',
  voiceStop: 'Stop recording',
  voiceTranscribeDone: 'Voice converted to text.',
  voiceTranscribeEmpty: 'No valid speech detected.',
  voiceTranscribeFail: 'Speech transcription failed.',
  copyReply: 'Copy reply',
  copyReplyDone: 'Reply copied.',
  copyReplyFail: 'Failed to copy reply.',
--- a/frontend/src/i18n/dashboard.zh-cn.ts
+++ b/frontend/src/i18n/dashboard.zh-cn.ts
@ -28,7 +28,19 @@ export const dashboardZhCn = {
  copyPromptFail: '复制指令失败。',
  editPromptDone: '已填入输入框。',
  voiceInput: '语音输入',
-  voiceUnavailable: '语音输入暂未接入。',
+  textInput: '文字输入',
  voiceUnavailable: '语音识别未启用。',
  voiceUnsupported: '当前浏览器不支持录音。',
  voicePermissionDenied: '麦克风权限被拒绝，请在浏览器设置中允许访问。',
  voiceRecordFail: '录音失败，请重试。',
  voiceReady: '点击麦克风开始录音',
  voiceRecording: '录音中...',
  voiceTranscribing: '语音识别中...',
  voiceStart: '开始录音',
  voiceStop: '停止录音',
  voiceTranscribeDone: '语音已转为文本。',
  voiceTranscribeEmpty: '未识别到有效语音内容。',
  voiceTranscribeFail: '语音识别失败。',
  copyReply: '复制回复',
  copyReplyDone: '回复已复制。',
  copyReplyFail: '复制回复失败。',
--- a/frontend/src/modules/dashboard/BotDashboardModule.css
+++ b/frontend/src/modules/dashboard/BotDashboardModule.css
@ -1085,17 +1085,126 @@
  padding: 14px 120px 42px 14px;
 }
 .ops-voice-panel {
  min-height: 96px;
  border: 1px dashed color-mix(in oklab, var(--line) 72%, var(--brand) 28%);
  border-radius: 12px;
  background: color-mix(in oklab, var(--panel) 78%, var(--panel-soft) 22%);
  padding: 12px 14px 12px 14px;
  display: grid;
  align-content: center;
  gap: 10px;
 }
 .ops-voice-title {
  font-size: 13px;
  font-weight: 700;
  color: var(--muted);
 }
 .ops-voice-wave {
  height: 28px;
  border-radius: 999px;
  border: 1px solid color-mix(in oklab, var(--line) 76%, transparent);
  background: color-mix(in oklab, var(--panel-soft) 78%, var(--panel) 22%);
  display: flex;
  align-items: center;
  gap: 8px;
  padding: 0 6px;
  overflow: hidden;
  flex: 1 1 auto;
  min-width: 0;
 }
 .ops-voice-wave-segment {
  height: 100%;
  min-width: 0;
  display: flex;
  align-items: center;
  justify-content: space-between;
  gap: 2px;
  padding: 0 6px;
  border-radius: 999px;
  background: color-mix(in oklab, var(--panel) 60%, rgba(255, 255, 255, 0.18) 40%);
 }
 .ops-voice-wave.is-mobile .ops-voice-wave-segment {
  flex: 1 1 auto;
 }
 .ops-voice-wave.is-desktop .ops-voice-wave-segment {
  flex: 1 1 0;
 }
 .ops-voice-wave-segment i {
  display: inline-block;
  width: 3px;
  min-width: 3px;
  height: 10px;
  border-radius: 999px;
  background: color-mix(in oklab, var(--line) 72%, var(--text) 28%);
  opacity: 0.72;
 }
 .ops-voice-wave-segment i:nth-child(3n) {
  height: 14px;
 }
 .ops-voice-wave-segment i:nth-child(4n) {
  height: 18px;
 }
 .ops-voice-wave-segment i:nth-child(5n) {
  height: 12px;
 }
 .ops-voice-wave.is-live .ops-voice-wave-segment i {
  background: color-mix(in oklab, var(--brand) 60%, #8ec3ff 40%);
  animation: ops-voice-wave 1.05s ease-in-out infinite;
 }
 .ops-voice-countdown {
  flex: 0 0 auto;
  font-size: 13px;
  font-weight: 700;
  color: var(--title);
  min-width: 44px;
  text-align: right;
 }
 .ops-composer-tools-right {
  position: absolute;
  bottom: 14px;
-  display: inline-flex;
+  left: 12px;
  right: 12px;
  display: flex;
  align-items: center;
  justify-content: flex-end;
  gap: 6px;
 }
 .ops-composer-tools-right {
-  right: 12px;
+  width: auto;
-  max-width: calc(100% - 24px);
+}
 .ops-voice-inline {
  min-width: 0;
  flex: 1 1 auto;
  display: flex;
  align-items: center;
  gap: 8px;
  margin-right: 4px;
 }
@media (max-width: 720px) {
  .ops-voice-wave {
    gap: 4px;
    padding: 0 4px;
  }
  .ops-voice-wave-segment {
    padding: 0 4px;
  }
 }
 .ops-composer-inline-btn {
@ -1116,6 +1225,11 @@
  color: var(--icon);
 }
 .ops-composer-inline-btn.is-active {
  background: color-mix(in oklab, var(--brand-soft) 42%, var(--panel) 58%);
  color: var(--brand);
 }
 .ops-composer-submit-btn {
  width: 34px;
  height: 34px;
@ -1224,6 +1338,17 @@
  100% { transform: translateX(430%); }
 }
@keyframes ops-voice-wave {
  0%, 100% {
    transform: scaleY(0.55);
    opacity: 0.35;
  }
  50% {
    transform: scaleY(1.95);
    opacity: 1;
  }
 }
 .ops-pending-chip {
  display: inline-flex;
  align-items: center;
--- a/frontend/src/modules/dashboard/BotDashboardModule.tsx
+++ b/frontend/src/modules/dashboard/BotDashboardModule.tsx
@ -249,6 +249,12 @@ interface SystemDefaultsResponse {
  limits?: {
    upload_max_mb?: number;
  };
  speech?: {
    enabled?: boolean;
    model?: string;
    device?: string;
    max_audio_seconds?: number;
  };
 }
 type BotEnvParams = Record<string, string>;
@ -719,6 +725,11 @@ export function BotDashboardModule({
  const fileNotPreviewableLabel = locale === 'zh' ? '当前文件类型不支持预览' : 'This file type is not previewable';
  const [selectedBotId, setSelectedBotId] = useState('');
  const [command, setCommand] = useState('');
  const [speechEnabled, setSpeechEnabled] = useState(true);
  const [voiceMaxSeconds, setVoiceMaxSeconds] = useState(20);
  const [isVoiceRecording, setIsVoiceRecording] = useState(false);
  const [isVoiceTranscribing, setIsVoiceTranscribing] = useState(false);
  const [voiceCountdown, setVoiceCountdown] = useState(20);
  const [isSaving, setIsSaving] = useState(false);
  const [showBaseModal, setShowBaseModal] = useState(false);
  const [showParamModal, setShowParamModal] = useState(false);
@ -798,6 +809,10 @@ export function BotDashboardModule({
  const [feedbackSavingByMessageId, setFeedbackSavingByMessageId] = useState<Record<number, boolean>>({});
  const [showRuntimeActionModal, setShowRuntimeActionModal] = useState(false);
  const [workspaceHoverCard, setWorkspaceHoverCard] = useState<WorkspaceHoverCardState | null>(null);
  const voiceRecorderRef = useRef<MediaRecorder | null>(null);
  const voiceStreamRef = useRef<MediaStream | null>(null);
  const voiceChunksRef = useRef<BlobPart[]>([]);
  const voiceTimerRef = useRef<number | null>(null);
  const runtimeMenuRef = useRef<HTMLDivElement | null>(null);
  const botOrderRef = useRef<Record<string, number>>({});
  const nextBotOrderRef = useRef(1);
@ -1544,16 +1559,36 @@ export function BotDashboardModule({
    persistComposerDraft(selectedBotId, command, pendingAttachments);
  }, [selectedBotId, composerDraftHydrated, command, pendingAttachments]);
  useEffect(() => {
    return () => {
      clearVoiceTimer();
      try {
        if (voiceRecorderRef.current && voiceRecorderRef.current.state !== 'inactive') {
          voiceRecorderRef.current.stop();
        }
      } catch {
        // ignore
      }
      releaseVoiceStream();
    };
  }, []);
  useEffect(() => {
    if (!isVoiceRecording && !isVoiceTranscribing) {
      setVoiceCountdown(voiceMaxSeconds);
    }
  }, [voiceMaxSeconds, isVoiceRecording, isVoiceTranscribing]);
  useEffect(() => {
    const hasDraft = Boolean(String(command || '').trim()) || pendingAttachments.length > 0 || Boolean(quotedReply);
-    if (!hasDraft && !isUploadingAttachments) return;
+    if (!hasDraft && !isUploadingAttachments && !isVoiceRecording && !isVoiceTranscribing) return;
    const onBeforeUnload = (event: BeforeUnloadEvent) => {
      event.preventDefault();
      event.returnValue = '';
    };
    window.addEventListener('beforeunload', onBeforeUnload);
    return () => window.removeEventListener('beforeunload', onBeforeUnload);
-  }, [command, pendingAttachments.length, quotedReply, isUploadingAttachments]);
+  }, [command, pendingAttachments.length, quotedReply, isUploadingAttachments, isVoiceRecording, isVoiceTranscribing]);
  const syncChatScrollToBottom = useCallback((behavior: ScrollBehavior = 'auto') => {
    const box = chatScrollRef.current;
@ -1580,6 +1615,9 @@ export function BotDashboardModule({
  useEffect(() => {
    setQuotedReply(null);
    if (isVoiceRecording) {
      stopVoiceRecording();
    }
  }, [selectedBotId]);
  useEffect(() => {
@ -1637,9 +1675,21 @@ export function BotDashboardModule({
    const loadSystemDefaults = async () => {
      try {
        const res = await axios.get<SystemDefaultsResponse>(`${APP_ENDPOINTS.apiBase}/system/defaults`);
        if (!alive) return;
        const configured = Number(res.data?.limits?.upload_max_mb);
-        if (!Number.isFinite(configured) || configured <= 0 || !alive) return;
+        if (Number.isFinite(configured) && configured > 0) {
          setUploadMaxMb(Math.max(1, Math.floor(configured)));
        }
        const speechEnabledRaw = res.data?.speech?.enabled;
        if (typeof speechEnabledRaw === 'boolean') {
          setSpeechEnabled(speechEnabledRaw);
        }
        const speechSeconds = Number(res.data?.speech?.max_audio_seconds);
        if (Number.isFinite(speechSeconds) && speechSeconds > 0) {
          const normalized = Math.max(5, Math.floor(speechSeconds));
          setVoiceMaxSeconds(normalized);
          setVoiceCountdown(normalized);
        }
      } catch {
        // keep default limit
      }
@ -2642,8 +2692,155 @@ export function BotDashboardModule({
    filePickerRef.current?.click();
  };
-  const onVoiceInput = () => {
+  const clearVoiceTimer = () => {
    // Cancel the recording countdown interval, if one is active, and clear the
    // ref so a later call is a no-op.
    if (voiceTimerRef.current) {
      window.clearInterval(voiceTimerRef.current);
      voiceTimerRef.current = null;
    }
  };
  // Stops every track of the stream held in voiceStreamRef and clears the ref.
  // Does nothing when no stream is currently held.
  const releaseVoiceStream = () => {
    const activeStream = voiceStreamRef.current;
    if (!activeStream) return;
    for (const mediaTrack of activeStream.getTracks()) {
      try {
        mediaTrack.stop();
      } catch {
        // ignore: a track that fails to stop must not block releasing the rest
      }
    }
    voiceStreamRef.current = null;
  };
  // Uploads a recorded audio blob to the selected bot's speech/transcribe endpoint
  // and appends the recognised text to the composer draft.
  // No-op when no bot is selected or the blob is empty. Failures are logged and
  // surfaced through notify(); the transcribing flag is always reset at the end.
  const transcribeVoiceBlob = async (blob: Blob) => {
    if (!selectedBot) return;
    if (blob.size <= 0) return;
    setIsVoiceTranscribing(true);
    try {
      // Choose the upload file extension from the blob's MIME type, defaulting to webm.
      const blobMime = String(blob.type || '').toLowerCase();
      let extension = 'webm';
      if (blobMime.includes('ogg')) {
        extension = 'ogg';
      } else if (blobMime.includes('mp4')) {
        extension = 'mp4';
      }
      const upload = new File([blob], `voice-input-${Date.now()}.${extension}`, {
        type: blob.type || 'audio/webm',
      });
      const payload = new FormData();
      payload.append('file', upload);
      payload.append('language', 'zh');
      const endpoint = `${APP_ENDPOINTS.apiBase}/bots/${selectedBot.id}/speech/transcribe`;
      const response = await axios.post<{ text?: string }>(endpoint, payload, { timeout: 120000 });
      const transcript = normalizeUserMessageText(String(response.data?.text || ''));
      if (!transcript) {
        notify(t.voiceTranscribeEmpty, { tone: 'warning' });
        return;
      }
      // Append on a new line when the composer already holds text; otherwise replace it.
      setCommand((prev) => {
        const existing = String(prev || '').trim();
        return existing ? `${existing}\n${transcript}` : transcript;
      });
      window.requestAnimationFrame(() => composerTextareaRef.current?.focus());
      notify(t.voiceTranscribeDone, { tone: 'success' });
    } catch (error: any) {
      // Prefer the `detail` field of the error response over the generic failure text.
      const detail = String(error?.response?.data?.detail || '').trim();
      console.error('Speech transcription failed', {
        botId: selectedBot.id,
        message: detail || t.voiceTranscribeFail,
        status: error?.response?.status,
        response: error?.response?.data,
        error,
      });
      notify(detail || t.voiceTranscribeFail, { tone: 'error' });
    } finally {
      setIsVoiceTranscribing(false);
    }
  };
  // Asks the current MediaRecorder (if any, and not already inactive) to stop.
  // Errors thrown by stop() are swallowed.
  const stopVoiceRecording = () => {
    const activeRecorder = voiceRecorderRef.current;
    if (activeRecorder && activeRecorder.state !== 'inactive') {
      try {
        activeRecorder.stop();
      } catch {
        // ignore
      }
    }
  };
  // Starts a microphone recording for the selected bot.
  //
  // Preconditions (each failure returns early, some with a notification):
  //   - a bot is selected, chat is allowed and no transcription is running;
  //   - speech is enabled (speechEnabled);
  //   - the browser exposes navigator.mediaDevices.getUserMedia and MediaRecorder.
  //
  // On success the recorder is started with 200 ms chunks and a 1 s interval
  // drives the countdown; when it reaches zero the recording is stopped, and
  // recorder.onstop hands the collected audio to transcribeVoiceBlob().
  const startVoiceRecording = async () => {
    if (!selectedBot || !canChat || isVoiceTranscribing) return;
    if (!speechEnabled) {
      notify(t.voiceUnavailable, { tone: 'warning' });
      return;
    }
    if (typeof window === 'undefined' || typeof navigator === 'undefined' || !navigator.mediaDevices?.getUserMedia) {
      notify(t.voiceUnsupported, { tone: 'error' });
      return;
    }
    if (typeof MediaRecorder === 'undefined') {
      notify(t.voiceUnsupported, { tone: 'error' });
      return;
    }

    // Step 1: acquire the microphone. A rejection here is reported as a
    // permission problem, as before.
    let stream: MediaStream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch {
      releaseVoiceStream();
      setIsVoiceRecording(false);
      clearVoiceTimer();
      notify(t.voicePermissionDenied, { tone: 'error' });
      return;
    }
    // Register the stream before anything else can throw, so that the failure
    // path below can actually stop its tracks. Previously the ref was assigned
    // only after the MediaRecorder was constructed, which left the microphone
    // open if construction failed.
    voiceStreamRef.current = stream;

    // Step 2: set up and start the recorder. Failures here are not permission
    // problems, so they are reported as a recording failure.
    try {
      const mimeCandidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4'];
      const supportedMime = mimeCandidates.find((candidate) => MediaRecorder.isTypeSupported(candidate));
      const recorder = supportedMime
        ? new MediaRecorder(stream, { mimeType: supportedMime })
        : new MediaRecorder(stream);
      voiceRecorderRef.current = recorder;
      voiceChunksRef.current = [];
      setVoiceCountdown(voiceMaxSeconds);
      setIsVoiceRecording(true);
      recorder.ondataavailable = (event: BlobEvent) => {
        // Keep only non-empty chunks.
        if (event.data && event.data.size > 0) {
          voiceChunksRef.current.push(event.data);
        }
      };
      recorder.onerror = () => {
        setIsVoiceRecording(false);
        clearVoiceTimer();
        releaseVoiceStream();
        notify(t.voiceRecordFail, { tone: 'error' });
      };
      recorder.onstop = () => {
        // Build the blob first: the chunk buffer is reset right after.
        const blob = new Blob(voiceChunksRef.current, { type: supportedMime || recorder.mimeType || 'audio/webm' });
        voiceRecorderRef.current = null;
        voiceChunksRef.current = [];
        clearVoiceTimer();
        releaseVoiceStream();
        setIsVoiceRecording(false);
        setVoiceCountdown(voiceMaxSeconds);
        if (blob.size > 0) {
          void transcribeVoiceBlob(blob);
        }
      };
      recorder.start(200);
      clearVoiceTimer();
      voiceTimerRef.current = window.setInterval(() => {
        setVoiceCountdown((prev) => {
          if (prev <= 1) {
            // stopVoiceRecording() is a no-op once the recorder is inactive,
            // so a repeated invocation of this updater is harmless.
            stopVoiceRecording();
            return 0;
          }
          return prev - 1;
        });
      }, 1000);
    } catch {
      // Recorder construction or start() failed: drop the half-initialised
      // recorder and release the microphone acquired in step 1.
      voiceRecorderRef.current = null;
      voiceChunksRef.current = [];
      releaseVoiceStream();
      setIsVoiceRecording(false);
      clearVoiceTimer();
      notify(t.voiceRecordFail, { tone: 'error' });
    }
  };
  // Click handler for the microphone button: toggles recording.
  // Ignored while a transcription is in flight.
  const onVoiceInput = () => {
    if (isVoiceTranscribing) return;
    if (!isVoiceRecording) {
      void startVoiceRecording();
      return;
    }
    stopVoiceRecording();
  };
  const onPickAttachments = async (event: ChangeEvent<HTMLInputElement>) => {
@ -3393,7 +3590,7 @@ export function BotDashboardModule({
                      value={command}
                      onChange={(e) => setCommand(e.target.value)}
                      onKeyDown={onComposerKeyDown}
-                      disabled={!canChat}
+                      disabled={!canChat || isVoiceRecording || isVoiceTranscribing}
                      placeholder={
                        canChat
                          ? t.inputPlaceholder
@ -3401,18 +3598,54 @@ export function BotDashboardModule({
                      }
                    />
                    <div className="ops-composer-tools-right">
-                      <LucentIconButton
+                      {(isVoiceRecording || isVoiceTranscribing) ? (
-                        className="ops-composer-inline-btn"
+                        <div className="ops-voice-inline" aria-live="polite">
-                        disabled={!canChat}
+                          <div className={`ops-voice-wave ${isVoiceRecording ? 'is-live' : ''} ${isCompactMobile ? 'is-mobile' : 'is-desktop'}`}>
                            {Array.from({ length: isCompactMobile ? 1 : 5 }).map((_, segmentIdx) => (
                              <div key={`vw-segment-${segmentIdx}`} className="ops-voice-wave-segment">
                                {Array.from({ length: isCompactMobile ? 28 : 18 }).map((_, idx) => {
                                  const delayIndex = isCompactMobile
                                    ? idx
                                    : (segmentIdx * 18) + idx;
                                  return (
                                    <i
                                      key={`vw-inline-${segmentIdx}-${idx}`}
                                      style={{ animationDelay: `${(delayIndex % 14) * 0.06}s` }}
                                    />
                                  );
                                })}
                              </div>
                            ))}
                          </div>
                          <div className="ops-voice-countdown mono">
                            {isVoiceRecording ? `${voiceCountdown}s` : t.voiceTranscribing}
                          </div>
                        </div>
                      ) : null}
                      <button
                        className={`ops-composer-inline-btn ${isVoiceRecording ? 'is-recording' : ''}`}
                        disabled={!canChat || isVoiceTranscribing || (!speechEnabled && !isVoiceRecording)}
                        onClick={onVoiceInput}
-                        tooltip={t.voiceInput}
+                        aria-label={isVoiceRecording ? t.voiceStop : t.voiceStart}
-                        aria-label={t.voiceInput}
+                        title={
                          isVoiceTranscribing
                            ? t.voiceTranscribing
                            : isVoiceRecording
                              ? t.voiceStop
                              : t.voiceStart
                        }
                      >
                        {isVoiceTranscribing ? (
                          <RefreshCw size={16} className="animate-spin" />
                        ) : isVoiceRecording ? (
                          <Square size={16} />
                        ) : (
                          <Mic size={16} />
-                      </LucentIconButton>
+                        )}
                      </button>
                      <LucentIconButton
                        className="ops-composer-inline-btn"
-                        disabled={!canChat || isUploadingAttachments}
+                        disabled={!canChat || isUploadingAttachments || isVoiceRecording || isVoiceTranscribing}
                        onClick={triggerPickAttachments}
                        tooltip={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
                        aria-label={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
@ -3424,7 +3657,12 @@ export function BotDashboardModule({
                        disabled={
                          isChatEnabled && (isThinking || isSending)
                            ? Boolean(interruptingByBot[selectedBot.id])
-                            : (!isChatEnabled || (!command.trim() && pendingAttachments.length === 0 && !quotedReply))
+                            : (
                              !isChatEnabled
                              || isVoiceRecording
                              || isVoiceTranscribing
                              || (!command.trim() && pendingAttachments.length === 0 && !quotedReply)
                            )
                        }
                        onClick={() => void (isChatEnabled && (isThinking || isSending) ? interruptExecution() : send())}
                        aria-label={isChatEnabled && (isThinking || isSending) ? t.interrupt : t.send}
--- a/scripts/deploy-prod.sh
+++ b/scripts/deploy-prod.sh
@ -11,6 +11,7 @@ if [[ ! -f "$ENV_FILE" ]]; then
 fi
 echo "[deploy] using env: $ENV_FILE"
 docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" config -q
 docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" up -d --build
 echo "[deploy] service status"
--- a/scripts/stop-prod.sh
+++ b/scripts/stop-prod.sh
@ -4,4 +4,9 @@ set -euo pipefail
 ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
 ENV_FILE="${1:-$ROOT_DIR/.env.prod}"
 if [[ ! -f "$ENV_FILE" ]]; then
  echo "Missing env file: $ENV_FILE"
  exit 1
 fi
 docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" down