v0.1.4-p1

main
mula.liu 2026-03-12 01:20:57 +08:00
parent 590eae9f0c
commit 6795fedbfe
15 changed files with 853 additions and 21 deletions

View File

@ -43,3 +43,15 @@ PANEL_ACCESS_PASSWORD=change_me_panel_password
# Max upload size for backend validation (MB)
UPLOAD_MAX_MB=200
# Local speech-to-text (Whisper via whisper.cpp model file)
STT_ENABLED=true
STT_MODEL=ggml-small-q8_0.bin
STT_MODEL_DIR=${HOST_DATA_ROOT}/model
STT_DEVICE=cpu
STT_MAX_AUDIO_SECONDS=20
STT_DEFAULT_LANGUAGE=zh
STT_FORCE_SIMPLIFIED=true
STT_AUDIO_PREPROCESS=true
STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。

View File

@ -104,6 +104,8 @@ graph TD
- 配置绝对路径:
- `HOST_DATA_ROOT`
- `HOST_BOTS_WORKSPACE_ROOT`
- 如启用本地语音识别,请将 Whisper `.bin` 模型文件放到 `${HOST_DATA_ROOT}/model/`
并让 `STT_MODEL` 指向完整文件名,例如 `ggml-small-q8_0.bin`
- 中国网络建议配置加速项:
- `PIP_INDEX_URL`、`PIP_TRUSTED_HOST`
- `NPM_REGISTRY`
@ -120,3 +122,4 @@ graph TD
- 必须挂载 `/var/run/docker.sock`,否则后端无法操作 Bot 镜像与容器。
- `HOST_BOTS_WORKSPACE_ROOT` 必须是宿主机绝对路径,并且在 `docker-compose.prod.yml` 中以“同路径”挂载到后端容器。
原因:后端通过 Docker API 创建 Bot 容器时,使用的是宿主机可见的 bind 路径。
- 语音识别当前基于 `pywhispercpp==1.3.1` + Whisper `.bin` 模型文件,不使用 `faster-whisper`

View File

@ -27,6 +27,18 @@ PANEL_ACCESS_PASSWORD=
# Max upload size for backend validation (MB)
UPLOAD_MAX_MB=100
# Local speech-to-text (Whisper via whisper.cpp model file)
STT_ENABLED=true
STT_MODEL=ggml-small-q8_0.bin
STT_MODEL_DIR=../data/model
STT_DEVICE=cpu
STT_MAX_AUDIO_SECONDS=20
STT_DEFAULT_LANGUAGE=zh
STT_FORCE_SIMPLIFIED=true
STT_AUDIO_PREPROCESS=true
STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。
# Local backend server options (for `python3 main.py`)
APP_HOST=0.0.0.0
APP_PORT=8000

View File

@ -13,6 +13,9 @@ ARG PIP_TRUSTED_HOST=
COPY backend/requirements.txt ./requirements.txt
RUN if [ -n "${PIP_INDEX_URL}" ]; then pip config set global.index-url "${PIP_INDEX_URL}"; fi \
&& if [ -n "${PIP_TRUSTED_HOST}" ]; then pip config set global.trusted-host "${PIP_TRUSTED_HOST}"; fi \
&& apt-get update \
&& apt-get install -y --no-install-recommends ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& pip install --upgrade pip \
&& pip install -r requirements.txt

View File

@ -48,6 +48,7 @@ def _normalize_dir_path(path_value: str) -> str:
raw = str(path_value or "").strip()
if not raw:
return raw
raw = os.path.expandvars(os.path.expanduser(raw))
p = Path(raw)
if p.is_absolute():
return str(p)
@ -117,6 +118,26 @@ DATABASE_ENGINE: Final[str] = _database_engine(DATABASE_URL)
DATABASE_URL_DISPLAY: Final[str] = _mask_database_url(DATABASE_URL)
DATABASE_ECHO: Final[bool] = _env_bool("DATABASE_ECHO", True)
UPLOAD_MAX_MB: Final[int] = _env_int("UPLOAD_MAX_MB", 100, 1, 2048)
STT_ENABLED: Final[bool] = _env_bool("STT_ENABLED", True)
STT_MODEL: Final[str] = str(os.getenv("STT_MODEL") or "ggml-small-q8_0.bin").strip()
_DEFAULT_STT_MODEL_DIR: Final[Path] = (Path(DATA_ROOT) / "model").resolve()
_configured_stt_model_dir = _normalize_dir_path(os.getenv("STT_MODEL_DIR", str(_DEFAULT_STT_MODEL_DIR)))
if _configured_stt_model_dir and not Path(_configured_stt_model_dir).exists() and _DEFAULT_STT_MODEL_DIR.exists():
STT_MODEL_DIR: Final[str] = str(_DEFAULT_STT_MODEL_DIR)
else:
STT_MODEL_DIR: Final[str] = _configured_stt_model_dir
STT_DEVICE: Final[str] = str(os.getenv("STT_DEVICE") or "cpu").strip().lower() or "cpu"
STT_MAX_AUDIO_SECONDS: Final[int] = _env_int("STT_MAX_AUDIO_SECONDS", 20, 5, 600)
STT_DEFAULT_LANGUAGE: Final[str] = str(os.getenv("STT_DEFAULT_LANGUAGE") or "zh").strip().lower() or "zh"
STT_FORCE_SIMPLIFIED: Final[bool] = _env_bool("STT_FORCE_SIMPLIFIED", True)
STT_AUDIO_PREPROCESS: Final[bool] = _env_bool("STT_AUDIO_PREPROCESS", True)
STT_AUDIO_FILTER: Final[str] = str(
os.getenv("STT_AUDIO_FILTER") or "highpass=f=120,lowpass=f=7600,afftdn=nf=-20"
).strip()
STT_INITIAL_PROMPT: Final[str] = str(
os.getenv("STT_INITIAL_PROMPT")
or "以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。"
).strip()
REDIS_ENABLED: Final[bool] = _env_bool("REDIS_ENABLED", False)
REDIS_URL: Final[str] = str(os.getenv("REDIS_URL") or "").strip()

View File

@ -0,0 +1,259 @@
from __future__ import annotations
import os
import shutil
import subprocess
import tempfile
import threading
from pathlib import Path
from typing import Any, Dict, Optional
from core.settings import (
STT_AUDIO_FILTER,
STT_AUDIO_PREPROCESS,
STT_DEVICE,
STT_ENABLED,
STT_FORCE_SIMPLIFIED,
STT_INITIAL_PROMPT,
STT_MAX_AUDIO_SECONDS,
STT_MODEL,
STT_MODEL_DIR,
)
class SpeechServiceError(RuntimeError):
    """Base error raised by the local speech-to-text service."""
class SpeechDisabledError(SpeechServiceError):
    """Raised when speech-to-text is turned off via settings (STT_ENABLED)."""
class SpeechDurationError(SpeechServiceError):
    """Raised when the uploaded audio exceeds the allowed duration."""
class WhisperSpeechService:
    """Local speech-to-text backed by whisper.cpp via ``pywhispercpp``.

    The whisper ``.bin`` model configured through ``STT_MODEL`` /
    ``STT_MODEL_DIR`` is resolved and loaded lazily on the first
    transcription, then cached until the configured path changes.
    """

    def __init__(self) -> None:
        # Cached pywhispercpp Model instance and the absolute path it was
        # loaded from; a settings change to a different file forces a reload.
        self._model: Any = None
        self._model_source: str = ""
        self._backend: str = ""
        self._model_lock = threading.Lock()

    def _resolve_model_source(self) -> str:
        """Return the absolute path of the configured whisper model file.

        Raises:
            SpeechServiceError: configuration is empty, the file is missing,
                or it is not a whisper.cpp ggml ``.bin`` model.
        """
        model = str(STT_MODEL or "").strip()
        model_dir = str(STT_MODEL_DIR or "").strip()
        if not model:
            # Fix: example filename previously had a typo ("ggml-samll-...").
            raise SpeechServiceError(
                "STT_MODEL is empty. Please set the full model file name, e.g. ggml-small-q8_0.bin."
            )
        # If STT_MODEL itself is an absolute/relative path, use it directly.
        if any(sep in model for sep in ("/", "\\")):
            direct = Path(model).expanduser()
            if not direct.exists() or not direct.is_file():
                raise SpeechServiceError(f"STT model file not found: {direct}")
            if direct.suffix.lower() != ".bin":
                raise SpeechServiceError(
                    "STT_MODEL must point to a whisper.cpp ggml .bin model file."
                )
            return str(direct.resolve())
        # Strict mode: only exact filename, no alias/auto detection.
        if Path(model).suffix.lower() != ".bin":
            raise SpeechServiceError(
                "STT_MODEL must be the exact model file name (with .bin), e.g. ggml-small-q8_0.bin."
            )
        if not model_dir:
            raise SpeechServiceError("STT_MODEL_DIR is empty.")
        root = Path(model_dir).expanduser()
        if not root.exists() or not root.is_dir():
            raise SpeechServiceError(f"STT_MODEL_DIR does not exist: {root}")
        candidate = root / model
        if not candidate.exists() or not candidate.is_file():
            raise SpeechServiceError(
                f"STT model file not found under STT_MODEL_DIR: {candidate}"
            )
        return str(candidate.resolve())

    def _load_model(self) -> Any:
        """Load (or return the cached) pywhispercpp model for the configured path."""
        model_source = self._resolve_model_source()
        # Fast path without the lock; re-checked under the lock below
        # (double-checked locking) so concurrent callers load only once.
        if self._model is not None and self._model_source == model_source:
            return self._model
        with self._model_lock:
            if self._model is not None and self._model_source == model_source:
                return self._model
            try:
                from pywhispercpp.model import Model  # type: ignore
            except Exception as exc:
                raise SpeechServiceError(
                    "pywhispercpp is not installed in the active backend environment. "
                    "Run pip install -r backend/requirements.txt or rebuild the backend image."
                ) from exc
            self._model = Model(
                model_source,
                print_realtime=False,
                print_progress=False,
            )
            self._backend = "pywhispercpp"
            self._model_source = model_source
            return self._model

    @staticmethod
    def _preprocess_audio(file_path: str) -> str:
        """Optionally clean the audio with ffmpeg (mono, 16 kHz wav + filters).

        Returns the path of a temporary cleaned wav file, or the original path
        when preprocessing is disabled, ffmpeg is missing, or conversion fails.
        The caller is responsible for deleting the temporary file.
        """
        target = str(file_path or "").strip()
        if not STT_AUDIO_PREPROCESS or not target or not os.path.isfile(target):
            return target
        if shutil.which("ffmpeg") is None:
            # Best effort: fall back to the raw upload when ffmpeg is absent.
            return target
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav", prefix=".speech_clean_")
        tmp_path = tmp.name
        tmp.close()
        cmd = [
            "ffmpeg",
            "-y",
            "-i",
            target,
            "-vn",   # drop any video stream
            "-ac",
            "1",     # mono
            "-ar",
            "16000",
        ]
        audio_filter = str(STT_AUDIO_FILTER or "").strip()
        if audio_filter:
            cmd.extend(["-af", audio_filter])
        cmd.extend(["-c:a", "pcm_s16le", tmp_path])
        try:
            completed = subprocess.run(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
            if completed.returncode != 0 or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                return target
            return tmp_path
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            return target

    @staticmethod
    def _probe_audio_duration_seconds(file_path: str) -> Optional[float]:
        """Best-effort duration probe via PyAV; returns None when unavailable."""
        try:
            import av  # type: ignore

            with av.open(file_path) as container:
                if container.duration is not None:
                    # container.duration is in av.time_base units.
                    return max(0.0, float(container.duration / av.time_base))
                for stream in container.streams:
                    if stream.type != "audio":
                        continue
                    if stream.duration is not None and stream.time_base is not None:
                        return max(0.0, float(stream.duration * stream.time_base))
        except Exception:
            # PyAV missing or file unreadable: duration is simply unknown.
            return None
        return None

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Strip the text and, when enabled, convert Traditional to Simplified Chinese."""
        content = str(text or "").strip()
        if not content or not STT_FORCE_SIMPLIFIED:
            return content
        try:
            from opencc_purepy import OpenCC  # type: ignore

            return str(OpenCC("t2s").convert(content) or "").strip() or content
        except Exception:
            # opencc is optional; fall back to the raw transcription.
            return content

    @staticmethod
    def _filter_supported_transcribe_kwargs(model: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Drop kwargs the loaded model does not advertise via ``get_params()``."""
        try:
            available = set(model.get_params().keys())
        except Exception:
            # If introspection fails, pass everything through unchanged.
            return kwargs
        return {key: value for key, value in kwargs.items() if key in available}

    def transcribe_file(self, file_path: str, language: Optional[str] = None) -> Dict[str, Any]:
        """Transcribe an audio file and return the text plus metadata.

        Parameters:
            file_path: path to the audio file on disk.
            language: language hint; empty/"auto"/"null"/"none" means auto-detect.

        Returns:
            dict with ``text``, ``language``, ``duration_seconds``,
            ``max_audio_seconds``, ``model``, ``device`` and ``backend``.

        Raises:
            SpeechDisabledError: STT is disabled via settings.
            SpeechDurationError: audio exceeds ``STT_MAX_AUDIO_SECONDS``.
            SpeechServiceError: missing file/model, or transcription failure.
        """
        if not STT_ENABLED:
            raise SpeechDisabledError("Speech-to-text is disabled")
        target = str(file_path or "").strip()
        if not target or not os.path.isfile(target):
            raise SpeechServiceError("Audio file not found")
        duration_seconds = self._probe_audio_duration_seconds(target)
        # 0.3 s of slack tolerates container rounding right at the limit.
        if duration_seconds is not None and duration_seconds > float(STT_MAX_AUDIO_SECONDS) + 0.3:
            raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")
        prepared_target = self._preprocess_audio(target)
        try:
            model = self._load_model()
            lang = str(language or "").strip().lower()
            normalized_lang: Optional[str] = None
            if lang and lang not in {"auto", "null", "none"}:
                normalized_lang = lang
            max_end = 0.0
            detected_language = ""
            texts = []
            kwargs: Dict[str, Any] = {
                "print_realtime": False,
                "print_progress": False,
                "no_context": True,
                "suppress_non_speech_tokens": True,
            }
            if normalized_lang:
                kwargs["language"] = normalized_lang
            initial_prompt = str(STT_INITIAL_PROMPT or "").strip()
            if initial_prompt:
                kwargs["initial_prompt"] = initial_prompt
            kwargs = self._filter_supported_transcribe_kwargs(model, kwargs)
            try:
                segments = model.transcribe(prepared_target, **kwargs)
            except Exception as exc:
                raise SpeechServiceError(
                    f"pywhispercpp transcription failed: {exc}. "
                    "If input is not wav, install ffmpeg in runtime image."
                ) from exc
            for segment in segments:
                txt = str(getattr(segment, "text", "") or "").strip()
                if txt:
                    texts.append(txt)
                if normalized_lang:
                    detected_language = normalized_lang
                try:
                    # segment.t1 looks like the segment end in centiseconds
                    # (hence the /100) — matches pywhispercpp's Segment type.
                    max_end = max(max_end, float(getattr(segment, "t1", 0.0) or 0.0) / 100.0)
                except Exception:
                    pass
            # Re-check the limit using decoded segment timestamps, covering
            # files whose container metadata carried no duration.
            if max_end > float(STT_MAX_AUDIO_SECONDS) + 0.3:
                raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")
            text = self._normalize_text(" ".join(texts).strip())
            if not text:
                raise SpeechServiceError("No speech detected")
            if duration_seconds is None:
                duration_seconds = max_end if max_end > 0 else None
            return {
                "text": text,
                "language": detected_language or None,
                "duration_seconds": duration_seconds,
                "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
                "model": STT_MODEL,
                "device": STT_DEVICE,
                "backend": self._backend or "unknown",
            }
        finally:
            # Remove the temporary preprocessed wav, if one was created.
            if prepared_target != target and os.path.exists(prepared_target):
                try:
                    os.remove(prepared_target)
                except Exception:
                    pass

View File

@ -1,5 +1,6 @@
import asyncio
import json
import logging
import mimetypes
import os
import re
@ -12,7 +13,7 @@ from urllib.parse import unquote
import httpx
from pydantic import BaseModel
from fastapi import Depends, FastAPI, File, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from sqlmodel import Session, select
@ -21,6 +22,12 @@ from core.config_manager import BotConfigManager
from core.cache import cache
from core.database import engine, get_session, init_database
from core.docker_manager import BotDockerManager
from core.speech_service import (
SpeechDisabledError,
SpeechDurationError,
SpeechServiceError,
WhisperSpeechService,
)
from core.settings import (
BOTS_WORKSPACE_ROOT,
DATA_ROOT,
@ -37,11 +44,17 @@ from core.settings import (
REDIS_ENABLED,
REDIS_PREFIX,
REDIS_URL,
STT_DEVICE,
STT_DEFAULT_LANGUAGE,
STT_ENABLED,
STT_MAX_AUDIO_SECONDS,
STT_MODEL,
UPLOAD_MAX_MB,
)
from models.bot import BotInstance, BotMessage, NanobotImage
app = FastAPI(title="Dashboard Nanobot API")
logger = logging.getLogger("dashboard.backend")
app.add_middleware(
CORSMiddleware,
@ -55,6 +68,7 @@ os.makedirs(DATA_ROOT, exist_ok=True)
docker_manager = BotDockerManager(host_data_root=BOTS_WORKSPACE_ROOT)
config_manager = BotConfigManager(host_data_root=BOTS_WORKSPACE_ROOT)
speech_service = WhisperSpeechService()
BOT_ID_PATTERN = re.compile(r"^[A-Za-z0-9_]+$")
@ -501,6 +515,13 @@ def get_system_defaults():
"limits": {
"upload_max_mb": UPLOAD_MAX_MB,
},
"speech": {
"enabled": STT_ENABLED,
"model": STT_MODEL,
"device": STT_DEVICE,
"max_audio_seconds": STT_MAX_AUDIO_SECONDS,
"default_language": STT_DEFAULT_LANGUAGE,
},
}
@ -3117,6 +3138,102 @@ async def upload_workspace_files(
return {"bot_id": bot_id, "files": rows}
@app.post("/api/bots/{bot_id}/speech/transcribe")
async def transcribe_bot_speech(
    bot_id: str,
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    session: Session = Depends(get_session),
):
    """Transcribe an uploaded audio clip for a bot via the local Whisper service."""
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")
    if not STT_ENABLED:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not file:
        raise HTTPException(status_code=400, detail="no audio file uploaded")
    # Sanitize the client-supplied filename; only its extension is reused.
    raw_name = str(file.filename or "audio.webm").strip() or "audio.webm"
    safe_name = os.path.basename(raw_name).replace("\\", "_").replace("/", "_")
    ext = os.path.splitext(safe_name)[1].strip().lower() or ".webm"
    if len(ext) > 12:
        ext = ".webm"
    tmp_path = ""
    try:
        # Stream the upload to a temp file under DATA_ROOT, 1 MiB at a time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix=".speech_", dir=DATA_ROOT) as tmp:
            tmp_path = tmp.name
            while chunk := await file.read(1024 * 1024):
                tmp.write(chunk)
        if not tmp_path or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
            raise HTTPException(status_code=400, detail="audio payload is empty")
        resolved_language = str(language or "").strip() or STT_DEFAULT_LANGUAGE
        # Whisper inference is CPU-bound; keep it off the event loop.
        result = await asyncio.to_thread(speech_service.transcribe_file, tmp_path, resolved_language)
        text = str(result.get("text") or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No speech detected")
        return {
            "bot_id": bot_id,
            "text": text,
            "duration_seconds": result.get("duration_seconds"),
            "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
            "model": STT_MODEL,
            "device": STT_DEVICE,
            "language": result.get("language") or resolved_language,
        }
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            safe_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc))
    except SpeechDurationError:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            safe_name,
            language,
            STT_MAX_AUDIO_SECONDS,
        )
        raise HTTPException(status_code=413, detail=f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")
    except SpeechServiceError as exc:
        logger.exception(
            "speech transcribe failed bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=400, detail=str(exc))
    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}")
    finally:
        # Always close the upload and delete the temp file, success or not.
        try:
            await file.close()
        except Exception:
            pass
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception:
                pass
@app.websocket("/ws/monitor/{bot_id}")
async def websocket_endpoint(websocket: WebSocket, bot_id: str):
with Session(engine) as session:

View File

@ -15,3 +15,5 @@ watchfiles==0.21.0
urllib3==1.26.18
requests==2.31.0
redis==5.0.8
opencc-purepy==1.1.0
pywhispercpp==1.3.1

View File

@ -24,6 +24,16 @@ services:
REDIS_PREFIX: ${REDIS_PREFIX:-dashboard_nanobot}
REDIS_DEFAULT_TTL: ${REDIS_DEFAULT_TTL:-60}
PANEL_ACCESS_PASSWORD: ${PANEL_ACCESS_PASSWORD:-}
STT_ENABLED: ${STT_ENABLED:-true}
STT_MODEL: ${STT_MODEL:-ggml-small-q8_0.bin}
STT_MODEL_DIR: ${STT_MODEL_DIR:-${HOST_DATA_ROOT}/model}
STT_DEVICE: ${STT_DEVICE:-cpu}
STT_MAX_AUDIO_SECONDS: ${STT_MAX_AUDIO_SECONDS:-20}
STT_DEFAULT_LANGUAGE: ${STT_DEFAULT_LANGUAGE:-zh}
STT_FORCE_SIMPLIFIED: ${STT_FORCE_SIMPLIFIED:-true}
STT_AUDIO_PREPROCESS: ${STT_AUDIO_PREPROCESS:-true}
STT_AUDIO_FILTER: ${STT_AUDIO_FILTER:-highpass=f=120,lowpass=f=7600,afftdn=nf=-20}
STT_INITIAL_PROMPT: ${STT_INITIAL_PROMPT:-以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- ${HOST_DATA_ROOT}:${HOST_DATA_ROOT}

View File

@ -28,7 +28,19 @@ export const dashboardEn = {
copyPromptFail: 'Failed to copy prompt.',
editPromptDone: 'Inserted into composer.',
voiceInput: 'Voice input',
voiceUnavailable: 'Voice input is not available yet.',
textInput: 'Text input',
voiceUnavailable: 'Speech recognition is disabled.',
voiceUnsupported: 'Your browser does not support audio recording.',
voicePermissionDenied: 'Microphone permission denied. Please allow access in browser settings.',
voiceRecordFail: 'Audio recording failed. Please retry.',
voiceReady: 'Click the mic to start recording',
voiceRecording: 'Recording...',
voiceTranscribing: 'Transcribing...',
voiceStart: 'Start recording',
voiceStop: 'Stop recording',
voiceTranscribeDone: 'Voice converted to text.',
voiceTranscribeEmpty: 'No valid speech detected.',
voiceTranscribeFail: 'Speech transcription failed.',
copyReply: 'Copy reply',
copyReplyDone: 'Reply copied.',
copyReplyFail: 'Failed to copy reply.',

View File

@ -28,7 +28,19 @@ export const dashboardZhCn = {
copyPromptFail: '复制指令失败。',
editPromptDone: '已填入输入框。',
voiceInput: '语音输入',
voiceUnavailable: '语音输入暂未接入。',
textInput: '文字输入',
voiceUnavailable: '语音识别未启用。',
voiceUnsupported: '当前浏览器不支持录音。',
voicePermissionDenied: '麦克风权限被拒绝,请在浏览器设置中允许访问。',
voiceRecordFail: '录音失败,请重试。',
voiceReady: '点击麦克风开始录音',
voiceRecording: '录音中...',
voiceTranscribing: '语音识别中...',
voiceStart: '开始录音',
voiceStop: '停止录音',
voiceTranscribeDone: '语音已转为文本。',
voiceTranscribeEmpty: '未识别到有效语音内容。',
voiceTranscribeFail: '语音识别失败。',
copyReply: '复制回复',
copyReplyDone: '回复已复制。',
copyReplyFail: '复制回复失败。',

View File

@ -1085,17 +1085,126 @@
padding: 14px 120px 42px 14px;
}
.ops-voice-panel {
min-height: 96px;
border: 1px dashed color-mix(in oklab, var(--line) 72%, var(--brand) 28%);
border-radius: 12px;
background: color-mix(in oklab, var(--panel) 78%, var(--panel-soft) 22%);
padding: 12px 14px 12px 14px;
display: grid;
align-content: center;
gap: 10px;
}
.ops-voice-title {
font-size: 13px;
font-weight: 700;
color: var(--muted);
}
.ops-voice-wave {
height: 28px;
border-radius: 999px;
border: 1px solid color-mix(in oklab, var(--line) 76%, transparent);
background: color-mix(in oklab, var(--panel-soft) 78%, var(--panel) 22%);
display: flex;
align-items: center;
gap: 8px;
padding: 0 6px;
overflow: hidden;
flex: 1 1 auto;
min-width: 0;
}
.ops-voice-wave-segment {
height: 100%;
min-width: 0;
display: flex;
align-items: center;
justify-content: space-between;
gap: 2px;
padding: 0 6px;
border-radius: 999px;
background: color-mix(in oklab, var(--panel) 60%, rgba(255, 255, 255, 0.18) 40%);
}
.ops-voice-wave.is-mobile .ops-voice-wave-segment {
flex: 1 1 auto;
}
.ops-voice-wave.is-desktop .ops-voice-wave-segment {
flex: 1 1 0;
}
.ops-voice-wave-segment i {
display: inline-block;
width: 3px;
min-width: 3px;
height: 10px;
border-radius: 999px;
background: color-mix(in oklab, var(--line) 72%, var(--text) 28%);
opacity: 0.72;
}
.ops-voice-wave-segment i:nth-child(3n) {
height: 14px;
}
.ops-voice-wave-segment i:nth-child(4n) {
height: 18px;
}
.ops-voice-wave-segment i:nth-child(5n) {
height: 12px;
}
.ops-voice-wave.is-live .ops-voice-wave-segment i {
background: color-mix(in oklab, var(--brand) 60%, #8ec3ff 40%);
animation: ops-voice-wave 1.05s ease-in-out infinite;
}
.ops-voice-countdown {
flex: 0 0 auto;
font-size: 13px;
font-weight: 700;
color: var(--title);
min-width: 44px;
text-align: right;
}
/* Toolbar anchored to the bottom-right of the composer.
   Fix: removed the dead `display: inline-flex;` declaration that was
   immediately overridden by `display: flex;` in the same rule. */
.ops-composer-tools-right {
  position: absolute;
  bottom: 14px;
  left: 12px;
  right: 12px;
  display: flex;
  align-items: center;
  justify-content: flex-end;
  gap: 6px;
}
.ops-composer-tools-right {
right: 12px;
max-width: calc(100% - 24px);
width: auto;
}
.ops-voice-inline {
min-width: 0;
flex: 1 1 auto;
display: flex;
align-items: center;
gap: 8px;
margin-right: 4px;
}
@media (max-width: 720px) {
.ops-voice-wave {
gap: 4px;
padding: 0 4px;
}
.ops-voice-wave-segment {
padding: 0 4px;
}
}
.ops-composer-inline-btn {
@ -1116,6 +1225,11 @@
color: var(--icon);
}
.ops-composer-inline-btn.is-active {
background: color-mix(in oklab, var(--brand-soft) 42%, var(--panel) 58%);
color: var(--brand);
}
.ops-composer-submit-btn {
width: 34px;
height: 34px;
@ -1224,6 +1338,17 @@
100% { transform: translateX(430%); }
}
@keyframes ops-voice-wave {
0%, 100% {
transform: scaleY(0.55);
opacity: 0.35;
}
50% {
transform: scaleY(1.95);
opacity: 1;
}
}
.ops-pending-chip {
display: inline-flex;
align-items: center;

View File

@ -249,6 +249,12 @@ interface SystemDefaultsResponse {
limits?: {
upload_max_mb?: number;
};
speech?: {
enabled?: boolean;
model?: string;
device?: string;
max_audio_seconds?: number;
};
}
type BotEnvParams = Record<string, string>;
@ -719,6 +725,11 @@ export function BotDashboardModule({
const fileNotPreviewableLabel = locale === 'zh' ? '当前文件类型不支持预览' : 'This file type is not previewable';
const [selectedBotId, setSelectedBotId] = useState('');
const [command, setCommand] = useState('');
const [speechEnabled, setSpeechEnabled] = useState(true);
const [voiceMaxSeconds, setVoiceMaxSeconds] = useState(20);
const [isVoiceRecording, setIsVoiceRecording] = useState(false);
const [isVoiceTranscribing, setIsVoiceTranscribing] = useState(false);
const [voiceCountdown, setVoiceCountdown] = useState(20);
const [isSaving, setIsSaving] = useState(false);
const [showBaseModal, setShowBaseModal] = useState(false);
const [showParamModal, setShowParamModal] = useState(false);
@ -798,6 +809,10 @@ export function BotDashboardModule({
const [feedbackSavingByMessageId, setFeedbackSavingByMessageId] = useState<Record<number, boolean>>({});
const [showRuntimeActionModal, setShowRuntimeActionModal] = useState(false);
const [workspaceHoverCard, setWorkspaceHoverCard] = useState<WorkspaceHoverCardState | null>(null);
const voiceRecorderRef = useRef<MediaRecorder | null>(null);
const voiceStreamRef = useRef<MediaStream | null>(null);
const voiceChunksRef = useRef<BlobPart[]>([]);
const voiceTimerRef = useRef<number | null>(null);
const runtimeMenuRef = useRef<HTMLDivElement | null>(null);
const botOrderRef = useRef<Record<string, number>>({});
const nextBotOrderRef = useRef(1);
@ -1544,16 +1559,36 @@ export function BotDashboardModule({
persistComposerDraft(selectedBotId, command, pendingAttachments);
}, [selectedBotId, composerDraftHydrated, command, pendingAttachments]);
useEffect(() => {
return () => {
clearVoiceTimer();
try {
if (voiceRecorderRef.current && voiceRecorderRef.current.state !== 'inactive') {
voiceRecorderRef.current.stop();
}
} catch {
// ignore
}
releaseVoiceStream();
};
}, []);
useEffect(() => {
if (!isVoiceRecording && !isVoiceTranscribing) {
setVoiceCountdown(voiceMaxSeconds);
}
}, [voiceMaxSeconds, isVoiceRecording, isVoiceTranscribing]);
useEffect(() => {
const hasDraft = Boolean(String(command || '').trim()) || pendingAttachments.length > 0 || Boolean(quotedReply);
if (!hasDraft && !isUploadingAttachments) return;
if (!hasDraft && !isUploadingAttachments && !isVoiceRecording && !isVoiceTranscribing) return;
const onBeforeUnload = (event: BeforeUnloadEvent) => {
event.preventDefault();
event.returnValue = '';
};
window.addEventListener('beforeunload', onBeforeUnload);
return () => window.removeEventListener('beforeunload', onBeforeUnload);
}, [command, pendingAttachments.length, quotedReply, isUploadingAttachments]);
}, [command, pendingAttachments.length, quotedReply, isUploadingAttachments, isVoiceRecording, isVoiceTranscribing]);
const syncChatScrollToBottom = useCallback((behavior: ScrollBehavior = 'auto') => {
const box = chatScrollRef.current;
@ -1580,6 +1615,9 @@ export function BotDashboardModule({
useEffect(() => {
setQuotedReply(null);
if (isVoiceRecording) {
stopVoiceRecording();
}
}, [selectedBotId]);
useEffect(() => {
@ -1637,9 +1675,21 @@ export function BotDashboardModule({
const loadSystemDefaults = async () => {
try {
const res = await axios.get<SystemDefaultsResponse>(`${APP_ENDPOINTS.apiBase}/system/defaults`);
if (!alive) return;
const configured = Number(res.data?.limits?.upload_max_mb);
if (!Number.isFinite(configured) || configured <= 0 || !alive) return;
setUploadMaxMb(Math.max(1, Math.floor(configured)));
if (Number.isFinite(configured) && configured > 0) {
setUploadMaxMb(Math.max(1, Math.floor(configured)));
}
const speechEnabledRaw = res.data?.speech?.enabled;
if (typeof speechEnabledRaw === 'boolean') {
setSpeechEnabled(speechEnabledRaw);
}
const speechSeconds = Number(res.data?.speech?.max_audio_seconds);
if (Number.isFinite(speechSeconds) && speechSeconds > 0) {
const normalized = Math.max(5, Math.floor(speechSeconds));
setVoiceMaxSeconds(normalized);
setVoiceCountdown(normalized);
}
} catch {
// keep default limit
}
@ -2642,8 +2692,155 @@ export function BotDashboardModule({
filePickerRef.current?.click();
};
// Stop the 1 s recording countdown interval, if one is running.
const clearVoiceTimer = () => {
  const timerId = voiceTimerRef.current;
  if (timerId) {
    window.clearInterval(timerId);
    voiceTimerRef.current = null;
  }
};
// Stop every microphone track and drop the stream reference so the
// browser's "recording" indicator turns off.
const releaseVoiceStream = () => {
  const stream = voiceStreamRef.current;
  if (!stream) return;
  for (const track of stream.getTracks()) {
    try {
      track.stop();
    } catch {
      // ignore
    }
  }
  voiceStreamRef.current = null;
};
// Upload a recorded audio blob to the backend STT endpoint and append the
// recognized text to the composer draft.
const transcribeVoiceBlob = async (blob: Blob) => {
  if (!selectedBot || blob.size <= 0) return;
  setIsVoiceTranscribing(true);
  try {
    // Derive a file extension from the blob MIME type (webm by default).
    const mime = String(blob.type || '').toLowerCase();
    const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm';
    const file = new File([blob], `voice-input-${Date.now()}.${ext}`, { type: blob.type || 'audio/webm' });
    const formData = new FormData();
    formData.append('file', file);
    // NOTE(review): language is hard-coded to 'zh' here even though the
    // backend accepts other hints — confirm this is intentional.
    formData.append('language', 'zh');
    const res = await axios.post<{ text?: string }>(
      `${APP_ENDPOINTS.apiBase}/bots/${selectedBot.id}/speech/transcribe`,
      formData,
      { timeout: 120000 },
    );
    const text = normalizeUserMessageText(String(res.data?.text || ''));
    if (!text) {
      notify(t.voiceTranscribeEmpty, { tone: 'warning' });
      return;
    }
    // Append to any existing draft on a new line instead of replacing it.
    setCommand((prev) => {
      const base = String(prev || '').trim();
      if (!base) return text;
      return `${base}\n${text}`;
    });
    window.requestAnimationFrame(() => composerTextareaRef.current?.focus());
    notify(t.voiceTranscribeDone, { tone: 'success' });
  } catch (error: any) {
    // Prefer the backend's `detail` message when one is available.
    const msg = String(error?.response?.data?.detail || '').trim();
    console.error('Speech transcription failed', {
      botId: selectedBot.id,
      message: msg || t.voiceTranscribeFail,
      status: error?.response?.status,
      response: error?.response?.data,
      error,
    });
    notify(msg || t.voiceTranscribeFail, { tone: 'error' });
  } finally {
    setIsVoiceTranscribing(false);
  }
};
// Ask the active MediaRecorder to stop; its onstop handler performs the
// actual teardown and kicks off transcription.
const stopVoiceRecording = () => {
  const active = voiceRecorderRef.current;
  if (active && active.state !== 'inactive') {
    try {
      active.stop();
    } catch {
      // ignore
    }
  }
};
// Request microphone access, start a MediaRecorder and a per-second
// countdown that auto-stops at the configured limit.
const startVoiceRecording = async () => {
  // Guard: need a selected bot, chat enabled, and no transcription in flight.
  if (!selectedBot || !canChat || isVoiceTranscribing) return;
  if (!speechEnabled) {
    notify(t.voiceUnavailable, { tone: 'warning' });
    return;
  }
  // Feature detection: getUserMedia and MediaRecorder must both exist.
  if (typeof window === 'undefined' || typeof navigator === 'undefined' || !navigator.mediaDevices?.getUserMedia) {
    notify(t.voiceUnsupported, { tone: 'error' });
    return;
  }
  if (typeof MediaRecorder === 'undefined') {
    notify(t.voiceUnsupported, { tone: 'error' });
    return;
  }
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Prefer opus/webm; fall back to whatever the browser supports.
    const mimeCandidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4'];
    const supportedMime = mimeCandidates.find((candidate) => MediaRecorder.isTypeSupported(candidate));
    const recorder = supportedMime
      ? new MediaRecorder(stream, { mimeType: supportedMime })
      : new MediaRecorder(stream);
    voiceStreamRef.current = stream;
    voiceRecorderRef.current = recorder;
    voiceChunksRef.current = [];
    setVoiceCountdown(voiceMaxSeconds);
    setIsVoiceRecording(true);
    recorder.ondataavailable = (event: BlobEvent) => {
      if (event.data && event.data.size > 0) {
        voiceChunksRef.current.push(event.data);
      }
    };
    recorder.onerror = () => {
      // Recording failed mid-way: reset UI state and release the mic.
      setIsVoiceRecording(false);
      clearVoiceTimer();
      releaseVoiceStream();
      notify(t.voiceRecordFail, { tone: 'error' });
    };
    recorder.onstop = () => {
      // Assemble the captured chunks, tear everything down, then transcribe.
      const blob = new Blob(voiceChunksRef.current, { type: supportedMime || recorder.mimeType || 'audio/webm' });
      voiceRecorderRef.current = null;
      voiceChunksRef.current = [];
      clearVoiceTimer();
      releaseVoiceStream();
      setIsVoiceRecording(false);
      setVoiceCountdown(voiceMaxSeconds);
      if (blob.size > 0) {
        void transcribeVoiceBlob(blob);
      }
    };
    // Emit a dataavailable event every 200 ms while recording.
    recorder.start(200);
    clearVoiceTimer();
    // Tick the countdown once per second; auto-stop at the limit.
    voiceTimerRef.current = window.setInterval(() => {
      setVoiceCountdown((prev) => {
        if (prev <= 1) {
          stopVoiceRecording();
          return 0;
        }
        return prev - 1;
      });
    }, 1000);
  } catch {
    // Most likely the user denied microphone permission.
    releaseVoiceStream();
    setIsVoiceRecording(false);
    clearVoiceTimer();
    notify(t.voicePermissionDenied, { tone: 'error' });
  }
};
// Mic button handler: toggles between starting and stopping a recording;
// ignored while a transcription request is in flight.
const onVoiceInput = () => {
  if (isVoiceTranscribing) return;
  if (isVoiceRecording) {
    stopVoiceRecording();
  } else {
    void startVoiceRecording();
  }
};
const onPickAttachments = async (event: ChangeEvent<HTMLInputElement>) => {
@ -3393,7 +3590,7 @@ export function BotDashboardModule({
value={command}
onChange={(e) => setCommand(e.target.value)}
onKeyDown={onComposerKeyDown}
disabled={!canChat}
disabled={!canChat || isVoiceRecording || isVoiceTranscribing}
placeholder={
canChat
? t.inputPlaceholder
@ -3401,18 +3598,54 @@ export function BotDashboardModule({
}
/>
<div className="ops-composer-tools-right">
<LucentIconButton
className="ops-composer-inline-btn"
disabled={!canChat}
{(isVoiceRecording || isVoiceTranscribing) ? (
<div className="ops-voice-inline" aria-live="polite">
<div className={`ops-voice-wave ${isVoiceRecording ? 'is-live' : ''} ${isCompactMobile ? 'is-mobile' : 'is-desktop'}`}>
{Array.from({ length: isCompactMobile ? 1 : 5 }).map((_, segmentIdx) => (
<div key={`vw-segment-${segmentIdx}`} className="ops-voice-wave-segment">
{Array.from({ length: isCompactMobile ? 28 : 18 }).map((_, idx) => {
const delayIndex = isCompactMobile
? idx
: (segmentIdx * 18) + idx;
return (
<i
key={`vw-inline-${segmentIdx}-${idx}`}
style={{ animationDelay: `${(delayIndex % 14) * 0.06}s` }}
/>
);
})}
</div>
))}
</div>
<div className="ops-voice-countdown mono">
{isVoiceRecording ? `${voiceCountdown}s` : t.voiceTranscribing}
</div>
</div>
) : null}
<button
className={`ops-composer-inline-btn ${isVoiceRecording ? 'is-recording' : ''}`}
disabled={!canChat || isVoiceTranscribing || (!speechEnabled && !isVoiceRecording)}
onClick={onVoiceInput}
tooltip={t.voiceInput}
aria-label={t.voiceInput}
aria-label={isVoiceRecording ? t.voiceStop : t.voiceStart}
title={
isVoiceTranscribing
? t.voiceTranscribing
: isVoiceRecording
? t.voiceStop
: t.voiceStart
}
>
<Mic size={16} />
</LucentIconButton>
{isVoiceTranscribing ? (
<RefreshCw size={16} className="animate-spin" />
) : isVoiceRecording ? (
<Square size={16} />
) : (
<Mic size={16} />
)}
</button>
<LucentIconButton
className="ops-composer-inline-btn"
disabled={!canChat || isUploadingAttachments}
disabled={!canChat || isUploadingAttachments || isVoiceRecording || isVoiceTranscribing}
onClick={triggerPickAttachments}
tooltip={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
aria-label={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
@ -3424,7 +3657,12 @@ export function BotDashboardModule({
disabled={
isChatEnabled && (isThinking || isSending)
? Boolean(interruptingByBot[selectedBot.id])
: (!isChatEnabled || (!command.trim() && pendingAttachments.length === 0 && !quotedReply))
: (
!isChatEnabled
|| isVoiceRecording
|| isVoiceTranscribing
|| (!command.trim() && pendingAttachments.length === 0 && !quotedReply)
)
}
onClick={() => void (isChatEnabled && (isThinking || isSending) ? interruptExecution() : send())}
aria-label={isChatEnabled && (isThinking || isSending) ? t.interrupt : t.send}

View File

@ -11,6 +11,7 @@ if [[ ! -f "$ENV_FILE" ]]; then
fi
echo "[deploy] using env: $ENV_FILE"
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" config -q
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" up -d --build
echo "[deploy] service status"

View File

@ -4,4 +4,9 @@ set -euo pipefail
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
ENV_FILE="${1:-$ROOT_DIR/.env.prod}"
if [[ ! -f "$ENV_FILE" ]]; then
echo "Missing env file: $ENV_FILE"
exit 1
fi
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" down