From 6795fedbfef8f062ca9894a6ba4a82e262a44c0a Mon Sep 17 00:00:00 2001 From: "mula.liu" Date: Thu, 12 Mar 2026 01:20:57 +0800 Subject: [PATCH] v0.1.4-p1 --- .env.prod.example | 12 + README.md | 3 + backend/.env.example | 12 + backend/Dockerfile | 3 + backend/core/settings.py | 21 ++ backend/core/speech_service.py | 259 +++++++++++++++++ backend/main.py | 119 +++++++- backend/requirements.txt | 2 + docker-compose.prod.yml | 10 + frontend/src/i18n/dashboard.en.ts | 14 +- frontend/src/i18n/dashboard.zh-cn.ts | 14 +- .../modules/dashboard/BotDashboardModule.css | 131 ++++++++- .../modules/dashboard/BotDashboardModule.tsx | 268 +++++++++++++++++- scripts/deploy-prod.sh | 1 + scripts/stop-prod.sh | 5 + 15 files changed, 853 insertions(+), 21 deletions(-) create mode 100644 backend/core/speech_service.py diff --git a/.env.prod.example b/.env.prod.example index 7d64a0d..a11a33c 100644 --- a/.env.prod.example +++ b/.env.prod.example @@ -43,3 +43,15 @@ PANEL_ACCESS_PASSWORD=change_me_panel_password # Max upload size for backend validation (MB) UPLOAD_MAX_MB=200 + +# Local speech-to-text (Whisper via whisper.cpp model file) +STT_ENABLED=true +STT_MODEL=ggml-small-q8_0.bin +STT_MODEL_DIR=${HOST_DATA_ROOT}/model +STT_DEVICE=cpu +STT_MAX_AUDIO_SECONDS=20 +STT_DEFAULT_LANGUAGE=zh +STT_FORCE_SIMPLIFIED=true +STT_AUDIO_PREPROCESS=true +STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20 +STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。 diff --git a/README.md b/README.md index 76305c2..4ecaa53 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,8 @@ graph TD - 配置绝对路径: - `HOST_DATA_ROOT` - `HOST_BOTS_WORKSPACE_ROOT` + - 如启用本地语音识别,请将 Whisper `.bin` 模型文件放到 `${HOST_DATA_ROOT}/model/` + 并让 `STT_MODEL` 指向完整文件名,例如 `ggml-small-q8_0.bin` - 中国网络建议配置加速项: - `PIP_INDEX_URL`、`PIP_TRUSTED_HOST` - `NPM_REGISTRY` @@ -120,3 +122,4 @@ graph TD - 必须挂载 `/var/run/docker.sock`,否则后端无法操作 Bot 镜像与容器。 - `HOST_BOTS_WORKSPACE_ROOT` 必须是宿主机绝对路径,并且在 `docker-compose.prod.yml` 
中以“同路径”挂载到后端容器。 原因:后端通过 Docker API 创建 Bot 容器时,使用的是宿主机可见的 bind 路径。 +- 语音识别当前基于 `pywhispercpp==1.3.1` + Whisper `.bin` 模型文件,不使用 `faster-whisper`。 diff --git a/backend/.env.example b/backend/.env.example index 2c691f8..2728948 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -27,6 +27,18 @@ PANEL_ACCESS_PASSWORD= # Max upload size for backend validation (MB) UPLOAD_MAX_MB=100 +# Local speech-to-text (Whisper via whisper.cpp model file) +STT_ENABLED=true +STT_MODEL=ggml-small-q8_0.bin +STT_MODEL_DIR=../data/model +STT_DEVICE=cpu +STT_MAX_AUDIO_SECONDS=20 +STT_DEFAULT_LANGUAGE=zh +STT_FORCE_SIMPLIFIED=true +STT_AUDIO_PREPROCESS=true +STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20 +STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。 + # Local backend server options (for `python3 main.py`) APP_HOST=0.0.0.0 APP_PORT=8000 diff --git a/backend/Dockerfile b/backend/Dockerfile index 788e247..a96c58c 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -13,6 +13,9 @@ ARG PIP_TRUSTED_HOST= COPY backend/requirements.txt ./requirements.txt RUN if [ -n "${PIP_INDEX_URL}" ]; then pip config set global.index-url "${PIP_INDEX_URL}"; fi \ && if [ -n "${PIP_TRUSTED_HOST}" ]; then pip config set global.trusted-host "${PIP_TRUSTED_HOST}"; fi \ + && apt-get update \ + && apt-get install -y --no-install-recommends ffmpeg \ + && rm -rf /var/lib/apt/lists/* \ && pip install --upgrade pip \ && pip install -r requirements.txt diff --git a/backend/core/settings.py b/backend/core/settings.py index 58bd1cc..cf98ecd 100644 --- a/backend/core/settings.py +++ b/backend/core/settings.py @@ -48,6 +48,7 @@ def _normalize_dir_path(path_value: str) -> str: raw = str(path_value or "").strip() if not raw: return raw + raw = os.path.expandvars(os.path.expanduser(raw)) p = Path(raw) if p.is_absolute(): return str(p) @@ -117,6 +118,26 @@ DATABASE_ENGINE: Final[str] = _database_engine(DATABASE_URL) DATABASE_URL_DISPLAY: Final[str] = 
_mask_database_url(DATABASE_URL) DATABASE_ECHO: Final[bool] = _env_bool("DATABASE_ECHO", True) UPLOAD_MAX_MB: Final[int] = _env_int("UPLOAD_MAX_MB", 100, 1, 2048) +STT_ENABLED: Final[bool] = _env_bool("STT_ENABLED", True) +STT_MODEL: Final[str] = str(os.getenv("STT_MODEL") or "ggml-small-q8_0.bin").strip() +_DEFAULT_STT_MODEL_DIR: Final[Path] = (Path(DATA_ROOT) / "model").resolve() +_configured_stt_model_dir = _normalize_dir_path(os.getenv("STT_MODEL_DIR", str(_DEFAULT_STT_MODEL_DIR))) +if _configured_stt_model_dir and not Path(_configured_stt_model_dir).exists() and _DEFAULT_STT_MODEL_DIR.exists(): + STT_MODEL_DIR: Final[str] = str(_DEFAULT_STT_MODEL_DIR) +else: + STT_MODEL_DIR: Final[str] = _configured_stt_model_dir +STT_DEVICE: Final[str] = str(os.getenv("STT_DEVICE") or "cpu").strip().lower() or "cpu" +STT_MAX_AUDIO_SECONDS: Final[int] = _env_int("STT_MAX_AUDIO_SECONDS", 20, 5, 600) +STT_DEFAULT_LANGUAGE: Final[str] = str(os.getenv("STT_DEFAULT_LANGUAGE") or "zh").strip().lower() or "zh" +STT_FORCE_SIMPLIFIED: Final[bool] = _env_bool("STT_FORCE_SIMPLIFIED", True) +STT_AUDIO_PREPROCESS: Final[bool] = _env_bool("STT_AUDIO_PREPROCESS", True) +STT_AUDIO_FILTER: Final[str] = str( + os.getenv("STT_AUDIO_FILTER") or "highpass=f=120,lowpass=f=7600,afftdn=nf=-20" +).strip() +STT_INITIAL_PROMPT: Final[str] = str( + os.getenv("STT_INITIAL_PROMPT") + or "以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。" +).strip() REDIS_ENABLED: Final[bool] = _env_bool("REDIS_ENABLED", False) REDIS_URL: Final[str] = str(os.getenv("REDIS_URL") or "").strip() diff --git a/backend/core/speech_service.py b/backend/core/speech_service.py new file mode 100644 index 0000000..177d833 --- /dev/null +++ b/backend/core/speech_service.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +import threading +from pathlib import Path +from typing import Any, Dict, Optional + +from core.settings import ( + STT_AUDIO_FILTER, + 
STT_AUDIO_PREPROCESS, + STT_DEVICE, + STT_ENABLED, + STT_FORCE_SIMPLIFIED, + STT_INITIAL_PROMPT, + STT_MAX_AUDIO_SECONDS, + STT_MODEL, + STT_MODEL_DIR, +) + + +class SpeechServiceError(RuntimeError): + pass + + +class SpeechDisabledError(SpeechServiceError): + pass + + +class SpeechDurationError(SpeechServiceError): + pass + + +class WhisperSpeechService: + def __init__(self) -> None: + self._model: Any = None + self._model_source: str = "" + self._backend: str = "" + self._model_lock = threading.Lock() + + def _resolve_model_source(self) -> str: + model = str(STT_MODEL or "").strip() + model_dir = str(STT_MODEL_DIR or "").strip() + + if not model: + raise SpeechServiceError( + "STT_MODEL is empty. Please set the full model file name, e.g. ggml-small-q8_0.bin." + ) + + # If STT_MODEL itself is an absolute/relative path, use it directly. + if any(sep in model for sep in ("/", "\\")): + direct = Path(model).expanduser() + if not direct.exists() or not direct.is_file(): + raise SpeechServiceError(f"STT model file not found: {direct}") + if direct.suffix.lower() != ".bin": + raise SpeechServiceError( + "STT_MODEL must point to a whisper.cpp ggml .bin model file." + ) + return str(direct.resolve()) + + # Strict mode: only exact filename, no alias/auto detection. + if Path(model).suffix.lower() != ".bin": + raise SpeechServiceError( + "STT_MODEL must be the exact model file name (with .bin), e.g. ggml-small-q8_0.bin." 
+ ) + + if not model_dir: + raise SpeechServiceError("STT_MODEL_DIR is empty.") + root = Path(model_dir).expanduser() + if not root.exists() or not root.is_dir(): + raise SpeechServiceError(f"STT_MODEL_DIR does not exist: {root}") + candidate = root / model + if not candidate.exists() or not candidate.is_file(): + raise SpeechServiceError( + f"STT model file not found under STT_MODEL_DIR: {candidate}" + ) + return str(candidate.resolve()) + + def _load_model(self) -> Any: + model_source = self._resolve_model_source() + if self._model is not None and self._model_source == model_source: + return self._model + with self._model_lock: + if self._model is not None and self._model_source == model_source: + return self._model + try: + from pywhispercpp.model import Model # type: ignore + except Exception as exc: + raise SpeechServiceError( + "pywhispercpp is not installed in the active backend environment. " + "Run pip install -r backend/requirements.txt or rebuild the backend image." + ) from exc + self._model = Model( + model_source, + print_realtime=False, + print_progress=False, + ) + self._backend = "pywhispercpp" + self._model_source = model_source + return self._model + + @staticmethod + def _preprocess_audio(file_path: str) -> str: + target = str(file_path or "").strip() + if not STT_AUDIO_PREPROCESS or not target or not os.path.isfile(target): + return target + if shutil.which("ffmpeg") is None: + return target + tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav", prefix=".speech_clean_") + tmp_path = tmp.name + tmp.close() + cmd = [ + "ffmpeg", + "-y", + "-i", + target, + "-vn", + "-ac", + "1", + "-ar", + "16000", + ] + audio_filter = str(STT_AUDIO_FILTER or "").strip() + if audio_filter: + cmd.extend(["-af", audio_filter]) + cmd.extend(["-c:a", "pcm_s16le", tmp_path]) + try: + completed = subprocess.run( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=False, + ) + if completed.returncode != 0 or not os.path.exists(tmp_path) or 
os.path.getsize(tmp_path) <= 0: + if os.path.exists(tmp_path): + os.remove(tmp_path) + return target + return tmp_path + except Exception: + if os.path.exists(tmp_path): + os.remove(tmp_path) + return target + + @staticmethod + def _probe_audio_duration_seconds(file_path: str) -> Optional[float]: + try: + import av # type: ignore + + with av.open(file_path) as container: + if container.duration is not None: + # container.duration is in av.time_base units. + return max(0.0, float(container.duration / av.time_base)) + for stream in container.streams: + if stream.type != "audio": + continue + if stream.duration is not None and stream.time_base is not None: + return max(0.0, float(stream.duration * stream.time_base)) + except Exception: + return None + return None + + @staticmethod + def _normalize_text(text: str) -> str: + content = str(text or "").strip() + if not content or not STT_FORCE_SIMPLIFIED: + return content + try: + from opencc_purepy import OpenCC # type: ignore + + return str(OpenCC("t2s").convert(content) or "").strip() or content + except Exception: + return content + + @staticmethod + def _filter_supported_transcribe_kwargs(model: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]: + try: + available = set(model.get_params().keys()) + except Exception: + return kwargs + return {key: value for key, value in kwargs.items() if key in available} + + def transcribe_file(self, file_path: str, language: Optional[str] = None) -> Dict[str, Any]: + if not STT_ENABLED: + raise SpeechDisabledError("Speech-to-text is disabled") + target = str(file_path or "").strip() + if not target or not os.path.isfile(target): + raise SpeechServiceError("Audio file not found") + + duration_seconds = self._probe_audio_duration_seconds(target) + if duration_seconds is not None and duration_seconds > float(STT_MAX_AUDIO_SECONDS) + 0.3: + raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds") + + prepared_target = self._preprocess_audio(target) + try: + 
model = self._load_model() + lang = str(language or "").strip().lower() + normalized_lang: Optional[str] = None + if lang and lang not in {"auto", "null", "none"}: + normalized_lang = lang + + max_end = 0.0 + detected_language = "" + texts = [] + kwargs: Dict[str, Any] = { + "print_realtime": False, + "print_progress": False, + "no_context": True, + "suppress_non_speech_tokens": True, + } + if normalized_lang: + kwargs["language"] = normalized_lang + initial_prompt = str(STT_INITIAL_PROMPT or "").strip() + if initial_prompt: + kwargs["initial_prompt"] = initial_prompt + kwargs = self._filter_supported_transcribe_kwargs(model, kwargs) + try: + segments = model.transcribe(prepared_target, **kwargs) + except Exception as exc: + raise SpeechServiceError( + f"pywhispercpp transcription failed: {exc}. " + "If input is not wav, install ffmpeg in runtime image." + ) from exc + for segment in segments: + txt = str(getattr(segment, "text", "") or "").strip() + if txt: + texts.append(txt) + if normalized_lang: + detected_language = normalized_lang + try: + max_end = max(max_end, float(getattr(segment, "t1", 0.0) or 0.0) / 100.0) + except Exception: + pass + if max_end > float(STT_MAX_AUDIO_SECONDS) + 0.3: + raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds") + + text = self._normalize_text(" ".join(texts).strip()) + if not text: + raise SpeechServiceError("No speech detected") + + if duration_seconds is None: + duration_seconds = max_end if max_end > 0 else None + + return { + "text": text, + "language": detected_language or None, + "duration_seconds": duration_seconds, + "max_audio_seconds": STT_MAX_AUDIO_SECONDS, + "model": STT_MODEL, + "device": STT_DEVICE, + "backend": self._backend or "unknown", + } + finally: + if prepared_target != target and os.path.exists(prepared_target): + try: + os.remove(prepared_target) + except Exception: + pass diff --git a/backend/main.py b/backend/main.py index f991041..d0484e3 100644 --- a/backend/main.py 
+++ b/backend/main.py @@ -1,5 +1,6 @@ import asyncio import json +import logging import mimetypes import os import re @@ -12,7 +13,7 @@ from urllib.parse import unquote import httpx from pydantic import BaseModel -from fastapi import Depends, FastAPI, File, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect +from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect from fastapi.responses import FileResponse, JSONResponse, StreamingResponse from fastapi.middleware.cors import CORSMiddleware from sqlmodel import Session, select @@ -21,6 +22,12 @@ from core.config_manager import BotConfigManager from core.cache import cache from core.database import engine, get_session, init_database from core.docker_manager import BotDockerManager +from core.speech_service import ( + SpeechDisabledError, + SpeechDurationError, + SpeechServiceError, + WhisperSpeechService, +) from core.settings import ( BOTS_WORKSPACE_ROOT, DATA_ROOT, @@ -37,11 +44,17 @@ from core.settings import ( REDIS_ENABLED, REDIS_PREFIX, REDIS_URL, + STT_DEVICE, + STT_DEFAULT_LANGUAGE, + STT_ENABLED, + STT_MAX_AUDIO_SECONDS, + STT_MODEL, UPLOAD_MAX_MB, ) from models.bot import BotInstance, BotMessage, NanobotImage app = FastAPI(title="Dashboard Nanobot API") +logger = logging.getLogger("dashboard.backend") app.add_middleware( CORSMiddleware, @@ -55,6 +68,7 @@ os.makedirs(DATA_ROOT, exist_ok=True) docker_manager = BotDockerManager(host_data_root=BOTS_WORKSPACE_ROOT) config_manager = BotConfigManager(host_data_root=BOTS_WORKSPACE_ROOT) +speech_service = WhisperSpeechService() BOT_ID_PATTERN = re.compile(r"^[A-Za-z0-9_]+$") @@ -501,6 +515,13 @@ def get_system_defaults(): "limits": { "upload_max_mb": UPLOAD_MAX_MB, }, + "speech": { + "enabled": STT_ENABLED, + "model": STT_MODEL, + "device": STT_DEVICE, + "max_audio_seconds": STT_MAX_AUDIO_SECONDS, + "default_language": STT_DEFAULT_LANGUAGE, + }, } @@ -3117,6 +3138,102 @@ async def 
upload_workspace_files( return {"bot_id": bot_id, "files": rows} +@app.post("/api/bots/{bot_id}/speech/transcribe") +async def transcribe_bot_speech( + bot_id: str, + file: UploadFile = File(...), + language: Optional[str] = Form(None), + session: Session = Depends(get_session), +): + bot = session.get(BotInstance, bot_id) + if not bot: + raise HTTPException(status_code=404, detail="Bot not found") + if not STT_ENABLED: + raise HTTPException(status_code=400, detail="Speech recognition is disabled") + if not file: + raise HTTPException(status_code=400, detail="no audio file uploaded") + + original_name = str(file.filename or "audio.webm").strip() or "audio.webm" + safe_name = os.path.basename(original_name).replace("\\", "_").replace("/", "_") + ext = os.path.splitext(safe_name)[1].strip().lower() or ".webm" + if len(ext) > 12: + ext = ".webm" + + tmp_path = "" + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix=".speech_", dir=DATA_ROOT) as tmp: + tmp_path = tmp.name + while True: + chunk = await file.read(1024 * 1024) + if not chunk: + break + tmp.write(chunk) + + if not tmp_path or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0: + raise HTTPException(status_code=400, detail="audio payload is empty") + + resolved_language = str(language or "").strip() or STT_DEFAULT_LANGUAGE + result = await asyncio.to_thread(speech_service.transcribe_file, tmp_path, resolved_language) + text = str(result.get("text") or "").strip() + if not text: + raise HTTPException(status_code=400, detail="No speech detected") + return { + "bot_id": bot_id, + "text": text, + "duration_seconds": result.get("duration_seconds"), + "max_audio_seconds": STT_MAX_AUDIO_SECONDS, + "model": STT_MODEL, + "device": STT_DEVICE, + "language": result.get("language") or resolved_language, + } + except SpeechDisabledError as exc: + logger.warning( + "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s", + bot_id, + safe_name, + language, + exc, + ) + raise 
HTTPException(status_code=400, detail=str(exc)) + except SpeechDurationError: + logger.warning( + "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s", + bot_id, + safe_name, + language, + STT_MAX_AUDIO_SECONDS, + ) + raise HTTPException(status_code=413, detail=f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds") + except SpeechServiceError as exc: + logger.exception( + "speech transcribe failed bot_id=%s file=%s language=%s", + bot_id, + safe_name, + language, + ) + raise HTTPException(status_code=400, detail=str(exc)) + except HTTPException: + raise + except Exception as exc: + logger.exception( + "speech transcribe unexpected error bot_id=%s file=%s language=%s", + bot_id, + safe_name, + language, + ) + raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}") + finally: + try: + await file.close() + except Exception: + pass + if tmp_path and os.path.exists(tmp_path): + try: + os.remove(tmp_path) + except Exception: + pass + + @app.websocket("/ws/monitor/{bot_id}") async def websocket_endpoint(websocket: WebSocket, bot_id: str): with Session(engine) as session: diff --git a/backend/requirements.txt b/backend/requirements.txt index e762f4d..e89a512 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -15,3 +15,5 @@ watchfiles==0.21.0 urllib3==1.26.18 requests==2.31.0 redis==5.0.8 +opencc-purepy==1.1.0 +pywhispercpp==1.3.1 diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index c05a3dc..f715af3 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -24,6 +24,16 @@ services: REDIS_PREFIX: ${REDIS_PREFIX:-dashboard_nanobot} REDIS_DEFAULT_TTL: ${REDIS_DEFAULT_TTL:-60} PANEL_ACCESS_PASSWORD: ${PANEL_ACCESS_PASSWORD:-} + STT_ENABLED: ${STT_ENABLED:-true} + STT_MODEL: ${STT_MODEL:-ggml-small-q8_0.bin} + STT_MODEL_DIR: ${STT_MODEL_DIR:-${HOST_DATA_ROOT}/model} + STT_DEVICE: ${STT_DEVICE:-cpu} + STT_MAX_AUDIO_SECONDS: ${STT_MAX_AUDIO_SECONDS:-20} + STT_DEFAULT_LANGUAGE: 
${STT_DEFAULT_LANGUAGE:-zh} + STT_FORCE_SIMPLIFIED: ${STT_FORCE_SIMPLIFIED:-true} + STT_AUDIO_PREPROCESS: ${STT_AUDIO_PREPROCESS:-true} + STT_AUDIO_FILTER: ${STT_AUDIO_FILTER:-highpass=f=120,lowpass=f=7600,afftdn=nf=-20} + STT_INITIAL_PROMPT: ${STT_INITIAL_PROMPT:-以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。} volumes: - /var/run/docker.sock:/var/run/docker.sock - ${HOST_DATA_ROOT}:${HOST_DATA_ROOT} diff --git a/frontend/src/i18n/dashboard.en.ts b/frontend/src/i18n/dashboard.en.ts index a7fa71f..e8aa3eb 100644 --- a/frontend/src/i18n/dashboard.en.ts +++ b/frontend/src/i18n/dashboard.en.ts @@ -28,7 +28,19 @@ export const dashboardEn = { copyPromptFail: 'Failed to copy prompt.', editPromptDone: 'Inserted into composer.', voiceInput: 'Voice input', - voiceUnavailable: 'Voice input is not available yet.', + textInput: 'Text input', + voiceUnavailable: 'Speech recognition is disabled.', + voiceUnsupported: 'Your browser does not support audio recording.', + voicePermissionDenied: 'Microphone permission denied. Please allow access in browser settings.', + voiceRecordFail: 'Audio recording failed. 
Please retry.', + voiceReady: 'Click the mic to start recording', + voiceRecording: 'Recording...', + voiceTranscribing: 'Transcribing...', + voiceStart: 'Start recording', + voiceStop: 'Stop recording', + voiceTranscribeDone: 'Voice converted to text.', + voiceTranscribeEmpty: 'No valid speech detected.', + voiceTranscribeFail: 'Speech transcription failed.', copyReply: 'Copy reply', copyReplyDone: 'Reply copied.', copyReplyFail: 'Failed to copy reply.', diff --git a/frontend/src/i18n/dashboard.zh-cn.ts b/frontend/src/i18n/dashboard.zh-cn.ts index c219e6e..71be317 100644 --- a/frontend/src/i18n/dashboard.zh-cn.ts +++ b/frontend/src/i18n/dashboard.zh-cn.ts @@ -28,7 +28,19 @@ export const dashboardZhCn = { copyPromptFail: '复制指令失败。', editPromptDone: '已填入输入框。', voiceInput: '语音输入', - voiceUnavailable: '语音输入暂未接入。', + textInput: '文字输入', + voiceUnavailable: '语音识别未启用。', + voiceUnsupported: '当前浏览器不支持录音。', + voicePermissionDenied: '麦克风权限被拒绝,请在浏览器设置中允许访问。', + voiceRecordFail: '录音失败,请重试。', + voiceReady: '点击麦克风开始录音', + voiceRecording: '录音中...', + voiceTranscribing: '语音识别中...', + voiceStart: '开始录音', + voiceStop: '停止录音', + voiceTranscribeDone: '语音已转为文本。', + voiceTranscribeEmpty: '未识别到有效语音内容。', + voiceTranscribeFail: '语音识别失败。', copyReply: '复制回复', copyReplyDone: '回复已复制。', copyReplyFail: '复制回复失败。', diff --git a/frontend/src/modules/dashboard/BotDashboardModule.css b/frontend/src/modules/dashboard/BotDashboardModule.css index 7dec514..b749511 100644 --- a/frontend/src/modules/dashboard/BotDashboardModule.css +++ b/frontend/src/modules/dashboard/BotDashboardModule.css @@ -1085,17 +1085,126 @@ padding: 14px 120px 42px 14px; } +.ops-voice-panel { + min-height: 96px; + border: 1px dashed color-mix(in oklab, var(--line) 72%, var(--brand) 28%); + border-radius: 12px; + background: color-mix(in oklab, var(--panel) 78%, var(--panel-soft) 22%); + padding: 12px 14px 12px 14px; + display: grid; + align-content: center; + gap: 10px; +} + +.ops-voice-title { + font-size: 13px; + font-weight: 700; 
+ color: var(--muted); +} + +.ops-voice-wave { + height: 28px; + border-radius: 999px; + border: 1px solid color-mix(in oklab, var(--line) 76%, transparent); + background: color-mix(in oklab, var(--panel-soft) 78%, var(--panel) 22%); + display: flex; + align-items: center; + gap: 8px; + padding: 0 6px; + overflow: hidden; + flex: 1 1 auto; + min-width: 0; +} + +.ops-voice-wave-segment { + height: 100%; + min-width: 0; + display: flex; + align-items: center; + justify-content: space-between; + gap: 2px; + padding: 0 6px; + border-radius: 999px; + background: color-mix(in oklab, var(--panel) 60%, rgba(255, 255, 255, 0.18) 40%); +} + +.ops-voice-wave.is-mobile .ops-voice-wave-segment { + flex: 1 1 auto; +} + +.ops-voice-wave.is-desktop .ops-voice-wave-segment { + flex: 1 1 0; +} + +.ops-voice-wave-segment i { + display: inline-block; + width: 3px; + min-width: 3px; + height: 10px; + border-radius: 999px; + background: color-mix(in oklab, var(--line) 72%, var(--text) 28%); + opacity: 0.72; +} + +.ops-voice-wave-segment i:nth-child(3n) { + height: 14px; +} + +.ops-voice-wave-segment i:nth-child(4n) { + height: 18px; +} + +.ops-voice-wave-segment i:nth-child(5n) { + height: 12px; +} + +.ops-voice-wave.is-live .ops-voice-wave-segment i { + background: color-mix(in oklab, var(--brand) 60%, #8ec3ff 40%); + animation: ops-voice-wave 1.05s ease-in-out infinite; +} + +.ops-voice-countdown { + flex: 0 0 auto; + font-size: 13px; + font-weight: 700; + color: var(--title); + min-width: 44px; + text-align: right; +} + .ops-composer-tools-right { position: absolute; bottom: 14px; - display: inline-flex; + left: 12px; + right: 12px; + display: flex; align-items: center; + justify-content: flex-end; gap: 6px; } .ops-composer-tools-right { - right: 12px; - max-width: calc(100% - 24px); + width: auto; +} + +.ops-voice-inline { + min-width: 0; + flex: 1 1 auto; + display: flex; + align-items: center; + gap: 8px; + margin-right: 4px; +} + +@media (max-width: 720px) { + .ops-voice-wave { + 
gap: 4px; + padding: 0 4px; + } + + .ops-voice-wave-segment { + padding: 0 4px; + } } .ops-composer-inline-btn { @@ -1116,6 +1225,11 @@ color: var(--icon); } +.ops-composer-inline-btn.is-active { + background: color-mix(in oklab, var(--brand-soft) 42%, var(--panel) 58%); + color: var(--brand); +} + .ops-composer-submit-btn { width: 34px; height: 34px; @@ -1224,6 +1338,17 @@ 100% { transform: translateX(430%); } } +@keyframes ops-voice-wave { + 0%, 100% { + transform: scaleY(0.55); + opacity: 0.35; + } + 50% { + transform: scaleY(1.95); + opacity: 1; + } +} + .ops-pending-chip { display: inline-flex; align-items: center; diff --git a/frontend/src/modules/dashboard/BotDashboardModule.tsx b/frontend/src/modules/dashboard/BotDashboardModule.tsx index dab392c..4f4daa5 100644 --- a/frontend/src/modules/dashboard/BotDashboardModule.tsx +++ b/frontend/src/modules/dashboard/BotDashboardModule.tsx @@ -249,6 +249,12 @@ interface SystemDefaultsResponse { limits?: { upload_max_mb?: number; }; + speech?: { + enabled?: boolean; + model?: string; + device?: string; + max_audio_seconds?: number; + }; } type BotEnvParams = Record; @@ -719,6 +725,11 @@ export function BotDashboardModule({ const fileNotPreviewableLabel = locale === 'zh' ? 
'当前文件类型不支持预览' : 'This file type is not previewable'; const [selectedBotId, setSelectedBotId] = useState(''); const [command, setCommand] = useState(''); + const [speechEnabled, setSpeechEnabled] = useState(true); + const [voiceMaxSeconds, setVoiceMaxSeconds] = useState(20); + const [isVoiceRecording, setIsVoiceRecording] = useState(false); + const [isVoiceTranscribing, setIsVoiceTranscribing] = useState(false); + const [voiceCountdown, setVoiceCountdown] = useState(20); const [isSaving, setIsSaving] = useState(false); const [showBaseModal, setShowBaseModal] = useState(false); const [showParamModal, setShowParamModal] = useState(false); @@ -798,6 +809,10 @@ export function BotDashboardModule({ const [feedbackSavingByMessageId, setFeedbackSavingByMessageId] = useState>({}); const [showRuntimeActionModal, setShowRuntimeActionModal] = useState(false); const [workspaceHoverCard, setWorkspaceHoverCard] = useState(null); + const voiceRecorderRef = useRef(null); + const voiceStreamRef = useRef(null); + const voiceChunksRef = useRef([]); + const voiceTimerRef = useRef(null); const runtimeMenuRef = useRef(null); const botOrderRef = useRef>({}); const nextBotOrderRef = useRef(1); @@ -1544,16 +1559,36 @@ export function BotDashboardModule({ persistComposerDraft(selectedBotId, command, pendingAttachments); }, [selectedBotId, composerDraftHydrated, command, pendingAttachments]); + useEffect(() => { + return () => { + clearVoiceTimer(); + try { + if (voiceRecorderRef.current && voiceRecorderRef.current.state !== 'inactive') { + voiceRecorderRef.current.stop(); + } + } catch { + // ignore + } + releaseVoiceStream(); + }; + }, []); + + useEffect(() => { + if (!isVoiceRecording && !isVoiceTranscribing) { + setVoiceCountdown(voiceMaxSeconds); + } + }, [voiceMaxSeconds, isVoiceRecording, isVoiceTranscribing]); + useEffect(() => { const hasDraft = Boolean(String(command || '').trim()) || pendingAttachments.length > 0 || Boolean(quotedReply); - if (!hasDraft && !isUploadingAttachments) 
return; + if (!hasDraft && !isUploadingAttachments && !isVoiceRecording && !isVoiceTranscribing) return; const onBeforeUnload = (event: BeforeUnloadEvent) => { event.preventDefault(); event.returnValue = ''; }; window.addEventListener('beforeunload', onBeforeUnload); return () => window.removeEventListener('beforeunload', onBeforeUnload); - }, [command, pendingAttachments.length, quotedReply, isUploadingAttachments]); + }, [command, pendingAttachments.length, quotedReply, isUploadingAttachments, isVoiceRecording, isVoiceTranscribing]); const syncChatScrollToBottom = useCallback((behavior: ScrollBehavior = 'auto') => { const box = chatScrollRef.current; @@ -1580,6 +1615,9 @@ export function BotDashboardModule({ useEffect(() => { setQuotedReply(null); + if (isVoiceRecording) { + stopVoiceRecording(); + } }, [selectedBotId]); useEffect(() => { @@ -1637,9 +1675,21 @@ export function BotDashboardModule({ const loadSystemDefaults = async () => { try { const res = await axios.get(`${APP_ENDPOINTS.apiBase}/system/defaults`); + if (!alive) return; const configured = Number(res.data?.limits?.upload_max_mb); - if (!Number.isFinite(configured) || configured <= 0 || !alive) return; - setUploadMaxMb(Math.max(1, Math.floor(configured))); + if (Number.isFinite(configured) && configured > 0) { + setUploadMaxMb(Math.max(1, Math.floor(configured))); + } + const speechEnabledRaw = res.data?.speech?.enabled; + if (typeof speechEnabledRaw === 'boolean') { + setSpeechEnabled(speechEnabledRaw); + } + const speechSeconds = Number(res.data?.speech?.max_audio_seconds); + if (Number.isFinite(speechSeconds) && speechSeconds > 0) { + const normalized = Math.max(5, Math.floor(speechSeconds)); + setVoiceMaxSeconds(normalized); + setVoiceCountdown(normalized); + } } catch { // keep default limit } @@ -2642,8 +2692,155 @@ export function BotDashboardModule({ filePickerRef.current?.click(); }; + const clearVoiceTimer = () => { + if (voiceTimerRef.current) { + 
window.clearInterval(voiceTimerRef.current); + voiceTimerRef.current = null; + } + }; + + const releaseVoiceStream = () => { + if (voiceStreamRef.current) { + voiceStreamRef.current.getTracks().forEach((track) => { + try { + track.stop(); + } catch { + // ignore + } + }); + voiceStreamRef.current = null; + } + }; + + const transcribeVoiceBlob = async (blob: Blob) => { + if (!selectedBot || blob.size <= 0) return; + setIsVoiceTranscribing(true); + try { + const mime = String(blob.type || '').toLowerCase(); + const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm'; + const file = new File([blob], `voice-input-${Date.now()}.${ext}`, { type: blob.type || 'audio/webm' }); + const formData = new FormData(); + formData.append('file', file); + formData.append('language', 'zh'); + const res = await axios.post<{ text?: string }>( + `${APP_ENDPOINTS.apiBase}/bots/${selectedBot.id}/speech/transcribe`, + formData, + { timeout: 120000 }, + ); + const text = normalizeUserMessageText(String(res.data?.text || '')); + if (!text) { + notify(t.voiceTranscribeEmpty, { tone: 'warning' }); + return; + } + setCommand((prev) => { + const base = String(prev || '').trim(); + if (!base) return text; + return `${base}\n${text}`; + }); + window.requestAnimationFrame(() => composerTextareaRef.current?.focus()); + notify(t.voiceTranscribeDone, { tone: 'success' }); + } catch (error: any) { + const msg = String(error?.response?.data?.detail || '').trim(); + console.error('Speech transcription failed', { + botId: selectedBot.id, + message: msg || t.voiceTranscribeFail, + status: error?.response?.status, + response: error?.response?.data, + error, + }); + notify(msg || t.voiceTranscribeFail, { tone: 'error' }); + } finally { + setIsVoiceTranscribing(false); + } + }; + + const stopVoiceRecording = () => { + const recorder = voiceRecorderRef.current; + if (!recorder || recorder.state === 'inactive') return; + try { + recorder.stop(); + } catch { + // ignore + } + }; + + const 
startVoiceRecording = async () => { + if (!selectedBot || !canChat || isVoiceTranscribing) return; + if (!speechEnabled) { + notify(t.voiceUnavailable, { tone: 'warning' }); + return; + } + if (typeof window === 'undefined' || typeof navigator === 'undefined' || !navigator.mediaDevices?.getUserMedia) { + notify(t.voiceUnsupported, { tone: 'error' }); + return; + } + if (typeof MediaRecorder === 'undefined') { + notify(t.voiceUnsupported, { tone: 'error' }); + return; + } + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + const mimeCandidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4']; + const supportedMime = mimeCandidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)); + const recorder = supportedMime + ? new MediaRecorder(stream, { mimeType: supportedMime }) + : new MediaRecorder(stream); + voiceStreamRef.current = stream; + voiceRecorderRef.current = recorder; + voiceChunksRef.current = []; + setVoiceCountdown(voiceMaxSeconds); + setIsVoiceRecording(true); + + recorder.ondataavailable = (event: BlobEvent) => { + if (event.data && event.data.size > 0) { + voiceChunksRef.current.push(event.data); + } + }; + recorder.onerror = () => { + setIsVoiceRecording(false); + clearVoiceTimer(); + releaseVoiceStream(); + notify(t.voiceRecordFail, { tone: 'error' }); + }; + recorder.onstop = () => { + const blob = new Blob(voiceChunksRef.current, { type: supportedMime || recorder.mimeType || 'audio/webm' }); + voiceRecorderRef.current = null; + voiceChunksRef.current = []; + clearVoiceTimer(); + releaseVoiceStream(); + setIsVoiceRecording(false); + setVoiceCountdown(voiceMaxSeconds); + if (blob.size > 0) { + void transcribeVoiceBlob(blob); + } + }; + + recorder.start(200); + clearVoiceTimer(); + voiceTimerRef.current = window.setInterval(() => { + setVoiceCountdown((prev) => { + if (prev <= 1) { + stopVoiceRecording(); + return 0; + } + return prev - 1; + }); + }, 1000); + } catch { + 
releaseVoiceStream(); + setIsVoiceRecording(false); + clearVoiceTimer(); + notify(t.voicePermissionDenied, { tone: 'error' }); + } + }; + const onVoiceInput = () => { - notify(t.voiceUnavailable, { tone: 'warning' }); + if (isVoiceTranscribing) return; + if (isVoiceRecording) { + stopVoiceRecording(); + return; + } + void startVoiceRecording(); }; const onPickAttachments = async (event: ChangeEvent) => { @@ -3393,7 +3590,7 @@ export function BotDashboardModule({ value={command} onChange={(e) => setCommand(e.target.value)} onKeyDown={onComposerKeyDown} - disabled={!canChat} + disabled={!canChat || isVoiceRecording || isVoiceTranscribing} placeholder={ canChat ? t.inputPlaceholder @@ -3401,18 +3598,54 @@ export function BotDashboardModule({ } />
- +
+ {Array.from({ length: isCompactMobile ? 1 : 5 }).map((_, segmentIdx) => ( +
+ {Array.from({ length: isCompactMobile ? 28 : 18 }).map((_, idx) => { + const delayIndex = isCompactMobile + ? idx + : (segmentIdx * 18) + idx; + return ( + + ); + })} +
+ ))} +
+
+ {isVoiceRecording ? `${voiceCountdown}s` : t.voiceTranscribing} +
+
+ ) : null} + void (isChatEnabled && (isThinking || isSending) ? interruptExecution() : send())} aria-label={isChatEnabled && (isThinking || isSending) ? t.interrupt : t.send} diff --git a/scripts/deploy-prod.sh b/scripts/deploy-prod.sh index 6c365f7..8c992aa 100755 --- a/scripts/deploy-prod.sh +++ b/scripts/deploy-prod.sh @@ -11,6 +11,7 @@ if [[ ! -f "$ENV_FILE" ]]; then fi echo "[deploy] using env: $ENV_FILE" +docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" config -q docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" up -d --build echo "[deploy] service status" diff --git a/scripts/stop-prod.sh b/scripts/stop-prod.sh index 2e34202..1fca41f 100755 --- a/scripts/stop-prod.sh +++ b/scripts/stop-prod.sh @@ -4,4 +4,9 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" ENV_FILE="${1:-$ROOT_DIR/.env.prod}" +if [[ ! -f "$ENV_FILE" ]]; then + echo "Missing env file: $ENV_FILE" + exit 1 +fi + docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" down