v0.1.4-p1

main
mula.liu 2026-03-12 01:20:57 +08:00
parent 590eae9f0c
commit 6795fedbfe
15 changed files with 853 additions and 21 deletions

View File

@ -43,3 +43,15 @@ PANEL_ACCESS_PASSWORD=change_me_panel_password
# Max upload size for backend validation (MB)
UPLOAD_MAX_MB=200
# Local speech-to-text (Whisper via whisper.cpp model file)
STT_ENABLED=true
STT_MODEL=ggml-small-q8_0.bin
STT_MODEL_DIR=${HOST_DATA_ROOT}/model
STT_DEVICE=cpu
STT_MAX_AUDIO_SECONDS=20
STT_DEFAULT_LANGUAGE=zh
STT_FORCE_SIMPLIFIED=true
STT_AUDIO_PREPROCESS=true
STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。

View File

@ -104,6 +104,8 @@ graph TD
- 配置绝对路径:
- `HOST_DATA_ROOT`
- `HOST_BOTS_WORKSPACE_ROOT`
- 如启用本地语音识别,请将 Whisper `.bin` 模型文件放到 `${HOST_DATA_ROOT}/model/`
并让 `STT_MODEL` 指向完整文件名,例如 `ggml-small-q8_0.bin`
- 中国网络建议配置加速项:
- `PIP_INDEX_URL`、`PIP_TRUSTED_HOST`
- `NPM_REGISTRY`
@ -120,3 +122,4 @@ graph TD
- 必须挂载 `/var/run/docker.sock`,否则后端无法操作 Bot 镜像与容器。
- `HOST_BOTS_WORKSPACE_ROOT` 必须是宿主机绝对路径,并且在 `docker-compose.prod.yml` 中以“同路径”挂载到后端容器。
原因:后端通过 Docker API 创建 Bot 容器时,使用的是宿主机可见的 bind 路径。
- 语音识别当前基于 `pywhispercpp==1.3.1` + Whisper `.bin` 模型文件,不使用 `faster-whisper`

View File

@ -27,6 +27,18 @@ PANEL_ACCESS_PASSWORD=
# Max upload size for backend validation (MB)
UPLOAD_MAX_MB=100
# Local speech-to-text (Whisper via whisper.cpp model file)
STT_ENABLED=true
STT_MODEL=ggml-small-q8_0.bin
STT_MODEL_DIR=../data/model
STT_DEVICE=cpu
STT_MAX_AUDIO_SECONDS=20
STT_DEFAULT_LANGUAGE=zh
STT_FORCE_SIMPLIFIED=true
STT_AUDIO_PREPROCESS=true
STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。
# Local backend server options (for `python3 main.py`)
APP_HOST=0.0.0.0
APP_PORT=8000

View File

@ -13,6 +13,9 @@ ARG PIP_TRUSTED_HOST=
COPY backend/requirements.txt ./requirements.txt
RUN if [ -n "${PIP_INDEX_URL}" ]; then pip config set global.index-url "${PIP_INDEX_URL}"; fi \
&& if [ -n "${PIP_TRUSTED_HOST}" ]; then pip config set global.trusted-host "${PIP_TRUSTED_HOST}"; fi \
&& apt-get update \
&& apt-get install -y --no-install-recommends ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& pip install --upgrade pip \
&& pip install -r requirements.txt

View File

@ -48,6 +48,7 @@ def _normalize_dir_path(path_value: str) -> str:
raw = str(path_value or "").strip()
if not raw:
return raw
raw = os.path.expandvars(os.path.expanduser(raw))
p = Path(raw)
if p.is_absolute():
return str(p)
@ -117,6 +118,26 @@ DATABASE_ENGINE: Final[str] = _database_engine(DATABASE_URL)
DATABASE_URL_DISPLAY: Final[str] = _mask_database_url(DATABASE_URL)
DATABASE_ECHO: Final[bool] = _env_bool("DATABASE_ECHO", True)
UPLOAD_MAX_MB: Final[int] = _env_int("UPLOAD_MAX_MB", 100, 1, 2048)
STT_ENABLED: Final[bool] = _env_bool("STT_ENABLED", True)
STT_MODEL: Final[str] = str(os.getenv("STT_MODEL") or "ggml-small-q8_0.bin").strip()
_DEFAULT_STT_MODEL_DIR: Final[Path] = (Path(DATA_ROOT) / "model").resolve()
_configured_stt_model_dir = _normalize_dir_path(os.getenv("STT_MODEL_DIR", str(_DEFAULT_STT_MODEL_DIR)))
if _configured_stt_model_dir and not Path(_configured_stt_model_dir).exists() and _DEFAULT_STT_MODEL_DIR.exists():
STT_MODEL_DIR: Final[str] = str(_DEFAULT_STT_MODEL_DIR)
else:
STT_MODEL_DIR: Final[str] = _configured_stt_model_dir
STT_DEVICE: Final[str] = str(os.getenv("STT_DEVICE") or "cpu").strip().lower() or "cpu"
STT_MAX_AUDIO_SECONDS: Final[int] = _env_int("STT_MAX_AUDIO_SECONDS", 20, 5, 600)
STT_DEFAULT_LANGUAGE: Final[str] = str(os.getenv("STT_DEFAULT_LANGUAGE") or "zh").strip().lower() or "zh"
STT_FORCE_SIMPLIFIED: Final[bool] = _env_bool("STT_FORCE_SIMPLIFIED", True)
STT_AUDIO_PREPROCESS: Final[bool] = _env_bool("STT_AUDIO_PREPROCESS", True)
STT_AUDIO_FILTER: Final[str] = str(
os.getenv("STT_AUDIO_FILTER") or "highpass=f=120,lowpass=f=7600,afftdn=nf=-20"
).strip()
STT_INITIAL_PROMPT: Final[str] = str(
os.getenv("STT_INITIAL_PROMPT")
or "以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。"
).strip()
REDIS_ENABLED: Final[bool] = _env_bool("REDIS_ENABLED", False)
REDIS_URL: Final[str] = str(os.getenv("REDIS_URL") or "").strip()

View File

@ -0,0 +1,259 @@
from __future__ import annotations
import os
import shutil
import subprocess
import tempfile
import threading
from pathlib import Path
from typing import Any, Dict, Optional
from core.settings import (
STT_AUDIO_FILTER,
STT_AUDIO_PREPROCESS,
STT_DEVICE,
STT_ENABLED,
STT_FORCE_SIMPLIFIED,
STT_INITIAL_PROMPT,
STT_MAX_AUDIO_SECONDS,
STT_MODEL,
STT_MODEL_DIR,
)
class SpeechServiceError(RuntimeError):
    """Base error raised by the local speech-to-text service."""
class SpeechDisabledError(SpeechServiceError):
    """Raised when speech-to-text is turned off via settings (STT_ENABLED)."""
class SpeechDurationError(SpeechServiceError):
    """Raised when the uploaded audio exceeds the allowed duration."""
class WhisperSpeechService:
    """Local speech-to-text backed by whisper.cpp via ``pywhispercpp``.

    The whisper ``.bin`` model configured through ``STT_MODEL`` /
    ``STT_MODEL_DIR`` is resolved and loaded lazily on the first
    transcription, then cached until the configured path changes.
    """

    def __init__(self) -> None:
        # Cached pywhispercpp Model instance and the absolute path it was
        # loaded from; a settings change to a different file forces a reload.
        self._model: Any = None
        self._model_source: str = ""
        self._backend: str = ""
        self._model_lock = threading.Lock()

    def _resolve_model_source(self) -> str:
        """Return the absolute path of the configured whisper model file.

        Raises:
            SpeechServiceError: configuration is empty, the file is missing,
                or it is not a whisper.cpp ggml ``.bin`` model.
        """
        model = str(STT_MODEL or "").strip()
        model_dir = str(STT_MODEL_DIR or "").strip()
        if not model:
            # Fix: example filename previously had a typo ("ggml-samll-...").
            raise SpeechServiceError(
                "STT_MODEL is empty. Please set the full model file name, e.g. ggml-small-q8_0.bin."
            )
        # If STT_MODEL itself is an absolute/relative path, use it directly.
        if any(sep in model for sep in ("/", "\\")):
            direct = Path(model).expanduser()
            if not direct.exists() or not direct.is_file():
                raise SpeechServiceError(f"STT model file not found: {direct}")
            if direct.suffix.lower() != ".bin":
                raise SpeechServiceError(
                    "STT_MODEL must point to a whisper.cpp ggml .bin model file."
                )
            return str(direct.resolve())
        # Strict mode: only exact filename, no alias/auto detection.
        if Path(model).suffix.lower() != ".bin":
            raise SpeechServiceError(
                "STT_MODEL must be the exact model file name (with .bin), e.g. ggml-small-q8_0.bin."
            )
        if not model_dir:
            raise SpeechServiceError("STT_MODEL_DIR is empty.")
        root = Path(model_dir).expanduser()
        if not root.exists() or not root.is_dir():
            raise SpeechServiceError(f"STT_MODEL_DIR does not exist: {root}")
        candidate = root / model
        if not candidate.exists() or not candidate.is_file():
            raise SpeechServiceError(
                f"STT model file not found under STT_MODEL_DIR: {candidate}"
            )
        return str(candidate.resolve())

    def _load_model(self) -> Any:
        """Load (or return the cached) pywhispercpp model for the configured path."""
        model_source = self._resolve_model_source()
        # Fast path without the lock; re-checked under the lock below
        # (double-checked locking) so concurrent callers load only once.
        if self._model is not None and self._model_source == model_source:
            return self._model
        with self._model_lock:
            if self._model is not None and self._model_source == model_source:
                return self._model
            try:
                from pywhispercpp.model import Model  # type: ignore
            except Exception as exc:
                raise SpeechServiceError(
                    "pywhispercpp is not installed in the active backend environment. "
                    "Run pip install -r backend/requirements.txt or rebuild the backend image."
                ) from exc
            self._model = Model(
                model_source,
                print_realtime=False,
                print_progress=False,
            )
            self._backend = "pywhispercpp"
            self._model_source = model_source
            return self._model

    @staticmethod
    def _preprocess_audio(file_path: str) -> str:
        """Optionally clean the audio with ffmpeg (mono, 16 kHz wav + filters).

        Returns the path of a temporary cleaned wav file, or the original path
        when preprocessing is disabled, ffmpeg is missing, or conversion fails.
        The caller is responsible for deleting the temporary file.
        """
        target = str(file_path or "").strip()
        if not STT_AUDIO_PREPROCESS or not target or not os.path.isfile(target):
            return target
        if shutil.which("ffmpeg") is None:
            # Best effort: fall back to the raw upload when ffmpeg is absent.
            return target
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav", prefix=".speech_clean_")
        tmp_path = tmp.name
        tmp.close()
        cmd = [
            "ffmpeg",
            "-y",
            "-i",
            target,
            "-vn",   # drop any video stream
            "-ac",
            "1",     # mono
            "-ar",
            "16000",
        ]
        audio_filter = str(STT_AUDIO_FILTER or "").strip()
        if audio_filter:
            cmd.extend(["-af", audio_filter])
        cmd.extend(["-c:a", "pcm_s16le", tmp_path])
        try:
            completed = subprocess.run(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
            if completed.returncode != 0 or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
                return target
            return tmp_path
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            return target

    @staticmethod
    def _probe_audio_duration_seconds(file_path: str) -> Optional[float]:
        """Best-effort duration probe via PyAV; returns None when unavailable."""
        try:
            import av  # type: ignore

            with av.open(file_path) as container:
                if container.duration is not None:
                    # container.duration is in av.time_base units.
                    return max(0.0, float(container.duration / av.time_base))
                for stream in container.streams:
                    if stream.type != "audio":
                        continue
                    if stream.duration is not None and stream.time_base is not None:
                        return max(0.0, float(stream.duration * stream.time_base))
        except Exception:
            # PyAV missing or file unreadable: duration is simply unknown.
            return None
        return None

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Strip the text and, when enabled, convert Traditional to Simplified Chinese."""
        content = str(text or "").strip()
        if not content or not STT_FORCE_SIMPLIFIED:
            return content
        try:
            from opencc_purepy import OpenCC  # type: ignore

            return str(OpenCC("t2s").convert(content) or "").strip() or content
        except Exception:
            # opencc is optional; fall back to the raw transcription.
            return content

    @staticmethod
    def _filter_supported_transcribe_kwargs(model: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Drop kwargs the loaded model does not advertise via ``get_params()``."""
        try:
            available = set(model.get_params().keys())
        except Exception:
            # If introspection fails, pass everything through unchanged.
            return kwargs
        return {key: value for key, value in kwargs.items() if key in available}

    def transcribe_file(self, file_path: str, language: Optional[str] = None) -> Dict[str, Any]:
        """Transcribe an audio file and return the text plus metadata.

        Parameters:
            file_path: path to the audio file on disk.
            language: language hint; empty/"auto"/"null"/"none" means auto-detect.

        Returns:
            dict with ``text``, ``language``, ``duration_seconds``,
            ``max_audio_seconds``, ``model``, ``device`` and ``backend``.

        Raises:
            SpeechDisabledError: STT is disabled via settings.
            SpeechDurationError: audio exceeds ``STT_MAX_AUDIO_SECONDS``.
            SpeechServiceError: missing file/model, or transcription failure.
        """
        if not STT_ENABLED:
            raise SpeechDisabledError("Speech-to-text is disabled")
        target = str(file_path or "").strip()
        if not target or not os.path.isfile(target):
            raise SpeechServiceError("Audio file not found")
        duration_seconds = self._probe_audio_duration_seconds(target)
        # 0.3 s of slack tolerates container rounding right at the limit.
        if duration_seconds is not None and duration_seconds > float(STT_MAX_AUDIO_SECONDS) + 0.3:
            raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")
        prepared_target = self._preprocess_audio(target)
        try:
            model = self._load_model()
            lang = str(language or "").strip().lower()
            normalized_lang: Optional[str] = None
            if lang and lang not in {"auto", "null", "none"}:
                normalized_lang = lang
            max_end = 0.0
            detected_language = ""
            texts = []
            kwargs: Dict[str, Any] = {
                "print_realtime": False,
                "print_progress": False,
                "no_context": True,
                "suppress_non_speech_tokens": True,
            }
            if normalized_lang:
                kwargs["language"] = normalized_lang
            initial_prompt = str(STT_INITIAL_PROMPT or "").strip()
            if initial_prompt:
                kwargs["initial_prompt"] = initial_prompt
            kwargs = self._filter_supported_transcribe_kwargs(model, kwargs)
            try:
                segments = model.transcribe(prepared_target, **kwargs)
            except Exception as exc:
                raise SpeechServiceError(
                    f"pywhispercpp transcription failed: {exc}. "
                    "If input is not wav, install ffmpeg in runtime image."
                ) from exc
            for segment in segments:
                txt = str(getattr(segment, "text", "") or "").strip()
                if txt:
                    texts.append(txt)
                if normalized_lang:
                    detected_language = normalized_lang
                try:
                    # segment.t1 looks like the segment end in centiseconds
                    # (hence the /100) — matches pywhispercpp's Segment type.
                    max_end = max(max_end, float(getattr(segment, "t1", 0.0) or 0.0) / 100.0)
                except Exception:
                    pass
            # Re-check the limit using decoded segment timestamps, covering
            # files whose container metadata carried no duration.
            if max_end > float(STT_MAX_AUDIO_SECONDS) + 0.3:
                raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")
            text = self._normalize_text(" ".join(texts).strip())
            if not text:
                raise SpeechServiceError("No speech detected")
            if duration_seconds is None:
                duration_seconds = max_end if max_end > 0 else None
            return {
                "text": text,
                "language": detected_language or None,
                "duration_seconds": duration_seconds,
                "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
                "model": STT_MODEL,
                "device": STT_DEVICE,
                "backend": self._backend or "unknown",
            }
        finally:
            # Remove the temporary preprocessed wav, if one was created.
            if prepared_target != target and os.path.exists(prepared_target):
                try:
                    os.remove(prepared_target)
                except Exception:
                    pass

View File

@ -1,5 +1,6 @@
import asyncio
import json
import logging
import mimetypes
import os
import re
@ -12,7 +13,7 @@ from urllib.parse import unquote
import httpx
from pydantic import BaseModel
from fastapi import Depends, FastAPI, File, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from sqlmodel import Session, select
@ -21,6 +22,12 @@ from core.config_manager import BotConfigManager
from core.cache import cache
from core.database import engine, get_session, init_database
from core.docker_manager import BotDockerManager
from core.speech_service import (
SpeechDisabledError,
SpeechDurationError,
SpeechServiceError,
WhisperSpeechService,
)
from core.settings import (
BOTS_WORKSPACE_ROOT,
DATA_ROOT,
@ -37,11 +44,17 @@ from core.settings import (
REDIS_ENABLED,
REDIS_PREFIX,
REDIS_URL,
STT_DEVICE,
STT_DEFAULT_LANGUAGE,
STT_ENABLED,
STT_MAX_AUDIO_SECONDS,
STT_MODEL,
UPLOAD_MAX_MB,
)
from models.bot import BotInstance, BotMessage, NanobotImage
app = FastAPI(title="Dashboard Nanobot API")
logger = logging.getLogger("dashboard.backend")
app.add_middleware(
CORSMiddleware,
@ -55,6 +68,7 @@ os.makedirs(DATA_ROOT, exist_ok=True)
docker_manager = BotDockerManager(host_data_root=BOTS_WORKSPACE_ROOT)
config_manager = BotConfigManager(host_data_root=BOTS_WORKSPACE_ROOT)
speech_service = WhisperSpeechService()
BOT_ID_PATTERN = re.compile(r"^[A-Za-z0-9_]+$")
@ -501,6 +515,13 @@ def get_system_defaults():
"limits": {
"upload_max_mb": UPLOAD_MAX_MB,
},
"speech": {
"enabled": STT_ENABLED,
"model": STT_MODEL,
"device": STT_DEVICE,
"max_audio_seconds": STT_MAX_AUDIO_SECONDS,
"default_language": STT_DEFAULT_LANGUAGE,
},
}
@ -3117,6 +3138,102 @@ async def upload_workspace_files(
return {"bot_id": bot_id, "files": rows}
@app.post("/api/bots/{bot_id}/speech/transcribe")
async def transcribe_bot_speech(
    bot_id: str,
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    session: Session = Depends(get_session),
):
    """Transcribe an uploaded audio clip for a bot via the local Whisper service."""
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")
    if not STT_ENABLED:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not file:
        raise HTTPException(status_code=400, detail="no audio file uploaded")
    # Sanitize the client-supplied filename; only its extension is reused.
    raw_name = str(file.filename or "audio.webm").strip() or "audio.webm"
    safe_name = os.path.basename(raw_name).replace("\\", "_").replace("/", "_")
    ext = os.path.splitext(safe_name)[1].strip().lower() or ".webm"
    if len(ext) > 12:
        ext = ".webm"
    tmp_path = ""
    try:
        # Stream the upload to a temp file under DATA_ROOT, 1 MiB at a time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix=".speech_", dir=DATA_ROOT) as tmp:
            tmp_path = tmp.name
            while chunk := await file.read(1024 * 1024):
                tmp.write(chunk)
        if not tmp_path or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
            raise HTTPException(status_code=400, detail="audio payload is empty")
        resolved_language = str(language or "").strip() or STT_DEFAULT_LANGUAGE
        # Whisper inference is CPU-bound; keep it off the event loop.
        result = await asyncio.to_thread(speech_service.transcribe_file, tmp_path, resolved_language)
        text = str(result.get("text") or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No speech detected")
        return {
            "bot_id": bot_id,
            "text": text,
            "duration_seconds": result.get("duration_seconds"),
            "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
            "model": STT_MODEL,
            "device": STT_DEVICE,
            "language": result.get("language") or resolved_language,
        }
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            safe_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc))
    except SpeechDurationError:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            safe_name,
            language,
            STT_MAX_AUDIO_SECONDS,
        )
        raise HTTPException(status_code=413, detail=f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")
    except SpeechServiceError as exc:
        logger.exception(
            "speech transcribe failed bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=400, detail=str(exc))
    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}")
    finally:
        # Always close the upload and delete the temp file, success or not.
        try:
            await file.close()
        except Exception:
            pass
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception:
                pass
@app.websocket("/ws/monitor/{bot_id}")
async def websocket_endpoint(websocket: WebSocket, bot_id: str):
with Session(engine) as session:

View File

@ -15,3 +15,5 @@ watchfiles==0.21.0
urllib3==1.26.18
requests==2.31.0
redis==5.0.8
opencc-purepy==1.1.0
pywhispercpp==1.3.1

View File

@ -24,6 +24,16 @@ services:
REDIS_PREFIX: ${REDIS_PREFIX:-dashboard_nanobot}
REDIS_DEFAULT_TTL: ${REDIS_DEFAULT_TTL:-60}
PANEL_ACCESS_PASSWORD: ${PANEL_ACCESS_PASSWORD:-}
STT_ENABLED: ${STT_ENABLED:-true}
STT_MODEL: ${STT_MODEL:-ggml-small-q8_0.bin}
STT_MODEL_DIR: ${STT_MODEL_DIR:-${HOST_DATA_ROOT}/model}
STT_DEVICE: ${STT_DEVICE:-cpu}
STT_MAX_AUDIO_SECONDS: ${STT_MAX_AUDIO_SECONDS:-20}
STT_DEFAULT_LANGUAGE: ${STT_DEFAULT_LANGUAGE:-zh}
STT_FORCE_SIMPLIFIED: ${STT_FORCE_SIMPLIFIED:-true}
STT_AUDIO_PREPROCESS: ${STT_AUDIO_PREPROCESS:-true}
STT_AUDIO_FILTER: ${STT_AUDIO_FILTER:-highpass=f=120,lowpass=f=7600,afftdn=nf=-20}
STT_INITIAL_PROMPT: ${STT_INITIAL_PROMPT:-以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- ${HOST_DATA_ROOT}:${HOST_DATA_ROOT}

View File

@ -28,7 +28,19 @@ export const dashboardEn = {
copyPromptFail: 'Failed to copy prompt.',
editPromptDone: 'Inserted into composer.',
voiceInput: 'Voice input',
voiceUnavailable: 'Voice input is not available yet.',
textInput: 'Text input',
voiceUnavailable: 'Speech recognition is disabled.',
voiceUnsupported: 'Your browser does not support audio recording.',
voicePermissionDenied: 'Microphone permission denied. Please allow access in browser settings.',
voiceRecordFail: 'Audio recording failed. Please retry.',
voiceReady: 'Click the mic to start recording',
voiceRecording: 'Recording...',
voiceTranscribing: 'Transcribing...',
voiceStart: 'Start recording',
voiceStop: 'Stop recording',
voiceTranscribeDone: 'Voice converted to text.',
voiceTranscribeEmpty: 'No valid speech detected.',
voiceTranscribeFail: 'Speech transcription failed.',
copyReply: 'Copy reply',
copyReplyDone: 'Reply copied.',
copyReplyFail: 'Failed to copy reply.',

View File

@ -28,7 +28,19 @@ export const dashboardZhCn = {
copyPromptFail: '复制指令失败。',
editPromptDone: '已填入输入框。',
voiceInput: '语音输入',
voiceUnavailable: '语音输入暂未接入。',
textInput: '文字输入',
voiceUnavailable: '语音识别未启用。',
voiceUnsupported: '当前浏览器不支持录音。',
voicePermissionDenied: '麦克风权限被拒绝,请在浏览器设置中允许访问。',
voiceRecordFail: '录音失败,请重试。',
voiceReady: '点击麦克风开始录音',
voiceRecording: '录音中...',
voiceTranscribing: '语音识别中...',
voiceStart: '开始录音',
voiceStop: '停止录音',
voiceTranscribeDone: '语音已转为文本。',
voiceTranscribeEmpty: '未识别到有效语音内容。',
voiceTranscribeFail: '语音识别失败。',
copyReply: '复制回复',
copyReplyDone: '回复已复制。',
copyReplyFail: '复制回复失败。',

View File

@ -1085,17 +1085,126 @@
padding: 14px 120px 42px 14px;
}
.ops-voice-panel {
min-height: 96px;
border: 1px dashed color-mix(in oklab, var(--line) 72%, var(--brand) 28%);
border-radius: 12px;
background: color-mix(in oklab, var(--panel) 78%, var(--panel-soft) 22%);
padding: 12px 14px 12px 14px;
display: grid;
align-content: center;
gap: 10px;
}
.ops-voice-title {
font-size: 13px;
font-weight: 700;
color: var(--muted);
}
.ops-voice-wave {
height: 28px;
border-radius: 999px;
border: 1px solid color-mix(in oklab, var(--line) 76%, transparent);
background: color-mix(in oklab, var(--panel-soft) 78%, var(--panel) 22%);
display: flex;
align-items: center;
gap: 8px;
padding: 0 6px;
overflow: hidden;
flex: 1 1 auto;
min-width: 0;
}
.ops-voice-wave-segment {
height: 100%;
min-width: 0;
display: flex;
align-items: center;
justify-content: space-between;
gap: 2px;
padding: 0 6px;
border-radius: 999px;
background: color-mix(in oklab, var(--panel) 60%, rgba(255, 255, 255, 0.18) 40%);
}
.ops-voice-wave.is-mobile .ops-voice-wave-segment {
flex: 1 1 auto;
}
.ops-voice-wave.is-desktop .ops-voice-wave-segment {
flex: 1 1 0;
}
.ops-voice-wave-segment i {
display: inline-block;
width: 3px;
min-width: 3px;
height: 10px;
border-radius: 999px;
background: color-mix(in oklab, var(--line) 72%, var(--text) 28%);
opacity: 0.72;
}
.ops-voice-wave-segment i:nth-child(3n) {
height: 14px;
}
.ops-voice-wave-segment i:nth-child(4n) {
height: 18px;
}
.ops-voice-wave-segment i:nth-child(5n) {
height: 12px;
}
.ops-voice-wave.is-live .ops-voice-wave-segment i {
background: color-mix(in oklab, var(--brand) 60%, #8ec3ff 40%);
animation: ops-voice-wave 1.05s ease-in-out infinite;
}
.ops-voice-countdown {
flex: 0 0 auto;
font-size: 13px;
font-weight: 700;
color: var(--title);
min-width: 44px;
text-align: right;
}
/* Toolbar anchored to the bottom-right of the composer.
   Fix: removed the dead `display: inline-flex;` declaration that was
   immediately overridden by `display: flex;` in the same rule. */
.ops-composer-tools-right {
  position: absolute;
  bottom: 14px;
  left: 12px;
  right: 12px;
  display: flex;
  align-items: center;
  justify-content: flex-end;
  gap: 6px;
}
.ops-composer-tools-right {
right: 12px;
max-width: calc(100% - 24px);
width: auto;
}
.ops-voice-inline {
min-width: 0;
flex: 1 1 auto;
display: flex;
align-items: center;
gap: 8px;
margin-right: 4px;
}
@media (max-width: 720px) {
.ops-voice-wave {
gap: 4px;
padding: 0 4px;
}
.ops-voice-wave-segment {
padding: 0 4px;
}
}
.ops-composer-inline-btn {
@ -1116,6 +1225,11 @@
color: var(--icon);
}
.ops-composer-inline-btn.is-active {
background: color-mix(in oklab, var(--brand-soft) 42%, var(--panel) 58%);
color: var(--brand);
}
.ops-composer-submit-btn {
width: 34px;
height: 34px;
@ -1224,6 +1338,17 @@
100% { transform: translateX(430%); }
}
@keyframes ops-voice-wave {
0%, 100% {
transform: scaleY(0.55);
opacity: 0.35;
}
50% {
transform: scaleY(1.95);
opacity: 1;
}
}
.ops-pending-chip {
display: inline-flex;
align-items: center;

View File

@ -249,6 +249,12 @@ interface SystemDefaultsResponse {
limits?: {
upload_max_mb?: number;
};
speech?: {
enabled?: boolean;
model?: string;
device?: string;
max_audio_seconds?: number;
};
}
type BotEnvParams = Record<string, string>;
@ -719,6 +725,11 @@ export function BotDashboardModule({
const fileNotPreviewableLabel = locale === 'zh' ? '当前文件类型不支持预览' : 'This file type is not previewable';
const [selectedBotId, setSelectedBotId] = useState('');
const [command, setCommand] = useState('');
const [speechEnabled, setSpeechEnabled] = useState(true);
const [voiceMaxSeconds, setVoiceMaxSeconds] = useState(20);
const [isVoiceRecording, setIsVoiceRecording] = useState(false);
const [isVoiceTranscribing, setIsVoiceTranscribing] = useState(false);
const [voiceCountdown, setVoiceCountdown] = useState(20);
const [isSaving, setIsSaving] = useState(false);
const [showBaseModal, setShowBaseModal] = useState(false);
const [showParamModal, setShowParamModal] = useState(false);
@ -798,6 +809,10 @@ export function BotDashboardModule({
const [feedbackSavingByMessageId, setFeedbackSavingByMessageId] = useState<Record<number, boolean>>({});
const [showRuntimeActionModal, setShowRuntimeActionModal] = useState(false);
const [workspaceHoverCard, setWorkspaceHoverCard] = useState<WorkspaceHoverCardState | null>(null);
const voiceRecorderRef = useRef<MediaRecorder | null>(null);
const voiceStreamRef = useRef<MediaStream | null>(null);
const voiceChunksRef = useRef<BlobPart[]>([]);
const voiceTimerRef = useRef<number | null>(null);
const runtimeMenuRef = useRef<HTMLDivElement | null>(null);
const botOrderRef = useRef<Record<string, number>>({});
const nextBotOrderRef = useRef(1);
@ -1544,16 +1559,36 @@ export function BotDashboardModule({
persistComposerDraft(selectedBotId, command, pendingAttachments);
}, [selectedBotId, composerDraftHydrated, command, pendingAttachments]);
useEffect(() => {
return () => {
clearVoiceTimer();
try {
if (voiceRecorderRef.current && voiceRecorderRef.current.state !== 'inactive') {
voiceRecorderRef.current.stop();
}
} catch {
// ignore
}
releaseVoiceStream();
};
}, []);
useEffect(() => {
if (!isVoiceRecording && !isVoiceTranscribing) {
setVoiceCountdown(voiceMaxSeconds);
}
}, [voiceMaxSeconds, isVoiceRecording, isVoiceTranscribing]);
useEffect(() => {
const hasDraft = Boolean(String(command || '').trim()) || pendingAttachments.length > 0 || Boolean(quotedReply);
if (!hasDraft && !isUploadingAttachments) return;
if (!hasDraft && !isUploadingAttachments && !isVoiceRecording && !isVoiceTranscribing) return;
const onBeforeUnload = (event: BeforeUnloadEvent) => {
event.preventDefault();
event.returnValue = '';
};
window.addEventListener('beforeunload', onBeforeUnload);
return () => window.removeEventListener('beforeunload', onBeforeUnload);
}, [command, pendingAttachments.length, quotedReply, isUploadingAttachments]);
}, [command, pendingAttachments.length, quotedReply, isUploadingAttachments, isVoiceRecording, isVoiceTranscribing]);
const syncChatScrollToBottom = useCallback((behavior: ScrollBehavior = 'auto') => {
const box = chatScrollRef.current;
@ -1580,6 +1615,9 @@ export function BotDashboardModule({
useEffect(() => {
setQuotedReply(null);
if (isVoiceRecording) {
stopVoiceRecording();
}
}, [selectedBotId]);
useEffect(() => {
@ -1637,9 +1675,21 @@ export function BotDashboardModule({
const loadSystemDefaults = async () => {
try {
const res = await axios.get<SystemDefaultsResponse>(`${APP_ENDPOINTS.apiBase}/system/defaults`);
if (!alive) return;
const configured = Number(res.data?.limits?.upload_max_mb);
if (!Number.isFinite(configured) || configured <= 0 || !alive) return;
setUploadMaxMb(Math.max(1, Math.floor(configured)));
if (Number.isFinite(configured) && configured > 0) {
setUploadMaxMb(Math.max(1, Math.floor(configured)));
}
const speechEnabledRaw = res.data?.speech?.enabled;
if (typeof speechEnabledRaw === 'boolean') {
setSpeechEnabled(speechEnabledRaw);
}
const speechSeconds = Number(res.data?.speech?.max_audio_seconds);
if (Number.isFinite(speechSeconds) && speechSeconds > 0) {
const normalized = Math.max(5, Math.floor(speechSeconds));
setVoiceMaxSeconds(normalized);
setVoiceCountdown(normalized);
}
} catch {
// keep default limit
}
@ -2642,8 +2692,155 @@ export function BotDashboardModule({
filePickerRef.current?.click();
};
// Stop the 1 s recording countdown interval, if one is running.
const clearVoiceTimer = () => {
  const timerId = voiceTimerRef.current;
  if (timerId) {
    window.clearInterval(timerId);
    voiceTimerRef.current = null;
  }
};
// Stop every microphone track and drop the stream reference so the
// browser's "recording" indicator turns off.
const releaseVoiceStream = () => {
  const stream = voiceStreamRef.current;
  if (!stream) return;
  for (const track of stream.getTracks()) {
    try {
      track.stop();
    } catch {
      // ignore
    }
  }
  voiceStreamRef.current = null;
};
// Upload a recorded audio blob to the backend STT endpoint and append the
// recognized text to the composer draft.
const transcribeVoiceBlob = async (blob: Blob) => {
  if (!selectedBot || blob.size <= 0) return;
  setIsVoiceTranscribing(true);
  try {
    // Derive a file extension from the blob MIME type (webm by default).
    const mime = String(blob.type || '').toLowerCase();
    const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm';
    const file = new File([blob], `voice-input-${Date.now()}.${ext}`, { type: blob.type || 'audio/webm' });
    const formData = new FormData();
    formData.append('file', file);
    // NOTE(review): language is hard-coded to 'zh' here even though the
    // backend accepts other hints — confirm this is intentional.
    formData.append('language', 'zh');
    const res = await axios.post<{ text?: string }>(
      `${APP_ENDPOINTS.apiBase}/bots/${selectedBot.id}/speech/transcribe`,
      formData,
      { timeout: 120000 },
    );
    const text = normalizeUserMessageText(String(res.data?.text || ''));
    if (!text) {
      notify(t.voiceTranscribeEmpty, { tone: 'warning' });
      return;
    }
    // Append to any existing draft on a new line instead of replacing it.
    setCommand((prev) => {
      const base = String(prev || '').trim();
      if (!base) return text;
      return `${base}\n${text}`;
    });
    window.requestAnimationFrame(() => composerTextareaRef.current?.focus());
    notify(t.voiceTranscribeDone, { tone: 'success' });
  } catch (error: any) {
    // Prefer the backend's `detail` message when one is available.
    const msg = String(error?.response?.data?.detail || '').trim();
    console.error('Speech transcription failed', {
      botId: selectedBot.id,
      message: msg || t.voiceTranscribeFail,
      status: error?.response?.status,
      response: error?.response?.data,
      error,
    });
    notify(msg || t.voiceTranscribeFail, { tone: 'error' });
  } finally {
    setIsVoiceTranscribing(false);
  }
};
// Ask the active MediaRecorder to stop; its onstop handler performs the
// actual teardown and kicks off transcription.
const stopVoiceRecording = () => {
  const active = voiceRecorderRef.current;
  if (active && active.state !== 'inactive') {
    try {
      active.stop();
    } catch {
      // ignore
    }
  }
};
// Request microphone access, start a MediaRecorder and a per-second
// countdown that auto-stops at the configured limit.
const startVoiceRecording = async () => {
  // Guard: need a selected bot, chat enabled, and no transcription in flight.
  if (!selectedBot || !canChat || isVoiceTranscribing) return;
  if (!speechEnabled) {
    notify(t.voiceUnavailable, { tone: 'warning' });
    return;
  }
  // Feature detection: getUserMedia and MediaRecorder must both exist.
  if (typeof window === 'undefined' || typeof navigator === 'undefined' || !navigator.mediaDevices?.getUserMedia) {
    notify(t.voiceUnsupported, { tone: 'error' });
    return;
  }
  if (typeof MediaRecorder === 'undefined') {
    notify(t.voiceUnsupported, { tone: 'error' });
    return;
  }
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Prefer opus/webm; fall back to whatever the browser supports.
    const mimeCandidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4'];
    const supportedMime = mimeCandidates.find((candidate) => MediaRecorder.isTypeSupported(candidate));
    const recorder = supportedMime
      ? new MediaRecorder(stream, { mimeType: supportedMime })
      : new MediaRecorder(stream);
    voiceStreamRef.current = stream;
    voiceRecorderRef.current = recorder;
    voiceChunksRef.current = [];
    setVoiceCountdown(voiceMaxSeconds);
    setIsVoiceRecording(true);
    recorder.ondataavailable = (event: BlobEvent) => {
      if (event.data && event.data.size > 0) {
        voiceChunksRef.current.push(event.data);
      }
    };
    recorder.onerror = () => {
      // Recording failed mid-way: reset UI state and release the mic.
      setIsVoiceRecording(false);
      clearVoiceTimer();
      releaseVoiceStream();
      notify(t.voiceRecordFail, { tone: 'error' });
    };
    recorder.onstop = () => {
      // Assemble the captured chunks, tear everything down, then transcribe.
      const blob = new Blob(voiceChunksRef.current, { type: supportedMime || recorder.mimeType || 'audio/webm' });
      voiceRecorderRef.current = null;
      voiceChunksRef.current = [];
      clearVoiceTimer();
      releaseVoiceStream();
      setIsVoiceRecording(false);
      setVoiceCountdown(voiceMaxSeconds);
      if (blob.size > 0) {
        void transcribeVoiceBlob(blob);
      }
    };
    // Emit a dataavailable event every 200 ms while recording.
    recorder.start(200);
    clearVoiceTimer();
    // Tick the countdown once per second; auto-stop at the limit.
    voiceTimerRef.current = window.setInterval(() => {
      setVoiceCountdown((prev) => {
        if (prev <= 1) {
          stopVoiceRecording();
          return 0;
        }
        return prev - 1;
      });
    }, 1000);
  } catch {
    // Most likely the user denied microphone permission.
    releaseVoiceStream();
    setIsVoiceRecording(false);
    clearVoiceTimer();
    notify(t.voicePermissionDenied, { tone: 'error' });
  }
};
// Mic button handler: toggles between starting and stopping a recording;
// ignored while a transcription request is in flight.
const onVoiceInput = () => {
  if (isVoiceTranscribing) return;
  if (isVoiceRecording) {
    stopVoiceRecording();
  } else {
    void startVoiceRecording();
  }
};
const onPickAttachments = async (event: ChangeEvent<HTMLInputElement>) => {
@ -3393,7 +3590,7 @@ export function BotDashboardModule({
value={command}
onChange={(e) => setCommand(e.target.value)}
onKeyDown={onComposerKeyDown}
disabled={!canChat}
disabled={!canChat || isVoiceRecording || isVoiceTranscribing}
placeholder={
canChat
? t.inputPlaceholder
@ -3401,18 +3598,54 @@ export function BotDashboardModule({
}
/>
<div className="ops-composer-tools-right">
<LucentIconButton
className="ops-composer-inline-btn"
disabled={!canChat}
{(isVoiceRecording || isVoiceTranscribing) ? (
<div className="ops-voice-inline" aria-live="polite">
<div className={`ops-voice-wave ${isVoiceRecording ? 'is-live' : ''} ${isCompactMobile ? 'is-mobile' : 'is-desktop'}`}>
{Array.from({ length: isCompactMobile ? 1 : 5 }).map((_, segmentIdx) => (
<div key={`vw-segment-${segmentIdx}`} className="ops-voice-wave-segment">
{Array.from({ length: isCompactMobile ? 28 : 18 }).map((_, idx) => {
const delayIndex = isCompactMobile
? idx
: (segmentIdx * 18) + idx;
return (
<i
key={`vw-inline-${segmentIdx}-${idx}`}
style={{ animationDelay: `${(delayIndex % 14) * 0.06}s` }}
/>
);
})}
</div>
))}
</div>
<div className="ops-voice-countdown mono">
{isVoiceRecording ? `${voiceCountdown}s` : t.voiceTranscribing}
</div>
</div>
) : null}
<button
className={`ops-composer-inline-btn ${isVoiceRecording ? 'is-recording' : ''}`}
disabled={!canChat || isVoiceTranscribing || (!speechEnabled && !isVoiceRecording)}
onClick={onVoiceInput}
tooltip={t.voiceInput}
aria-label={t.voiceInput}
aria-label={isVoiceRecording ? t.voiceStop : t.voiceStart}
title={
isVoiceTranscribing
? t.voiceTranscribing
: isVoiceRecording
? t.voiceStop
: t.voiceStart
}
>
<Mic size={16} />
</LucentIconButton>
{isVoiceTranscribing ? (
<RefreshCw size={16} className="animate-spin" />
) : isVoiceRecording ? (
<Square size={16} />
) : (
<Mic size={16} />
)}
</button>
<LucentIconButton
className="ops-composer-inline-btn"
disabled={!canChat || isUploadingAttachments}
disabled={!canChat || isUploadingAttachments || isVoiceRecording || isVoiceTranscribing}
onClick={triggerPickAttachments}
tooltip={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
aria-label={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
@ -3424,7 +3657,12 @@ export function BotDashboardModule({
disabled={
isChatEnabled && (isThinking || isSending)
? Boolean(interruptingByBot[selectedBot.id])
: (!isChatEnabled || (!command.trim() && pendingAttachments.length === 0 && !quotedReply))
: (
!isChatEnabled
|| isVoiceRecording
|| isVoiceTranscribing
|| (!command.trim() && pendingAttachments.length === 0 && !quotedReply)
)
}
onClick={() => void (isChatEnabled && (isThinking || isSending) ? interruptExecution() : send())}
aria-label={isChatEnabled && (isThinking || isSending) ? t.interrupt : t.send}

View File

@ -11,6 +11,7 @@ if [[ ! -f "$ENV_FILE" ]]; then
fi
echo "[deploy] using env: $ENV_FILE"
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" config -q
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" up -d --build
echo "[deploy] service status"

View File

@ -4,4 +4,9 @@ set -euo pipefail
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
ENV_FILE="${1:-$ROOT_DIR/.env.prod}"
if [[ ! -f "$ENV_FILE" ]]; then
echo "Missing env file: $ENV_FILE"
exit 1
fi
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" down