v0.1.4-p1
parent
590eae9f0c
commit
6795fedbfe
|
|
@ -43,3 +43,15 @@ PANEL_ACCESS_PASSWORD=change_me_panel_password
|
||||||
|
|
||||||
# Max upload size for backend validation (MB)
|
# Max upload size for backend validation (MB)
|
||||||
UPLOAD_MAX_MB=200
|
UPLOAD_MAX_MB=200
|
||||||
|
|
||||||
|
# Local speech-to-text (Whisper via whisper.cpp model file)
|
||||||
|
STT_ENABLED=true
|
||||||
|
STT_MODEL=ggml-small-q8_0.bin
|
||||||
|
STT_MODEL_DIR=${HOST_DATA_ROOT}/model
|
||||||
|
STT_DEVICE=cpu
|
||||||
|
STT_MAX_AUDIO_SECONDS=20
|
||||||
|
STT_DEFAULT_LANGUAGE=zh
|
||||||
|
STT_FORCE_SIMPLIFIED=true
|
||||||
|
STT_AUDIO_PREPROCESS=true
|
||||||
|
STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
|
||||||
|
STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,8 @@ graph TD
|
||||||
- 配置绝对路径:
|
- 配置绝对路径:
|
||||||
- `HOST_DATA_ROOT`
|
- `HOST_DATA_ROOT`
|
||||||
- `HOST_BOTS_WORKSPACE_ROOT`
|
- `HOST_BOTS_WORKSPACE_ROOT`
|
||||||
|
- 如启用本地语音识别,请将 Whisper `.bin` 模型文件放到 `${HOST_DATA_ROOT}/model/`
|
||||||
|
并让 `STT_MODEL` 指向完整文件名,例如 `ggml-small-q8_0.bin`
|
||||||
- 中国网络建议配置加速项:
|
- 中国网络建议配置加速项:
|
||||||
- `PIP_INDEX_URL`、`PIP_TRUSTED_HOST`
|
- `PIP_INDEX_URL`、`PIP_TRUSTED_HOST`
|
||||||
- `NPM_REGISTRY`
|
- `NPM_REGISTRY`
|
||||||
|
|
@ -120,3 +122,4 @@ graph TD
|
||||||
- 必须挂载 `/var/run/docker.sock`,否则后端无法操作 Bot 镜像与容器。
|
- 必须挂载 `/var/run/docker.sock`,否则后端无法操作 Bot 镜像与容器。
|
||||||
- `HOST_BOTS_WORKSPACE_ROOT` 必须是宿主机绝对路径,并且在 `docker-compose.prod.yml` 中以“同路径”挂载到后端容器。
|
- `HOST_BOTS_WORKSPACE_ROOT` 必须是宿主机绝对路径,并且在 `docker-compose.prod.yml` 中以“同路径”挂载到后端容器。
|
||||||
原因:后端通过 Docker API 创建 Bot 容器时,使用的是宿主机可见的 bind 路径。
|
原因:后端通过 Docker API 创建 Bot 容器时,使用的是宿主机可见的 bind 路径。
|
||||||
|
- 语音识别当前基于 `pywhispercpp==1.3.1` + Whisper `.bin` 模型文件,不使用 `faster-whisper`。
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,18 @@ PANEL_ACCESS_PASSWORD=
|
||||||
# Max upload size for backend validation (MB)
|
# Max upload size for backend validation (MB)
|
||||||
UPLOAD_MAX_MB=100
|
UPLOAD_MAX_MB=100
|
||||||
|
|
||||||
|
# Local speech-to-text (Whisper via whisper.cpp model file)
|
||||||
|
STT_ENABLED=true
|
||||||
|
STT_MODEL=ggml-small-q8_0.bin
|
||||||
|
STT_MODEL_DIR=../data/model
|
||||||
|
STT_DEVICE=cpu
|
||||||
|
STT_MAX_AUDIO_SECONDS=20
|
||||||
|
STT_DEFAULT_LANGUAGE=zh
|
||||||
|
STT_FORCE_SIMPLIFIED=true
|
||||||
|
STT_AUDIO_PREPROCESS=true
|
||||||
|
STT_AUDIO_FILTER=highpass=f=120,lowpass=f=7600,afftdn=nf=-20
|
||||||
|
STT_INITIAL_PROMPT=以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。
|
||||||
|
|
||||||
# Local backend server options (for `python3 main.py`)
|
# Local backend server options (for `python3 main.py`)
|
||||||
APP_HOST=0.0.0.0
|
APP_HOST=0.0.0.0
|
||||||
APP_PORT=8000
|
APP_PORT=8000
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,9 @@ ARG PIP_TRUSTED_HOST=
|
||||||
COPY backend/requirements.txt ./requirements.txt
|
COPY backend/requirements.txt ./requirements.txt
|
||||||
RUN if [ -n "${PIP_INDEX_URL}" ]; then pip config set global.index-url "${PIP_INDEX_URL}"; fi \
|
RUN if [ -n "${PIP_INDEX_URL}" ]; then pip config set global.index-url "${PIP_INDEX_URL}"; fi \
|
||||||
&& if [ -n "${PIP_TRUSTED_HOST}" ]; then pip config set global.trusted-host "${PIP_TRUSTED_HOST}"; fi \
|
&& if [ -n "${PIP_TRUSTED_HOST}" ]; then pip config set global.trusted-host "${PIP_TRUSTED_HOST}"; fi \
|
||||||
|
&& apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends ffmpeg \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& pip install --upgrade pip \
|
&& pip install --upgrade pip \
|
||||||
&& pip install -r requirements.txt
|
&& pip install -r requirements.txt
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,7 @@ def _normalize_dir_path(path_value: str) -> str:
|
||||||
raw = str(path_value or "").strip()
|
raw = str(path_value or "").strip()
|
||||||
if not raw:
|
if not raw:
|
||||||
return raw
|
return raw
|
||||||
|
raw = os.path.expandvars(os.path.expanduser(raw))
|
||||||
p = Path(raw)
|
p = Path(raw)
|
||||||
if p.is_absolute():
|
if p.is_absolute():
|
||||||
return str(p)
|
return str(p)
|
||||||
|
|
@ -117,6 +118,26 @@ DATABASE_ENGINE: Final[str] = _database_engine(DATABASE_URL)
|
||||||
DATABASE_URL_DISPLAY: Final[str] = _mask_database_url(DATABASE_URL)
|
DATABASE_URL_DISPLAY: Final[str] = _mask_database_url(DATABASE_URL)
|
||||||
DATABASE_ECHO: Final[bool] = _env_bool("DATABASE_ECHO", True)
|
DATABASE_ECHO: Final[bool] = _env_bool("DATABASE_ECHO", True)
|
||||||
UPLOAD_MAX_MB: Final[int] = _env_int("UPLOAD_MAX_MB", 100, 1, 2048)
|
UPLOAD_MAX_MB: Final[int] = _env_int("UPLOAD_MAX_MB", 100, 1, 2048)
|
||||||
|
# --- Local speech-to-text (Whisper via a whisper.cpp model file) ---
STT_ENABLED: Final[bool] = _env_bool("STT_ENABLED", True)
STT_MODEL: Final[str] = str(os.getenv("STT_MODEL") or "ggml-small-q8_0.bin").strip()
_DEFAULT_STT_MODEL_DIR: Final[Path] = (Path(DATA_ROOT) / "model").resolve()
# Treat an unset, empty or whitespace-only STT_MODEL_DIR the same way the other
# STT_* settings treat empty values: fall back to the default directory.
_configured_stt_model_dir = _normalize_dir_path(
    str(os.getenv("STT_MODEL_DIR") or "").strip() or str(_DEFAULT_STT_MODEL_DIR)
)
# If the configured directory is missing but the default one exists, use the
# default. A single assignment keeps the `Final` name declared exactly once.
STT_MODEL_DIR: Final[str] = (
    str(_DEFAULT_STT_MODEL_DIR)
    if _configured_stt_model_dir
    and not Path(_configured_stt_model_dir).exists()
    and _DEFAULT_STT_MODEL_DIR.exists()
    else _configured_stt_model_dir
)
STT_DEVICE: Final[str] = str(os.getenv("STT_DEVICE") or "cpu").strip().lower() or "cpu"
# Clamped by _env_int to the 5..600 second range, defaulting to 20.
STT_MAX_AUDIO_SECONDS: Final[int] = _env_int("STT_MAX_AUDIO_SECONDS", 20, 5, 600)
STT_DEFAULT_LANGUAGE: Final[str] = str(os.getenv("STT_DEFAULT_LANGUAGE") or "zh").strip().lower() or "zh"
STT_FORCE_SIMPLIFIED: Final[bool] = _env_bool("STT_FORCE_SIMPLIFIED", True)
STT_AUDIO_PREPROCESS: Final[bool] = _env_bool("STT_AUDIO_PREPROCESS", True)
# ffmpeg `-af` filter chain applied when audio preprocessing is enabled.
STT_AUDIO_FILTER: Final[str] = str(
    os.getenv("STT_AUDIO_FILTER") or "highpass=f=120,lowpass=f=7600,afftdn=nf=-20"
).strip()
STT_INITIAL_PROMPT: Final[str] = str(
    os.getenv("STT_INITIAL_PROMPT")
    or "以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。"
).strip()
|
||||||
|
|
||||||
REDIS_ENABLED: Final[bool] = _env_bool("REDIS_ENABLED", False)
|
REDIS_ENABLED: Final[bool] = _env_bool("REDIS_ENABLED", False)
|
||||||
REDIS_URL: Final[str] = str(os.getenv("REDIS_URL") or "").strip()
|
REDIS_URL: Final[str] = str(os.getenv("REDIS_URL") or "").strip()
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,259 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from core.settings import (
|
||||||
|
STT_AUDIO_FILTER,
|
||||||
|
STT_AUDIO_PREPROCESS,
|
||||||
|
STT_DEVICE,
|
||||||
|
STT_ENABLED,
|
||||||
|
STT_FORCE_SIMPLIFIED,
|
||||||
|
STT_INITIAL_PROMPT,
|
||||||
|
STT_MAX_AUDIO_SECONDS,
|
||||||
|
STT_MODEL,
|
||||||
|
STT_MODEL_DIR,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechServiceError(RuntimeError):
    """Base class for every error raised by the speech-to-text service."""
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDisabledError(SpeechServiceError):
    """Raised when transcription is requested while STT_ENABLED is false."""
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDurationError(SpeechServiceError):
    """Raised when the audio is longer than STT_MAX_AUDIO_SECONDS allows."""
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperSpeechService:
    """Local speech-to-text service backed by a whisper.cpp model via pywhispercpp.

    The model is loaded lazily by ``_load_model`` and cached on the instance;
    all configuration comes from the ``STT_*`` settings.
    """

    def __init__(self) -> None:
        # Cached pywhispercpp model plus the resolved path it was loaded from.
        self._model: Any = None
        self._model_source: str = ""
        # Name of the backend that produced the cached model (reported to callers).
        self._backend: str = ""
        # Serializes model construction so it happens once per model source.
        self._model_lock = threading.Lock()

    def _resolve_model_source(self) -> str:
        """Return the absolute path of the configured whisper.cpp ``.bin`` model.

        ``STT_MODEL`` is used directly when it contains a path separator;
        otherwise it must be an exact ``.bin`` file name inside ``STT_MODEL_DIR``.

        Raises:
            SpeechServiceError: if the settings are empty, the file or directory
                does not exist, or the file does not have a ``.bin`` suffix.
        """
        model = str(STT_MODEL or "").strip()
        model_dir = str(STT_MODEL_DIR or "").strip()

        if not model:
            raise SpeechServiceError(
                "STT_MODEL is empty. Please set the full model file name, e.g. ggml-small-q8_0.bin."
            )

        # If STT_MODEL itself is an absolute/relative path, use it directly.
        if any(sep in model for sep in ("/", "\\")):
            direct = Path(model).expanduser()
            if not direct.exists() or not direct.is_file():
                raise SpeechServiceError(f"STT model file not found: {direct}")
            if direct.suffix.lower() != ".bin":
                raise SpeechServiceError(
                    "STT_MODEL must point to a whisper.cpp ggml .bin model file."
                )
            return str(direct.resolve())

        # Strict mode: only exact filename, no alias/auto detection.
        if Path(model).suffix.lower() != ".bin":
            raise SpeechServiceError(
                "STT_MODEL must be the exact model file name (with .bin), e.g. ggml-small-q8_0.bin."
            )

        if not model_dir:
            raise SpeechServiceError("STT_MODEL_DIR is empty.")
        root = Path(model_dir).expanduser()
        if not root.exists() or not root.is_dir():
            raise SpeechServiceError(f"STT_MODEL_DIR does not exist: {root}")
        candidate = root / model
        if not candidate.exists() or not candidate.is_file():
            raise SpeechServiceError(
                f"STT model file not found under STT_MODEL_DIR: {candidate}"
            )
        return str(candidate.resolve())

    def _load_model(self) -> Any:
        """Return the cached model, constructing it on first use.

        Raises:
            SpeechServiceError: if the model path cannot be resolved or
                ``pywhispercpp`` cannot be imported.
        """
        model_source = self._resolve_model_source()
        # Fast path without the lock; re-checked under the lock below.
        if self._model is not None and self._model_source == model_source:
            return self._model
        with self._model_lock:
            if self._model is not None and self._model_source == model_source:
                return self._model
            try:
                # Imported lazily so the module loads even without pywhispercpp.
                from pywhispercpp.model import Model  # type: ignore
            except Exception as exc:
                raise SpeechServiceError(
                    "pywhispercpp is not installed in the active backend environment. "
                    "Run pip install -r backend/requirements.txt or rebuild the backend image."
                ) from exc
            self._model = Model(
                model_source,
                print_realtime=False,
                print_progress=False,
            )
            self._backend = "pywhispercpp"
            self._model_source = model_source
            return self._model

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete ``path`` if it exists, ignoring filesystem errors."""
        try:
            if path and os.path.exists(path):
                os.remove(path)
        except OSError:
            # Cleanup is best-effort; a leftover temp file must not fail a request.
            pass

    @staticmethod
    def _preprocess_audio(file_path: str) -> str:
        """Convert the input to 16 kHz mono PCM WAV with ffmpeg, best-effort.

        Returns the path of a new temporary ``.wav`` file on success. Returns the
        original (stripped) path when preprocessing is disabled, the input is
        missing, ffmpeg is not on ``PATH``, or the conversion fails. The caller
        owns the returned temporary file and must delete it.
        """
        target = str(file_path or "").strip()
        if not STT_AUDIO_PREPROCESS or not target or not os.path.isfile(target):
            return target
        if shutil.which("ffmpeg") is None:
            return target

        # Reserve a temp file name; ffmpeg overwrites it because of "-y".
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav", prefix=".speech_clean_")
        tmp_path = tmp.name
        tmp.close()

        cmd = [
            "ffmpeg",
            "-y",
            "-i",
            target,
            "-vn",
            "-ac",
            "1",
            "-ar",
            "16000",
        ]
        audio_filter = str(STT_AUDIO_FILTER or "").strip()
        if audio_filter:
            cmd.extend(["-af", audio_filter])
        cmd.extend(["-c:a", "pcm_s16le", tmp_path])

        try:
            completed = subprocess.run(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
            succeeded = (
                completed.returncode == 0
                and os.path.exists(tmp_path)
                and os.path.getsize(tmp_path) > 0
            )
        except Exception:
            # Deliberately broad: any failure means "use the original audio".
            succeeded = False

        if succeeded:
            return tmp_path
        WhisperSpeechService._remove_quietly(tmp_path)
        return target

    @staticmethod
    def _probe_audio_duration_seconds(file_path: str) -> Optional[float]:
        """Return the audio duration in seconds, or ``None`` if it cannot be read.

        Uses the optional ``av`` package; any failure (including ``av`` not being
        importable) yields ``None``.
        """
        try:
            import av  # type: ignore

            with av.open(file_path) as container:
                if container.duration is not None:
                    # container.duration is in av.time_base units.
                    return max(0.0, float(container.duration / av.time_base))
                for stream in container.streams:
                    if stream.type != "audio":
                        continue
                    if stream.duration is not None and stream.time_base is not None:
                        return max(0.0, float(stream.duration * stream.time_base))
        except Exception:
            return None
        return None

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Strip ``text`` and, when STT_FORCE_SIMPLIFIED is set, convert it with OpenCC ``t2s``.

        Falls back to the stripped input if conversion fails or returns nothing.
        """
        content = str(text or "").strip()
        if not content or not STT_FORCE_SIMPLIFIED:
            return content
        try:
            from opencc_purepy import OpenCC  # type: ignore

            return str(OpenCC("t2s").convert(content) or "").strip() or content
        except Exception:
            return content

    @staticmethod
    def _filter_supported_transcribe_kwargs(model: Any, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Keep only the kwargs whose keys appear in ``model.get_params()``.

        Returns ``kwargs`` unchanged when the model cannot report its parameters.
        """
        try:
            available = set(model.get_params().keys())
        except Exception:
            return kwargs
        return {key: value for key, value in kwargs.items() if key in available}

    def transcribe_file(self, file_path: str, language: Optional[str] = None) -> Dict[str, Any]:
        """Transcribe an audio file and return the text with metadata.

        Args:
            file_path: Path of the audio file to transcribe.
            language: Language code; empty, ``auto``, ``null`` or ``none`` means
                no language is passed to the model.

        Returns:
            A dict with ``text``, ``language``, ``duration_seconds``,
            ``max_audio_seconds``, ``model``, ``device`` and ``backend``.

        Raises:
            SpeechDisabledError: if STT_ENABLED is false.
            SpeechDurationError: if the audio exceeds STT_MAX_AUDIO_SECONDS
                (with a 0.3 s tolerance).
            SpeechServiceError: if the file is missing, the model cannot be
                loaded, transcription fails, or no text is produced.
        """
        if not STT_ENABLED:
            raise SpeechDisabledError("Speech-to-text is disabled")
        target = str(file_path or "").strip()
        if not target or not os.path.isfile(target):
            raise SpeechServiceError("Audio file not found")

        # Reject over-long audio up front when the duration can be probed.
        duration_seconds = self._probe_audio_duration_seconds(target)
        if duration_seconds is not None and duration_seconds > float(STT_MAX_AUDIO_SECONDS) + 0.3:
            raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")

        prepared_target = self._preprocess_audio(target)
        try:
            model = self._load_model()
            lang = str(language or "").strip().lower()
            normalized_lang: Optional[str] = None
            if lang and lang not in {"auto", "null", "none"}:
                normalized_lang = lang

            max_end = 0.0
            detected_language = ""
            texts = []
            kwargs: Dict[str, Any] = {
                "print_realtime": False,
                "print_progress": False,
                "no_context": True,
                "suppress_non_speech_tokens": True,
            }
            if normalized_lang:
                kwargs["language"] = normalized_lang
            initial_prompt = str(STT_INITIAL_PROMPT or "").strip()
            if initial_prompt:
                kwargs["initial_prompt"] = initial_prompt
            kwargs = self._filter_supported_transcribe_kwargs(model, kwargs)
            # NOTE(review): the cached model is shared across calls and no lock is
            # held here — confirm pywhispercpp tolerates concurrent transcribe().
            try:
                segments = model.transcribe(prepared_target, **kwargs)
            except Exception as exc:
                raise SpeechServiceError(
                    f"pywhispercpp transcription failed: {exc}. "
                    "If input is not wav, install ffmpeg in runtime image."
                ) from exc
            for segment in segments:
                txt = str(getattr(segment, "text", "") or "").strip()
                if txt:
                    texts.append(txt)
                if normalized_lang:
                    detected_language = normalized_lang
                try:
                    # t1 is scaled by 1/100 to seconds — presumably 10 ms units;
                    # TODO confirm against the pywhispercpp Segment documentation.
                    max_end = max(max_end, float(getattr(segment, "t1", 0.0) or 0.0) / 100.0)
                except Exception:
                    pass
                # Second line of defence when the duration could not be probed.
                if max_end > float(STT_MAX_AUDIO_SECONDS) + 0.3:
                    raise SpeechDurationError(f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds")

            text = self._normalize_text(" ".join(texts).strip())
            if not text:
                raise SpeechServiceError("No speech detected")

            if duration_seconds is None:
                duration_seconds = max_end if max_end > 0 else None

            return {
                "text": text,
                "language": detected_language or None,
                "duration_seconds": duration_seconds,
                "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
                "model": STT_MODEL,
                "device": STT_DEVICE,
                "backend": self._backend or "unknown",
            }
        finally:
            # Only delete the file this call created, never the caller's input.
            if prepared_target != target:
                self._remove_quietly(prepared_target)
|
||||||
119
backend/main.py
119
backend/main.py
|
|
@ -1,5 +1,6 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
@ -12,7 +13,7 @@ from urllib.parse import unquote
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from fastapi import Depends, FastAPI, File, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
|
from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, WebSocket, WebSocketDisconnect
|
||||||
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
|
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from sqlmodel import Session, select
|
from sqlmodel import Session, select
|
||||||
|
|
@ -21,6 +22,12 @@ from core.config_manager import BotConfigManager
|
||||||
from core.cache import cache
|
from core.cache import cache
|
||||||
from core.database import engine, get_session, init_database
|
from core.database import engine, get_session, init_database
|
||||||
from core.docker_manager import BotDockerManager
|
from core.docker_manager import BotDockerManager
|
||||||
|
from core.speech_service import (
|
||||||
|
SpeechDisabledError,
|
||||||
|
SpeechDurationError,
|
||||||
|
SpeechServiceError,
|
||||||
|
WhisperSpeechService,
|
||||||
|
)
|
||||||
from core.settings import (
|
from core.settings import (
|
||||||
BOTS_WORKSPACE_ROOT,
|
BOTS_WORKSPACE_ROOT,
|
||||||
DATA_ROOT,
|
DATA_ROOT,
|
||||||
|
|
@ -37,11 +44,17 @@ from core.settings import (
|
||||||
REDIS_ENABLED,
|
REDIS_ENABLED,
|
||||||
REDIS_PREFIX,
|
REDIS_PREFIX,
|
||||||
REDIS_URL,
|
REDIS_URL,
|
||||||
|
STT_DEVICE,
|
||||||
|
STT_DEFAULT_LANGUAGE,
|
||||||
|
STT_ENABLED,
|
||||||
|
STT_MAX_AUDIO_SECONDS,
|
||||||
|
STT_MODEL,
|
||||||
UPLOAD_MAX_MB,
|
UPLOAD_MAX_MB,
|
||||||
)
|
)
|
||||||
from models.bot import BotInstance, BotMessage, NanobotImage
|
from models.bot import BotInstance, BotMessage, NanobotImage
|
||||||
|
|
||||||
app = FastAPI(title="Dashboard Nanobot API")
|
app = FastAPI(title="Dashboard Nanobot API")
|
||||||
|
logger = logging.getLogger("dashboard.backend")
|
||||||
|
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
|
|
@ -55,6 +68,7 @@ os.makedirs(DATA_ROOT, exist_ok=True)
|
||||||
|
|
||||||
docker_manager = BotDockerManager(host_data_root=BOTS_WORKSPACE_ROOT)
|
docker_manager = BotDockerManager(host_data_root=BOTS_WORKSPACE_ROOT)
|
||||||
config_manager = BotConfigManager(host_data_root=BOTS_WORKSPACE_ROOT)
|
config_manager = BotConfigManager(host_data_root=BOTS_WORKSPACE_ROOT)
|
||||||
|
speech_service = WhisperSpeechService()
|
||||||
BOT_ID_PATTERN = re.compile(r"^[A-Za-z0-9_]+$")
|
BOT_ID_PATTERN = re.compile(r"^[A-Za-z0-9_]+$")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -501,6 +515,13 @@ def get_system_defaults():
|
||||||
"limits": {
|
"limits": {
|
||||||
"upload_max_mb": UPLOAD_MAX_MB,
|
"upload_max_mb": UPLOAD_MAX_MB,
|
||||||
},
|
},
|
||||||
|
"speech": {
|
||||||
|
"enabled": STT_ENABLED,
|
||||||
|
"model": STT_MODEL,
|
||||||
|
"device": STT_DEVICE,
|
||||||
|
"max_audio_seconds": STT_MAX_AUDIO_SECONDS,
|
||||||
|
"default_language": STT_DEFAULT_LANGUAGE,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -3117,6 +3138,102 @@ async def upload_workspace_files(
|
||||||
return {"bot_id": bot_id, "files": rows}
|
return {"bot_id": bot_id, "files": rows}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/bots/{bot_id}/speech/transcribe")
async def transcribe_bot_speech(
    bot_id: str,
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    session: Session = Depends(get_session),
):
    """Transcribe an uploaded audio clip for a bot and return the recognized text.

    The upload is streamed to a temporary file under DATA_ROOT, transcribed in a
    worker thread, and the temporary file is always removed afterwards.

    Raises:
        HTTPException: 404 if the bot does not exist; 400 if speech recognition
            is disabled, the payload is empty, or transcription fails; 413 if the
            upload exceeds UPLOAD_MAX_MB or the audio is too long; 500 on any
            unexpected error.
    """
    bot = session.get(BotInstance, bot_id)
    if not bot:
        raise HTTPException(status_code=404, detail="Bot not found")
    if not STT_ENABLED:
        raise HTTPException(status_code=400, detail="Speech recognition is disabled")
    if not file:
        raise HTTPException(status_code=400, detail="no audio file uploaded")

    # Only the extension of the client-supplied name is reused, and only as a suffix.
    original_name = str(file.filename or "audio.webm").strip() or "audio.webm"
    safe_name = os.path.basename(original_name).replace("\\", "_").replace("/", "_")
    ext = os.path.splitext(safe_name)[1].strip().lower() or ".webm"
    if len(ext) > 12:
        ext = ".webm"

    # Cap the bytes written to disk so an oversized upload cannot fill DATA_ROOT.
    max_bytes = UPLOAD_MAX_MB * 1024 * 1024
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix=".speech_", dir=DATA_ROOT) as tmp:
            tmp_path = tmp.name
            written = 0
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                written += len(chunk)
                if written > max_bytes:
                    raise HTTPException(
                        status_code=413,
                        detail=f"audio payload exceeds {UPLOAD_MAX_MB} MB",
                    )
                tmp.write(chunk)

        if not tmp_path or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) <= 0:
            raise HTTPException(status_code=400, detail="audio payload is empty")

        resolved_language = str(language or "").strip() or STT_DEFAULT_LANGUAGE
        # Transcription is blocking work; keep it off the event loop.
        result = await asyncio.to_thread(speech_service.transcribe_file, tmp_path, resolved_language)
        text = str(result.get("text") or "").strip()
        if not text:
            raise HTTPException(status_code=400, detail="No speech detected")
        return {
            "bot_id": bot_id,
            "text": text,
            "duration_seconds": result.get("duration_seconds"),
            "max_audio_seconds": STT_MAX_AUDIO_SECONDS,
            "model": STT_MODEL,
            "device": STT_DEVICE,
            "language": result.get("language") or resolved_language,
        }
    except SpeechDisabledError as exc:
        logger.warning(
            "speech transcribe disabled bot_id=%s file=%s language=%s detail=%s",
            bot_id,
            safe_name,
            language,
            exc,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except SpeechDurationError as exc:
        logger.warning(
            "speech transcribe too long bot_id=%s file=%s language=%s max_seconds=%s",
            bot_id,
            safe_name,
            language,
            STT_MAX_AUDIO_SECONDS,
        )
        raise HTTPException(
            status_code=413,
            detail=f"Audio duration exceeds {STT_MAX_AUDIO_SECONDS} seconds",
        ) from exc
    except SpeechServiceError as exc:
        logger.exception(
            "speech transcribe failed bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except HTTPException:
        # Already a well-formed HTTP error (raised above); pass it through.
        raise
    except Exception as exc:
        logger.exception(
            "speech transcribe unexpected error bot_id=%s file=%s language=%s",
            bot_id,
            safe_name,
            language,
        )
        raise HTTPException(status_code=500, detail=f"speech transcription failed: {exc}") from exc
    finally:
        # Best-effort cleanup: never let it mask the real response or error.
        try:
            await file.close()
        except Exception:
            pass
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception:
                pass
|
||||||
|
|
||||||
|
|
||||||
@app.websocket("/ws/monitor/{bot_id}")
|
@app.websocket("/ws/monitor/{bot_id}")
|
||||||
async def websocket_endpoint(websocket: WebSocket, bot_id: str):
|
async def websocket_endpoint(websocket: WebSocket, bot_id: str):
|
||||||
with Session(engine) as session:
|
with Session(engine) as session:
|
||||||
|
|
|
||||||
|
|
@ -15,3 +15,5 @@ watchfiles==0.21.0
|
||||||
urllib3==1.26.18
|
urllib3==1.26.18
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
redis==5.0.8
|
redis==5.0.8
|
||||||
|
opencc-purepy==1.1.0
|
||||||
|
pywhispercpp==1.3.1
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,16 @@ services:
|
||||||
REDIS_PREFIX: ${REDIS_PREFIX:-dashboard_nanobot}
|
REDIS_PREFIX: ${REDIS_PREFIX:-dashboard_nanobot}
|
||||||
REDIS_DEFAULT_TTL: ${REDIS_DEFAULT_TTL:-60}
|
REDIS_DEFAULT_TTL: ${REDIS_DEFAULT_TTL:-60}
|
||||||
PANEL_ACCESS_PASSWORD: ${PANEL_ACCESS_PASSWORD:-}
|
PANEL_ACCESS_PASSWORD: ${PANEL_ACCESS_PASSWORD:-}
|
||||||
|
STT_ENABLED: ${STT_ENABLED:-true}
|
||||||
|
STT_MODEL: ${STT_MODEL:-ggml-small-q8_0.bin}
|
||||||
|
STT_MODEL_DIR: ${STT_MODEL_DIR:-${HOST_DATA_ROOT}/model}
|
||||||
|
STT_DEVICE: ${STT_DEVICE:-cpu}
|
||||||
|
STT_MAX_AUDIO_SECONDS: ${STT_MAX_AUDIO_SECONDS:-20}
|
||||||
|
STT_DEFAULT_LANGUAGE: ${STT_DEFAULT_LANGUAGE:-zh}
|
||||||
|
STT_FORCE_SIMPLIFIED: ${STT_FORCE_SIMPLIFIED:-true}
|
||||||
|
STT_AUDIO_PREPROCESS: ${STT_AUDIO_PREPROCESS:-true}
|
||||||
|
STT_AUDIO_FILTER: ${STT_AUDIO_FILTER:-highpass=f=120,lowpass=f=7600,afftdn=nf=-20}
|
||||||
|
STT_INITIAL_PROMPT: ${STT_INITIAL_PROMPT:-以下内容可能包含简体中文和英文术语。请优先输出简体中文,英文单词、缩写、品牌名和数字保持原文,不要翻译。}
|
||||||
volumes:
|
volumes:
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
- ${HOST_DATA_ROOT}:${HOST_DATA_ROOT}
|
- ${HOST_DATA_ROOT}:${HOST_DATA_ROOT}
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,19 @@ export const dashboardEn = {
|
||||||
copyPromptFail: 'Failed to copy prompt.',
|
copyPromptFail: 'Failed to copy prompt.',
|
||||||
editPromptDone: 'Inserted into composer.',
|
editPromptDone: 'Inserted into composer.',
|
||||||
voiceInput: 'Voice input',
|
voiceInput: 'Voice input',
|
||||||
voiceUnavailable: 'Voice input is not available yet.',
|
textInput: 'Text input',
|
||||||
|
voiceUnavailable: 'Speech recognition is disabled.',
|
||||||
|
voiceUnsupported: 'Your browser does not support audio recording.',
|
||||||
|
voicePermissionDenied: 'Microphone permission denied. Please allow access in browser settings.',
|
||||||
|
voiceRecordFail: 'Audio recording failed. Please retry.',
|
||||||
|
voiceReady: 'Click the mic to start recording',
|
||||||
|
voiceRecording: 'Recording...',
|
||||||
|
voiceTranscribing: 'Transcribing...',
|
||||||
|
voiceStart: 'Start recording',
|
||||||
|
voiceStop: 'Stop recording',
|
||||||
|
voiceTranscribeDone: 'Voice converted to text.',
|
||||||
|
voiceTranscribeEmpty: 'No valid speech detected.',
|
||||||
|
voiceTranscribeFail: 'Speech transcription failed.',
|
||||||
copyReply: 'Copy reply',
|
copyReply: 'Copy reply',
|
||||||
copyReplyDone: 'Reply copied.',
|
copyReplyDone: 'Reply copied.',
|
||||||
copyReplyFail: 'Failed to copy reply.',
|
copyReplyFail: 'Failed to copy reply.',
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,19 @@ export const dashboardZhCn = {
|
||||||
copyPromptFail: '复制指令失败。',
|
copyPromptFail: '复制指令失败。',
|
||||||
editPromptDone: '已填入输入框。',
|
editPromptDone: '已填入输入框。',
|
||||||
voiceInput: '语音输入',
|
voiceInput: '语音输入',
|
||||||
voiceUnavailable: '语音输入暂未接入。',
|
textInput: '文字输入',
|
||||||
|
voiceUnavailable: '语音识别未启用。',
|
||||||
|
voiceUnsupported: '当前浏览器不支持录音。',
|
||||||
|
voicePermissionDenied: '麦克风权限被拒绝,请在浏览器设置中允许访问。',
|
||||||
|
voiceRecordFail: '录音失败,请重试。',
|
||||||
|
voiceReady: '点击麦克风开始录音',
|
||||||
|
voiceRecording: '录音中...',
|
||||||
|
voiceTranscribing: '语音识别中...',
|
||||||
|
voiceStart: '开始录音',
|
||||||
|
voiceStop: '停止录音',
|
||||||
|
voiceTranscribeDone: '语音已转为文本。',
|
||||||
|
voiceTranscribeEmpty: '未识别到有效语音内容。',
|
||||||
|
voiceTranscribeFail: '语音识别失败。',
|
||||||
copyReply: '复制回复',
|
copyReply: '复制回复',
|
||||||
copyReplyDone: '回复已复制。',
|
copyReplyDone: '回复已复制。',
|
||||||
copyReplyFail: '复制回复失败。',
|
copyReplyFail: '复制回复失败。',
|
||||||
|
|
|
||||||
|
|
@ -1085,17 +1085,126 @@
|
||||||
padding: 14px 120px 42px 14px;
|
padding: 14px 120px 42px 14px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.ops-voice-panel {
|
||||||
|
min-height: 96px;
|
||||||
|
border: 1px dashed color-mix(in oklab, var(--line) 72%, var(--brand) 28%);
|
||||||
|
border-radius: 12px;
|
||||||
|
background: color-mix(in oklab, var(--panel) 78%, var(--panel-soft) 22%);
|
||||||
|
padding: 12px 14px 12px 14px;
|
||||||
|
display: grid;
|
||||||
|
align-content: center;
|
||||||
|
gap: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-title {
|
||||||
|
font-size: 13px;
|
||||||
|
font-weight: 700;
|
||||||
|
color: var(--muted);
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave {
|
||||||
|
height: 28px;
|
||||||
|
border-radius: 999px;
|
||||||
|
border: 1px solid color-mix(in oklab, var(--line) 76%, transparent);
|
||||||
|
background: color-mix(in oklab, var(--panel-soft) 78%, var(--panel) 22%);
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
padding: 0 6px;
|
||||||
|
overflow: hidden;
|
||||||
|
flex: 1 1 auto;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave-segment {
|
||||||
|
height: 100%;
|
||||||
|
min-width: 0;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: space-between;
|
||||||
|
gap: 2px;
|
||||||
|
padding: 0 6px;
|
||||||
|
border-radius: 999px;
|
||||||
|
background: color-mix(in oklab, var(--panel) 60%, rgba(255, 255, 255, 0.18) 40%);
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave.is-mobile .ops-voice-wave-segment {
|
||||||
|
flex: 1 1 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave.is-desktop .ops-voice-wave-segment {
|
||||||
|
flex: 1 1 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave-segment i {
|
||||||
|
display: inline-block;
|
||||||
|
width: 3px;
|
||||||
|
min-width: 3px;
|
||||||
|
height: 10px;
|
||||||
|
border-radius: 999px;
|
||||||
|
background: color-mix(in oklab, var(--line) 72%, var(--text) 28%);
|
||||||
|
opacity: 0.72;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave-segment i:nth-child(3n) {
|
||||||
|
height: 14px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave-segment i:nth-child(4n) {
|
||||||
|
height: 18px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave-segment i:nth-child(5n) {
|
||||||
|
height: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave.is-live .ops-voice-wave-segment i {
|
||||||
|
background: color-mix(in oklab, var(--brand) 60%, #8ec3ff 40%);
|
||||||
|
animation: ops-voice-wave 1.05s ease-in-out infinite;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-countdown {
|
||||||
|
flex: 0 0 auto;
|
||||||
|
font-size: 13px;
|
||||||
|
font-weight: 700;
|
||||||
|
color: var(--title);
|
||||||
|
min-width: 44px;
|
||||||
|
text-align: right;
|
||||||
|
}
|
||||||
|
|
||||||
.ops-composer-tools-right {
|
.ops-composer-tools-right {
|
||||||
position: absolute;
|
position: absolute;
|
||||||
bottom: 14px;
|
bottom: 14px;
|
||||||
display: inline-flex;
|
left: 12px;
|
||||||
|
right: 12px;
|
||||||
|
display: flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
|
justify-content: flex-end;
|
||||||
gap: 6px;
|
gap: 6px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.ops-composer-tools-right {
|
.ops-composer-tools-right {
|
||||||
right: 12px;
|
width: auto;
|
||||||
max-width: calc(100% - 24px);
|
}
|
||||||
|
|
||||||
|
.ops-voice-inline {
|
||||||
|
min-width: 0;
|
||||||
|
flex: 1 1 auto;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 8px;
|
||||||
|
margin-right: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 720px) {
|
||||||
|
.ops-voice-wave {
|
||||||
|
gap: 4px;
|
||||||
|
padding: 0 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ops-voice-wave-segment {
|
||||||
|
padding: 0 4px;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
.ops-composer-inline-btn {
|
.ops-composer-inline-btn {
|
||||||
|
|
@ -1116,6 +1225,11 @@
|
||||||
color: var(--icon);
|
color: var(--icon);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.ops-composer-inline-btn.is-active {
|
||||||
|
background: color-mix(in oklab, var(--brand-soft) 42%, var(--panel) 58%);
|
||||||
|
color: var(--brand);
|
||||||
|
}
|
||||||
|
|
||||||
.ops-composer-submit-btn {
|
.ops-composer-submit-btn {
|
||||||
width: 34px;
|
width: 34px;
|
||||||
height: 34px;
|
height: 34px;
|
||||||
|
|
@ -1224,6 +1338,17 @@
|
||||||
100% { transform: translateX(430%); }
|
100% { transform: translateX(430%); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@keyframes ops-voice-wave {
|
||||||
|
0%, 100% {
|
||||||
|
transform: scaleY(0.55);
|
||||||
|
opacity: 0.35;
|
||||||
|
}
|
||||||
|
50% {
|
||||||
|
transform: scaleY(1.95);
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
.ops-pending-chip {
|
.ops-pending-chip {
|
||||||
display: inline-flex;
|
display: inline-flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
|
|
|
||||||
|
|
@ -249,6 +249,12 @@ interface SystemDefaultsResponse {
|
||||||
limits?: {
|
limits?: {
|
||||||
upload_max_mb?: number;
|
upload_max_mb?: number;
|
||||||
};
|
};
|
||||||
|
speech?: {
|
||||||
|
enabled?: boolean;
|
||||||
|
model?: string;
|
||||||
|
device?: string;
|
||||||
|
max_audio_seconds?: number;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
type BotEnvParams = Record<string, string>;
|
type BotEnvParams = Record<string, string>;
|
||||||
|
|
@ -719,6 +725,11 @@ export function BotDashboardModule({
|
||||||
const fileNotPreviewableLabel = locale === 'zh' ? '当前文件类型不支持预览' : 'This file type is not previewable';
|
const fileNotPreviewableLabel = locale === 'zh' ? '当前文件类型不支持预览' : 'This file type is not previewable';
|
||||||
const [selectedBotId, setSelectedBotId] = useState('');
|
const [selectedBotId, setSelectedBotId] = useState('');
|
||||||
const [command, setCommand] = useState('');
|
const [command, setCommand] = useState('');
|
||||||
|
const [speechEnabled, setSpeechEnabled] = useState(true);
|
||||||
|
const [voiceMaxSeconds, setVoiceMaxSeconds] = useState(20);
|
||||||
|
const [isVoiceRecording, setIsVoiceRecording] = useState(false);
|
||||||
|
const [isVoiceTranscribing, setIsVoiceTranscribing] = useState(false);
|
||||||
|
const [voiceCountdown, setVoiceCountdown] = useState(20);
|
||||||
const [isSaving, setIsSaving] = useState(false);
|
const [isSaving, setIsSaving] = useState(false);
|
||||||
const [showBaseModal, setShowBaseModal] = useState(false);
|
const [showBaseModal, setShowBaseModal] = useState(false);
|
||||||
const [showParamModal, setShowParamModal] = useState(false);
|
const [showParamModal, setShowParamModal] = useState(false);
|
||||||
|
|
@ -798,6 +809,10 @@ export function BotDashboardModule({
|
||||||
const [feedbackSavingByMessageId, setFeedbackSavingByMessageId] = useState<Record<number, boolean>>({});
|
const [feedbackSavingByMessageId, setFeedbackSavingByMessageId] = useState<Record<number, boolean>>({});
|
||||||
const [showRuntimeActionModal, setShowRuntimeActionModal] = useState(false);
|
const [showRuntimeActionModal, setShowRuntimeActionModal] = useState(false);
|
||||||
const [workspaceHoverCard, setWorkspaceHoverCard] = useState<WorkspaceHoverCardState | null>(null);
|
const [workspaceHoverCard, setWorkspaceHoverCard] = useState<WorkspaceHoverCardState | null>(null);
|
||||||
|
const voiceRecorderRef = useRef<MediaRecorder | null>(null);
|
||||||
|
const voiceStreamRef = useRef<MediaStream | null>(null);
|
||||||
|
const voiceChunksRef = useRef<BlobPart[]>([]);
|
||||||
|
const voiceTimerRef = useRef<number | null>(null);
|
||||||
const runtimeMenuRef = useRef<HTMLDivElement | null>(null);
|
const runtimeMenuRef = useRef<HTMLDivElement | null>(null);
|
||||||
const botOrderRef = useRef<Record<string, number>>({});
|
const botOrderRef = useRef<Record<string, number>>({});
|
||||||
const nextBotOrderRef = useRef(1);
|
const nextBotOrderRef = useRef(1);
|
||||||
|
|
@ -1544,16 +1559,36 @@ export function BotDashboardModule({
|
||||||
persistComposerDraft(selectedBotId, command, pendingAttachments);
|
persistComposerDraft(selectedBotId, command, pendingAttachments);
|
||||||
}, [selectedBotId, composerDraftHydrated, command, pendingAttachments]);
|
}, [selectedBotId, composerDraftHydrated, command, pendingAttachments]);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
return () => {
|
||||||
|
clearVoiceTimer();
|
||||||
|
try {
|
||||||
|
if (voiceRecorderRef.current && voiceRecorderRef.current.state !== 'inactive') {
|
||||||
|
voiceRecorderRef.current.stop();
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
releaseVoiceStream();
|
||||||
|
};
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (!isVoiceRecording && !isVoiceTranscribing) {
|
||||||
|
setVoiceCountdown(voiceMaxSeconds);
|
||||||
|
}
|
||||||
|
}, [voiceMaxSeconds, isVoiceRecording, isVoiceTranscribing]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
const hasDraft = Boolean(String(command || '').trim()) || pendingAttachments.length > 0 || Boolean(quotedReply);
|
const hasDraft = Boolean(String(command || '').trim()) || pendingAttachments.length > 0 || Boolean(quotedReply);
|
||||||
if (!hasDraft && !isUploadingAttachments) return;
|
if (!hasDraft && !isUploadingAttachments && !isVoiceRecording && !isVoiceTranscribing) return;
|
||||||
const onBeforeUnload = (event: BeforeUnloadEvent) => {
|
const onBeforeUnload = (event: BeforeUnloadEvent) => {
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
event.returnValue = '';
|
event.returnValue = '';
|
||||||
};
|
};
|
||||||
window.addEventListener('beforeunload', onBeforeUnload);
|
window.addEventListener('beforeunload', onBeforeUnload);
|
||||||
return () => window.removeEventListener('beforeunload', onBeforeUnload);
|
return () => window.removeEventListener('beforeunload', onBeforeUnload);
|
||||||
}, [command, pendingAttachments.length, quotedReply, isUploadingAttachments]);
|
}, [command, pendingAttachments.length, quotedReply, isUploadingAttachments, isVoiceRecording, isVoiceTranscribing]);
|
||||||
|
|
||||||
const syncChatScrollToBottom = useCallback((behavior: ScrollBehavior = 'auto') => {
|
const syncChatScrollToBottom = useCallback((behavior: ScrollBehavior = 'auto') => {
|
||||||
const box = chatScrollRef.current;
|
const box = chatScrollRef.current;
|
||||||
|
|
@ -1580,6 +1615,9 @@ export function BotDashboardModule({
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
setQuotedReply(null);
|
setQuotedReply(null);
|
||||||
|
if (isVoiceRecording) {
|
||||||
|
stopVoiceRecording();
|
||||||
|
}
|
||||||
}, [selectedBotId]);
|
}, [selectedBotId]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
|
|
@ -1637,9 +1675,21 @@ export function BotDashboardModule({
|
||||||
const loadSystemDefaults = async () => {
|
const loadSystemDefaults = async () => {
|
||||||
try {
|
try {
|
||||||
const res = await axios.get<SystemDefaultsResponse>(`${APP_ENDPOINTS.apiBase}/system/defaults`);
|
const res = await axios.get<SystemDefaultsResponse>(`${APP_ENDPOINTS.apiBase}/system/defaults`);
|
||||||
|
if (!alive) return;
|
||||||
const configured = Number(res.data?.limits?.upload_max_mb);
|
const configured = Number(res.data?.limits?.upload_max_mb);
|
||||||
if (!Number.isFinite(configured) || configured <= 0 || !alive) return;
|
if (Number.isFinite(configured) && configured > 0) {
|
||||||
setUploadMaxMb(Math.max(1, Math.floor(configured)));
|
setUploadMaxMb(Math.max(1, Math.floor(configured)));
|
||||||
|
}
|
||||||
|
const speechEnabledRaw = res.data?.speech?.enabled;
|
||||||
|
if (typeof speechEnabledRaw === 'boolean') {
|
||||||
|
setSpeechEnabled(speechEnabledRaw);
|
||||||
|
}
|
||||||
|
const speechSeconds = Number(res.data?.speech?.max_audio_seconds);
|
||||||
|
if (Number.isFinite(speechSeconds) && speechSeconds > 0) {
|
||||||
|
const normalized = Math.max(5, Math.floor(speechSeconds));
|
||||||
|
setVoiceMaxSeconds(normalized);
|
||||||
|
setVoiceCountdown(normalized);
|
||||||
|
}
|
||||||
} catch {
|
} catch {
|
||||||
// keep default limit
|
// keep default limit
|
||||||
}
|
}
|
||||||
|
|
@ -2642,8 +2692,155 @@ export function BotDashboardModule({
|
||||||
filePickerRef.current?.click();
|
filePickerRef.current?.click();
|
||||||
};
|
};
|
||||||
|
|
||||||
const onVoiceInput = () => {
|
const clearVoiceTimer = () => {
|
||||||
|
if (voiceTimerRef.current) {
|
||||||
|
window.clearInterval(voiceTimerRef.current);
|
||||||
|
voiceTimerRef.current = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const releaseVoiceStream = () => {
|
||||||
|
if (voiceStreamRef.current) {
|
||||||
|
voiceStreamRef.current.getTracks().forEach((track) => {
|
||||||
|
try {
|
||||||
|
track.stop();
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
});
|
||||||
|
voiceStreamRef.current = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const transcribeVoiceBlob = async (blob: Blob) => {
|
||||||
|
if (!selectedBot || blob.size <= 0) return;
|
||||||
|
setIsVoiceTranscribing(true);
|
||||||
|
try {
|
||||||
|
const mime = String(blob.type || '').toLowerCase();
|
||||||
|
const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm';
|
||||||
|
const file = new File([blob], `voice-input-${Date.now()}.${ext}`, { type: blob.type || 'audio/webm' });
|
||||||
|
const formData = new FormData();
|
||||||
|
formData.append('file', file);
|
||||||
|
formData.append('language', 'zh');
|
||||||
|
const res = await axios.post<{ text?: string }>(
|
||||||
|
`${APP_ENDPOINTS.apiBase}/bots/${selectedBot.id}/speech/transcribe`,
|
||||||
|
formData,
|
||||||
|
{ timeout: 120000 },
|
||||||
|
);
|
||||||
|
const text = normalizeUserMessageText(String(res.data?.text || ''));
|
||||||
|
if (!text) {
|
||||||
|
notify(t.voiceTranscribeEmpty, { tone: 'warning' });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
setCommand((prev) => {
|
||||||
|
const base = String(prev || '').trim();
|
||||||
|
if (!base) return text;
|
||||||
|
return `${base}\n${text}`;
|
||||||
|
});
|
||||||
|
window.requestAnimationFrame(() => composerTextareaRef.current?.focus());
|
||||||
|
notify(t.voiceTranscribeDone, { tone: 'success' });
|
||||||
|
} catch (error: any) {
|
||||||
|
const msg = String(error?.response?.data?.detail || '').trim();
|
||||||
|
console.error('Speech transcription failed', {
|
||||||
|
botId: selectedBot.id,
|
||||||
|
message: msg || t.voiceTranscribeFail,
|
||||||
|
status: error?.response?.status,
|
||||||
|
response: error?.response?.data,
|
||||||
|
error,
|
||||||
|
});
|
||||||
|
notify(msg || t.voiceTranscribeFail, { tone: 'error' });
|
||||||
|
} finally {
|
||||||
|
setIsVoiceTranscribing(false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const stopVoiceRecording = () => {
|
||||||
|
const recorder = voiceRecorderRef.current;
|
||||||
|
if (!recorder || recorder.state === 'inactive') return;
|
||||||
|
try {
|
||||||
|
recorder.stop();
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const startVoiceRecording = async () => {
|
||||||
|
if (!selectedBot || !canChat || isVoiceTranscribing) return;
|
||||||
|
if (!speechEnabled) {
|
||||||
notify(t.voiceUnavailable, { tone: 'warning' });
|
notify(t.voiceUnavailable, { tone: 'warning' });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (typeof window === 'undefined' || typeof navigator === 'undefined' || !navigator.mediaDevices?.getUserMedia) {
|
||||||
|
notify(t.voiceUnsupported, { tone: 'error' });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (typeof MediaRecorder === 'undefined') {
|
||||||
|
notify(t.voiceUnsupported, { tone: 'error' });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
|
const mimeCandidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4'];
|
||||||
|
const supportedMime = mimeCandidates.find((candidate) => MediaRecorder.isTypeSupported(candidate));
|
||||||
|
const recorder = supportedMime
|
||||||
|
? new MediaRecorder(stream, { mimeType: supportedMime })
|
||||||
|
: new MediaRecorder(stream);
|
||||||
|
voiceStreamRef.current = stream;
|
||||||
|
voiceRecorderRef.current = recorder;
|
||||||
|
voiceChunksRef.current = [];
|
||||||
|
setVoiceCountdown(voiceMaxSeconds);
|
||||||
|
setIsVoiceRecording(true);
|
||||||
|
|
||||||
|
recorder.ondataavailable = (event: BlobEvent) => {
|
||||||
|
if (event.data && event.data.size > 0) {
|
||||||
|
voiceChunksRef.current.push(event.data);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
recorder.onerror = () => {
|
||||||
|
setIsVoiceRecording(false);
|
||||||
|
clearVoiceTimer();
|
||||||
|
releaseVoiceStream();
|
||||||
|
notify(t.voiceRecordFail, { tone: 'error' });
|
||||||
|
};
|
||||||
|
recorder.onstop = () => {
|
||||||
|
const blob = new Blob(voiceChunksRef.current, { type: supportedMime || recorder.mimeType || 'audio/webm' });
|
||||||
|
voiceRecorderRef.current = null;
|
||||||
|
voiceChunksRef.current = [];
|
||||||
|
clearVoiceTimer();
|
||||||
|
releaseVoiceStream();
|
||||||
|
setIsVoiceRecording(false);
|
||||||
|
setVoiceCountdown(voiceMaxSeconds);
|
||||||
|
if (blob.size > 0) {
|
||||||
|
void transcribeVoiceBlob(blob);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
recorder.start(200);
|
||||||
|
clearVoiceTimer();
|
||||||
|
voiceTimerRef.current = window.setInterval(() => {
|
||||||
|
setVoiceCountdown((prev) => {
|
||||||
|
if (prev <= 1) {
|
||||||
|
stopVoiceRecording();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return prev - 1;
|
||||||
|
});
|
||||||
|
}, 1000);
|
||||||
|
} catch {
|
||||||
|
releaseVoiceStream();
|
||||||
|
setIsVoiceRecording(false);
|
||||||
|
clearVoiceTimer();
|
||||||
|
notify(t.voicePermissionDenied, { tone: 'error' });
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const onVoiceInput = () => {
|
||||||
|
if (isVoiceTranscribing) return;
|
||||||
|
if (isVoiceRecording) {
|
||||||
|
stopVoiceRecording();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
void startVoiceRecording();
|
||||||
};
|
};
|
||||||
|
|
||||||
const onPickAttachments = async (event: ChangeEvent<HTMLInputElement>) => {
|
const onPickAttachments = async (event: ChangeEvent<HTMLInputElement>) => {
|
||||||
|
|
@ -3393,7 +3590,7 @@ export function BotDashboardModule({
|
||||||
value={command}
|
value={command}
|
||||||
onChange={(e) => setCommand(e.target.value)}
|
onChange={(e) => setCommand(e.target.value)}
|
||||||
onKeyDown={onComposerKeyDown}
|
onKeyDown={onComposerKeyDown}
|
||||||
disabled={!canChat}
|
disabled={!canChat || isVoiceRecording || isVoiceTranscribing}
|
||||||
placeholder={
|
placeholder={
|
||||||
canChat
|
canChat
|
||||||
? t.inputPlaceholder
|
? t.inputPlaceholder
|
||||||
|
|
@ -3401,18 +3598,54 @@ export function BotDashboardModule({
|
||||||
}
|
}
|
||||||
/>
|
/>
|
||||||
<div className="ops-composer-tools-right">
|
<div className="ops-composer-tools-right">
|
||||||
<LucentIconButton
|
{(isVoiceRecording || isVoiceTranscribing) ? (
|
||||||
className="ops-composer-inline-btn"
|
<div className="ops-voice-inline" aria-live="polite">
|
||||||
disabled={!canChat}
|
<div className={`ops-voice-wave ${isVoiceRecording ? 'is-live' : ''} ${isCompactMobile ? 'is-mobile' : 'is-desktop'}`}>
|
||||||
|
{Array.from({ length: isCompactMobile ? 1 : 5 }).map((_, segmentIdx) => (
|
||||||
|
<div key={`vw-segment-${segmentIdx}`} className="ops-voice-wave-segment">
|
||||||
|
{Array.from({ length: isCompactMobile ? 28 : 18 }).map((_, idx) => {
|
||||||
|
const delayIndex = isCompactMobile
|
||||||
|
? idx
|
||||||
|
: (segmentIdx * 18) + idx;
|
||||||
|
return (
|
||||||
|
<i
|
||||||
|
key={`vw-inline-${segmentIdx}-${idx}`}
|
||||||
|
style={{ animationDelay: `${(delayIndex % 14) * 0.06}s` }}
|
||||||
|
/>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
<div className="ops-voice-countdown mono">
|
||||||
|
{isVoiceRecording ? `${voiceCountdown}s` : t.voiceTranscribing}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : null}
|
||||||
|
<button
|
||||||
|
className={`ops-composer-inline-btn ${isVoiceRecording ? 'is-recording' : ''}`}
|
||||||
|
disabled={!canChat || isVoiceTranscribing || (!speechEnabled && !isVoiceRecording)}
|
||||||
onClick={onVoiceInput}
|
onClick={onVoiceInput}
|
||||||
tooltip={t.voiceInput}
|
aria-label={isVoiceRecording ? t.voiceStop : t.voiceStart}
|
||||||
aria-label={t.voiceInput}
|
title={
|
||||||
|
isVoiceTranscribing
|
||||||
|
? t.voiceTranscribing
|
||||||
|
: isVoiceRecording
|
||||||
|
? t.voiceStop
|
||||||
|
: t.voiceStart
|
||||||
|
}
|
||||||
>
|
>
|
||||||
|
{isVoiceTranscribing ? (
|
||||||
|
<RefreshCw size={16} className="animate-spin" />
|
||||||
|
) : isVoiceRecording ? (
|
||||||
|
<Square size={16} />
|
||||||
|
) : (
|
||||||
<Mic size={16} />
|
<Mic size={16} />
|
||||||
</LucentIconButton>
|
)}
|
||||||
|
</button>
|
||||||
<LucentIconButton
|
<LucentIconButton
|
||||||
className="ops-composer-inline-btn"
|
className="ops-composer-inline-btn"
|
||||||
disabled={!canChat || isUploadingAttachments}
|
disabled={!canChat || isUploadingAttachments || isVoiceRecording || isVoiceTranscribing}
|
||||||
onClick={triggerPickAttachments}
|
onClick={triggerPickAttachments}
|
||||||
tooltip={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
|
tooltip={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
|
||||||
aria-label={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
|
aria-label={isUploadingAttachments ? t.uploadingFile : t.uploadFile}
|
||||||
|
|
@ -3424,7 +3657,12 @@ export function BotDashboardModule({
|
||||||
disabled={
|
disabled={
|
||||||
isChatEnabled && (isThinking || isSending)
|
isChatEnabled && (isThinking || isSending)
|
||||||
? Boolean(interruptingByBot[selectedBot.id])
|
? Boolean(interruptingByBot[selectedBot.id])
|
||||||
: (!isChatEnabled || (!command.trim() && pendingAttachments.length === 0 && !quotedReply))
|
: (
|
||||||
|
!isChatEnabled
|
||||||
|
|| isVoiceRecording
|
||||||
|
|| isVoiceTranscribing
|
||||||
|
|| (!command.trim() && pendingAttachments.length === 0 && !quotedReply)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
onClick={() => void (isChatEnabled && (isThinking || isSending) ? interruptExecution() : send())}
|
onClick={() => void (isChatEnabled && (isThinking || isSending) ? interruptExecution() : send())}
|
||||||
aria-label={isChatEnabled && (isThinking || isSending) ? t.interrupt : t.send}
|
aria-label={isChatEnabled && (isThinking || isSending) ? t.interrupt : t.send}
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ if [[ ! -f "$ENV_FILE" ]]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[deploy] using env: $ENV_FILE"
|
echo "[deploy] using env: $ENV_FILE"
|
||||||
|
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" config -q
|
||||||
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" up -d --build
|
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" up -d --build
|
||||||
|
|
||||||
echo "[deploy] service status"
|
echo "[deploy] service status"
|
||||||
|
|
|
||||||
|
|
@ -4,4 +4,9 @@ set -euo pipefail
|
||||||
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
|
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
ENV_FILE="${1:-$ROOT_DIR/.env.prod}"
|
ENV_FILE="${1:-$ROOT_DIR/.env.prod}"
|
||||||
|
|
||||||
|
if [[ ! -f "$ENV_FILE" ]]; then
|
||||||
|
echo "Missing env file: $ENV_FILE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" down
|
docker compose --env-file "$ENV_FILE" -f "$ROOT_DIR/docker-compose.prod.yml" down
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue