diff --git a/.DS_Store b/.DS_Store index 8ef9355..408a8dd 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/app.zip b/app.zip index 69aec27..139faa1 100644 Binary files a/app.zip and b/app.zip differ diff --git a/app/api/endpoints/meetings.py b/app/api/endpoints/meetings.py index 860720e..4e88c65 100644 --- a/app/api/endpoints/meetings.py +++ b/app/api/endpoints/meetings.py @@ -415,13 +415,13 @@ async def upload_audio(audio_file: UploadFile = File(...), meeting_id: int = For with get_db_connection() as connection: cursor = connection.cursor(dictionary=True) if replaced_existing and force_replace_bool: - cursor.execute("DELETE FROM transcript_segments WHERE meeting_id = %s", (meeting_id,)) - cursor.execute("DELETE FROM transcript_tasks WHERE meeting_id = %s", (meeting_id,)) + # 只删除旧的音频文件,转录数据由 start_transcription 统一处理 if existing_info and existing_info['file_path']: old_file_path = BASE_DIR / existing_info['file_path'].lstrip('/') if old_file_path.exists(): try: os.remove(old_file_path) + print(f"Deleted old audio file: {old_file_path}") except Exception as e: print(f"Warning: Failed to delete old file {old_file_path}: {e}") if replaced_existing: @@ -431,6 +431,7 @@ async def upload_audio(audio_file: UploadFile = File(...), meeting_id: int = For connection.commit() cursor.close() try: + # start_transcription 会自动删除旧的转录数据和任务记录 task_id = transcription_service.start_transcription(meeting_id, '/'+str(relative_path)) print(f"Transcription task started with ID: {task_id}") except Exception as e: diff --git a/app/services/async_transcription_service.py b/app/services/async_transcription_service.py index 1db34f5..cd4e50d 100644 --- a/app/services/async_transcription_service.py +++ b/app/services/async_transcription_service.py @@ -24,21 +24,43 @@ class AsyncTranscriptionService: def start_transcription(self, meeting_id: int, audio_file_path: str) -> str: """ 启动异步转录任务 - + Args: meeting_id: 会议ID audio_file_path: 音频文件相对路径 - + Returns: str: 业务任务ID """ try: - # 构造完整的文件URL + # 1. 删除该会议的旧转录数据和任务记录,并清空会议总结 + print(f"Cleaning old transcription data for meeting_id: {meeting_id}") + with get_db_connection() as connection: + cursor = connection.cursor() + + # 删除旧的转录文本段落 + cursor.execute("DELETE FROM transcript_segments WHERE meeting_id = %s", (meeting_id,)) + deleted_segments = cursor.rowcount + print(f"Deleted {deleted_segments} old transcript segments") + + # 删除旧的转录任务记录 + cursor.execute("DELETE FROM transcript_tasks WHERE meeting_id = %s", (meeting_id,)) + deleted_tasks = cursor.rowcount + print(f"Deleted {deleted_tasks} old transcript tasks") + + # 清空会议总结内容 + cursor.execute("UPDATE meetings SET summary = NULL WHERE meeting_id = %s", (meeting_id,)) + print(f"Cleared summary for meeting_id: {meeting_id}") + + connection.commit() + cursor.close() + + # 2. 构造完整的文件URL file_url = f"{self.base_url}{audio_file_path}" - + print(f"Starting transcription for meeting_id: {meeting_id}, file_url: {file_url}") - - # 调用Paraformer异步API + + # 3. 调用Paraformer异步API task_response = Transcription.async_call( model='paraformer-v2', file_urls=[file_url], @@ -47,15 +69,15 @@ class AsyncTranscriptionService: diarization_enabled=True, speaker_count=10 ) - + if task_response.status_code != HTTPStatus.OK: print(f"Failed to start transcription: {task_response.status_code}, {task_response.message}") raise Exception(f"Transcription API error: {task_response.message}") - + paraformer_task_id = task_response.output.task_id business_task_id = str(uuid.uuid4()) - - # 在Redis中存储任务映射 + + # 4. 在Redis中存储任务映射 current_time = datetime.now().isoformat() task_data = { 'business_task_id': business_task_id, @@ -67,17 +89,17 @@ class AsyncTranscriptionService: 'created_at': current_time, 'updated_at': current_time } - + # 存储到Redis,过期时间24小时 self.redis_client.hset(f"task:{business_task_id}", mapping=task_data) self.redis_client.expire(f"task:{business_task_id}", 86400) - - # 在数据库中创建任务记录 + + # 5. 在数据库中创建任务记录 self._save_task_to_db(business_task_id, paraformer_task_id, meeting_id, audio_file_path) - + print(f"Transcription task created: {business_task_id}") return business_task_id - + except Exception as e: print(f"Error starting transcription: {e}") raise e @@ -391,9 +413,9 @@ class AsyncTranscriptionService: cursor = connection.cursor() # 清除该会议的现有转录分段 - delete_query = "DELETE FROM transcript_segments WHERE meeting_id = %s" - cursor.execute(delete_query, (meeting_id,)) - print(f"Deleted existing segments for meeting_id: {meeting_id}") + # delete_query = "DELETE FROM transcript_segments WHERE meeting_id = %s" + # cursor.execute(delete_query, (meeting_id,)) + # print(f"Deleted existing segments for meeting_id: {meeting_id}") # 插入新的转录分段 insert_query = ''' diff --git a/app/services/voice_service.py b/app/services/voice_service.py deleted file mode 100644 index c0ce16b..0000000 --- a/app/services/voice_service.py +++ /dev/null @@ -1,108 +0,0 @@ -from http import HTTPStatus -import requests -import json -import dashscope -from dashscope.audio.asr import Transcription -from app.core.config import QWEN_API_KEY -from app.core.database import get_db_connection - -class VoiceService: - def __init__(self): - dashscope.api_key = QWEN_API_KEY - - def transcribe(self, file_urls: list[str], meeting_id: int): - print(f"Starting transcription for meeting_id: {meeting_id}, files: {file_urls}") - - try: - task_response = Transcription.async_call( - model='paraformer-v2', - file_urls=file_urls, - language_hints=['zh', 'en'], - disfluency_removal_enabled=True, - diarization_enabled=True, - speaker_count=10 - ) - - transcribe_response = Transcription.wait(task=task_response.output.task_id) - - if transcribe_response.status_code != HTTPStatus.OK: - print(f"Transcription failed: {transcribe_response.status_code}, {transcribe_response.message}") - return - - print("Transcription task submitted successfully!") - if not (transcribe_response.output and transcribe_response.output.get('results')): - print("No transcription results found in the response.") - return - - transcription_url = transcribe_response.output['results'][0]['transcription_url'] - print(f"Fetching transcription from URL: {transcription_url}") - - response = requests.get(transcription_url) - response.raise_for_status() - transcription_data = response.json() - - self._save_segments_to_db(transcription_data, meeting_id) - - except requests.exceptions.RequestException as e: - print(f"Error fetching transcription from URL: {e}") - except json.JSONDecodeError as e: - print(f"Error decoding JSON from transcription URL: {e}") - except Exception as e: - print(f"An unexpected error occurred: {e}") - - def _save_segments_to_db(self, data: dict, meeting_id: int): - segments_to_insert = [] - for transcript in data.get('transcripts', []): - for sentence in transcript.get('sentences', []): - speaker_id = sentence.get('speaker_id', -1) - segments_to_insert.append(( - meeting_id, - speaker_id, # For the new speaker_id column - speaker_id, # For the speaker_tag column (initial value) - sentence.get('begin_time'), - sentence.get('end_time'), - sentence.get('text') - )) - - if not segments_to_insert: - print("No segments to save.") - return - - try: - with get_db_connection() as connection: - cursor = connection.cursor() - - # Clear existing segments for this meeting to avoid duplicates - delete_query = "DELETE FROM transcript_segments WHERE meeting_id = %s" - cursor.execute(delete_query, (meeting_id,)) - print(f"Deleted existing segments for meeting_id: {meeting_id}") - - insert_query = ''' - INSERT INTO transcript_segments (meeting_id, speaker_id, speaker_tag, start_time_ms, end_time_ms, text_content) - VALUES (%s, %s, %s, %s, %s, %s) - ''' - cursor.executemany(insert_query, segments_to_insert) - connection.commit() - print(f"Successfully saved {len(segments_to_insert)} segments to the database for meeting_id: {meeting_id}") - - except Exception as e: - print(f"Database error: {e}") - -# Main method for testing -if __name__ == '__main__': - # This is an example of how to use the service. - # You need to provide a valid meeting_id that exists in your database - # and a publicly accessible URL for the audio file. - - # Example usage: - # 1. Make sure you have a meeting with meeting_id = 1 in your database. - # 2. Make sure the audio file URL is correct and accessible. - - test_meeting_id = 40 - # Please replace with your own publicly accessible audio file URL - test_file_urls = ['http://t0vogyxkz.hn-bkt.clouddn.com/test/dajiang.m4a'] - - print("--- Running Voice Service Test ---") - voice_service = VoiceService() - voice_service.transcribe(file_urls=test_file_urls, meeting_id=test_meeting_id) - print("--- Voice Service Test Finished ---") \ No newline at end of file diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index cc19d27..e97f799 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -6,6 +6,7 @@ services: ports: - "8001:8001" environment: + - TZ=Asia/Shanghai # Python运行环境 - PYTHONPATH=/app - PYTHONUNBUFFERED=1