From 56a9e69912a2e987d93422e74c040530d65e65e6 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Tue, 3 Jun 2025 20:01:11 +0800 Subject: [PATCH] feat: add endpoint to download source files with updated parameter handling --- apps/common/constants/permission_constants.py | 5 ++ apps/knowledge/api/document.py | 32 ++++++++ apps/knowledge/models/knowledge.py | 4 + apps/knowledge/serializers/document.py | 78 ++++++++++++++++--- apps/knowledge/urls.py | 1 + apps/knowledge/views/document.py | 18 ++++- 6 files changed, 125 insertions(+), 13 deletions(-) diff --git a/apps/common/constants/permission_constants.py b/apps/common/constants/permission_constants.py index 42bc609a8..f97802934 100644 --- a/apps/common/constants/permission_constants.py +++ b/apps/common/constants/permission_constants.py @@ -393,6 +393,11 @@ class PermissionConstants(Enum): role_list=[RoleConstants.ADMIN, RoleConstants.USER], parent_group=[WorkspaceGroup.KNOWLEDGE, UserGroup.KNOWLEDGE] ) + KNOWLEDGE_DOCUMENT_DOWNLOAD_RAW = Permission( + group=Group.KNOWLEDGE_DOCUMENT, operate=Operate.EXPORT, + role_list=[RoleConstants.ADMIN, RoleConstants.USER], + parent_group=[WorkspaceGroup.KNOWLEDGE, UserGroup.KNOWLEDGE] + ) KNOWLEDGE_DOCUMENT_GENERATE = Permission( group=Group.KNOWLEDGE_DOCUMENT, operate=Operate.GENERATE, role_list=[RoleConstants.ADMIN, RoleConstants.USER], diff --git a/apps/knowledge/api/document.py b/apps/knowledge/api/document.py index 8c2c3049e..6c506ccb1 100644 --- a/apps/knowledge/api/document.py +++ b/apps/knowledge/api/document.py @@ -503,3 +503,35 @@ class DocumentMigrateAPI(APIMixin): @staticmethod def get_request(): return DocumentMigrateSerializer + + +class DocumentDownloadSourceAPI(APIMixin): + @staticmethod + def get_parameters(): + return [ + OpenApiParameter( + name="workspace_id", + description="工作空间id", + type=OpenApiTypes.STR, + location='path', + required=True, + ), + OpenApiParameter( + name="knowledge_id", + description="知识库id", + type=OpenApiTypes.STR, + location='path', + 
required=True, + ), + OpenApiParameter( + name="document_id", + description="文档id", + type=OpenApiTypes.STR, + location='path', + required=True, + ), + ] + + @staticmethod + def get_response(): + return DefaultResultSerializer diff --git a/apps/knowledge/models/knowledge.py b/apps/knowledge/models/knowledge.py index e8b600c0d..28ab0abc7 100644 --- a/apps/knowledge/models/knowledge.py +++ b/apps/knowledge/models/knowledge.py @@ -220,6 +220,10 @@ class FileSourceType(models.TextChoices): KNOWLEDGE = "KNOWLEDGE" # 应用 跟随应用被删除而被删除 source_id 为应用id APPLICATION = "APPLICATION" + # 工具 跟随工具被删除而被删除 source_id 为工具id + TOOL = "TOOL" + # 文档 + DOCUMENT = "DOCUMENT" # 临时30分钟 数据30分钟后被清理 source_id 为TEMPORARY_30_MINUTE TEMPORARY_30_MINUTE = "TEMPORARY_30_MINUTE" # 临时120分钟 数据120分钟后被清理 source_id为TEMPORARY_100_MINUTE diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py index 5369c91bc..fb08797a4 100644 --- a/apps/knowledge/serializers/document.py +++ b/apps/knowledge/serializers/document.py @@ -12,7 +12,7 @@ import uuid_utils.compat as uuid from celery_once import AlreadyQueued from django.core import validators from django.db import transaction, models -from django.db.models import QuerySet, Model +from django.db.models import QuerySet from django.db.models.functions import Substr, Reverse from django.http import HttpResponse from django.utils.translation import gettext_lazy as _, gettext, get_language, to_locale @@ -43,7 +43,7 @@ from common.utils.common import post, get_file_content, bulk_create_in_batches, from common.utils.fork import Fork from common.utils.split_model import get_split_model, flat_map from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \ - TaskType, File + TaskType, File, FileSourceType from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, \ get_embedding_model_id_by_knowledge_id, MetaSerializer, write_image, zip_dir from
knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \ @@ -54,6 +54,7 @@ from knowledge.task.embedding import embedding_by_document, delete_embedding_by_ from knowledge.task.generate import generate_related_by_document_id from knowledge.task.sync import sync_web_document from maxkb.const import PROJECT_DIR +from models_provider.models import Model default_split_handle = TextSplitHandle() split_handles = [ @@ -87,6 +88,7 @@ class BatchCancelInstanceSerializer(serializers.Serializer): class DocumentInstanceSerializer(serializers.Serializer): name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1) paragraphs = ParagraphInstanceSerializer(required=False, many=True, allow_null=True) + source_file_id = serializers.UUIDField(required=True, label=_('source file id')) class CancelInstanceSerializer(serializers.Serializer): @@ -545,6 +547,9 @@ class DocumentSerializers(serializers.Serializer): response.write(zip_buffer.getvalue()) return response + def download_source_file(self): + pass + def one(self, with_valid=False): if with_valid: self.is_valid(raise_exception=True) @@ -626,8 +631,6 @@ class DocumentSerializers(serializers.Serializer): embedding_model = QuerySet(Model).filter(id=embedding_model_id).first() if embedding_model is None: raise AppApiException(500, _('Model does not exist')) - if embedding_model.permission_type == 'PRIVATE' and knowledge_user_id != embedding_model.user_id: - raise AppApiException(500, _('No permission to use this model') + f"{embedding_model.name}") document_id = self.data.get("document_id") ListenerManagement.update_status( QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING, State.PENDING @@ -859,6 +862,8 @@ class DocumentSerializers(serializers.Serializer): for file in save_image_list: file_bytes = file.meta.pop('content') file.meta['knowledge_id'] = self.data.get('knowledge_id') + file.source_type = FileSourceType.KNOWLEDGE + file.source_id = 
self.data.get('knowledge_id') file.save(file_bytes) class Split(serializers.Serializer): @@ -901,19 +906,39 @@ class DocumentSerializers(serializers.Serializer): for file in save_image_list: file_bytes = file.meta.pop('content') file.meta['knowledge_id'] = self.data.get('knowledge_id') + file.source_type = FileSourceType.KNOWLEDGE + file.source_id = self.data.get('knowledge_id') file.save(file_bytes) def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int): + # 保存源文件 + file_id = uuid.uuid7() + raw_file = File( + id=file_id, + file_name=file.name, + file_size=file.size, + source_type=FileSourceType.KNOWLEDGE, + source_id=self.data.get('knowledge_id'), + ) + raw_file.save(file.read()) + file.seek(0) + get_buffer = FileBufferHandle().get_buffer for split_handle in split_handles: if split_handle.support(file, get_buffer): result = split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image) if isinstance(result, list): + for item in result: + item['source_file_id'] = file_id return result + result['source_file_id'] = file_id return [result] result = default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image) if isinstance(result, list): + for item in result: + item['source_file_id'] = file_id return result + result['source_file_id'] = file_id return [result] class SplitPattern(serializers.Serializer): @@ -937,14 +962,37 @@ class DocumentSerializers(serializers.Serializer): ] class Batch(serializers.Serializer): - workspace_id = serializers.UUIDField(required=True, label=_('workspace id')) + workspace_id = serializers.CharField(required=True, label=_('workspace id')) knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id')) @staticmethod - def post_embedding(document_list, knowledge_id): + def link_file(source_file_id, document_id): + source_file = QuerySet(File).filter(id=source_file_id).first() + if source_file: + # 获取原始文件内容 + file_content = 
source_file.get_bytes() + + # 创建新文件对象,复制原始文件的重要属性 + new_file = File( + id=uuid.uuid7(), + file_name=source_file.file_name, + file_size=source_file.file_size, + source_type=FileSourceType.DOCUMENT, + source_id=document_id, # 关联到当前文档ID + meta=source_file.meta.copy() if source_file.meta else {} + ) + + # 保存文件内容和元数据 + new_file.save(file_content) + + @staticmethod + def post_embedding(document_list, knowledge_id, workspace_id): for document_dict in document_list: - DocumentSerializers.Operate( - data={'knowledge_id': knowledge_id, 'document_id': document_dict.get('id')}).refresh() + DocumentSerializers.Operate(data={ + 'knowledge_id': knowledge_id, + 'document_id': document_dict.get('id'), + 'workspace_id': workspace_id + }).refresh() return document_list @post(post_function=post_embedding) @@ -953,15 +1001,21 @@ class DocumentSerializers(serializers.Serializer): if with_valid: self.is_valid(raise_exception=True) DocumentInstanceSerializer(many=True, data=instance_list).is_valid(raise_exception=True) + workspace_id = self.data.get("workspace_id") knowledge_id = self.data.get("knowledge_id") document_model_list = [] paragraph_model_list = [] problem_paragraph_object_list = [] # 插入文档 for document in instance_list: - document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(knowledge_id, - document) - document_model_list.append(document_paragraph_dict_model.get('document')) + document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model( + knowledge_id, + document + ) + # 保存文档和文件的关系 + document_instance = document_paragraph_dict_model.get('document') + self.link_file(document['source_file_id'], document_instance.id) + document_model_list.append(document_instance) for paragraph in document_paragraph_dict_model.get('paragraph_model_list'): paragraph_model_list.append(paragraph) for problem_paragraph_object in document_paragraph_dict_model.get('problem_paragraph_object_list'): @@ -992,7 +1046,7 @@ class
DocumentSerializers(serializers.Serializer): os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql') ), with_search_one=False - ), knowledge_id + ), knowledge_id, workspace_id @staticmethod def _batch_sync(document_id_list: List[str]): diff --git a/apps/knowledge/urls.py b/apps/knowledge/urls.py index d20e1ae2f..de06faad9 100644 --- a/apps/knowledge/urls.py +++ b/apps/knowledge/urls.py @@ -38,6 +38,7 @@ urlpatterns = [ path('workspace//knowledge//document//batch_cancel_task', views.DocumentView.BatchCancelTask.as_view()), path('workspace//knowledge//document//export', views.DocumentView.Export.as_view()), path('workspace//knowledge//document//export_zip', views.DocumentView.ExportZip.as_view()), + path('workspace//knowledge//document//download_source_file', views.DocumentView.DownloadSourceFile.as_view()), path('workspace//knowledge//document//paragraph', views.ParagraphView.as_view()), path('workspace//knowledge//document//paragraph/batch_delete', views.ParagraphView.BatchDelete.as_view()), path('workspace//knowledge//document//paragraph/batch_generate_related', views.ParagraphView.BatchGenerateRelated.as_view()), diff --git a/apps/knowledge/views/document.py b/apps/knowledge/views/document.py index 846a6a9ee..524a9d0a6 100644 --- a/apps/knowledge/views/document.py +++ b/apps/knowledge/views/document.py @@ -12,7 +12,7 @@ from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentB DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI, TableDocumentCreateAPI, QaDocumentCreateAPI, \ WebDocumentCreateAPI, CancelTaskAPI, BatchCancelTaskAPI, SyncWebAPI, RefreshAPI, BatchEditHitHandlingAPI, \ DocumentTreeReadAPI, DocumentSplitPatternAPI, BatchRefreshAPI, BatchGenerateRelatedAPI, TemplateExportAPI, \ - DocumentExportAPI, DocumentMigrateAPI + DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI from knowledge.serializers.document import DocumentSerializers @@ -417,6 +417,22 @@ class DocumentView(APIView): 'workspace_id': 
workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id }).export_zip() + class DownloadSourceFile(APIView): + authentication_classes = [TokenAuth] + + @extend_schema( + summary=_('Download source file'), + operation_id=_('Download source file'), # type: ignore + parameters=DocumentDownloadSourceAPI.get_parameters(), + responses=DocumentDownloadSourceAPI.get_response(), + tags=[_('Knowledge Base/Documentation')] # type: ignore + ) + @has_permissions(PermissionConstants.KNOWLEDGE_DOCUMENT_DOWNLOAD_RAW.get_workspace_permission()) + def get(self, request: Request, workspace_id: str, knowledge_id: str, document_id: str): + return DocumentSerializers.Operate(data={ + 'workspace_id': workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id + }).download_source_file() + class Migrate(APIView): authentication_classes = [TokenAuth]