diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py index 7e9fa4237..0d63c9665 100644 --- a/apps/knowledge/serializers/document.py +++ b/apps/knowledge/serializers/document.py @@ -103,6 +103,11 @@ class BatchCancelInstanceSerializer(serializers.Serializer): raise AppApiException(500, _('task type not support')) +class BatchDocumentExportSerializer(serializers.Serializer): + document_id_list = serializers.ListField(required=True, allow_empty=False, child=serializers.UUIDField(required=True), + label=_('document id list')) + + class DocumentInstanceSerializer(serializers.Serializer): name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1, source=_('document name')) @@ -560,19 +565,25 @@ class DocumentSerializers(serializers.Serializer): if not query_set.exists(): raise AppApiException(500, _('Knowledge id does not exist')) document_id = self.data.get('document_id') - if not QuerySet(Document).filter(id=document_id).exists(): + if not QuerySet(Document).filter(id=document_id, knowledge_id=self.data.get('knowledge_id')).exists(): raise AppApiException(500, _('document id not exist')) def export(self, with_valid=True): if with_valid: self.is_valid(raise_exception=True) - document = QuerySet(Document).filter(id=self.data.get("document_id")).first() - paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")), + document = QuerySet(Document).filter( + id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id") + ).first() + paragraph_list = native_search(QuerySet(Paragraph).filter( + document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id") + ), get_file_content( os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql'))) problem_mapping_list = native_search( - QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content( + QuerySet(ProblemParagraphMapping).filter( + document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id") + ), get_file_content( os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')), with_table_name=True) data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document]) @@ -585,13 +596,19 @@ class DocumentSerializers(serializers.Serializer): def export_zip(self, with_valid=True): if with_valid: self.is_valid(raise_exception=True) - document = QuerySet(Document).filter(id=self.data.get("document_id")).first() - paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")), + document = QuerySet(Document).filter( + id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id") + ).first() + paragraph_list = native_search(QuerySet(Paragraph).filter( + document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id") + ), get_file_content( os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql'))) problem_mapping_list = native_search( - QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content( + QuerySet(ProblemParagraphMapping).filter( + document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id") + ), get_file_content( os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')), with_table_name=True) data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document]) @@ -612,7 +629,20 @@ class DocumentSerializers(serializers.Serializer): def download_source_file(self): self.is_valid(raise_exception=True) - file = QuerySet(File).filter(source_id=self.data.get('document_id')).first() + document = QuerySet(Document).filter( + id=self.data.get('document_id'), knowledge_id=self.data.get('knowledge_id') + ).first() + source_file_id = document.meta.get('source_file_id') if document and document.meta else None + file_query_set = QuerySet(File).filter(source_type=FileSourceType.DOCUMENT, source_id=self.data.get('document_id')) + file = file_query_set.filter(id=source_file_id).first() if source_file_id else None + if not file: + file = file_query_set.first() + if not file and source_file_id: + file = QuerySet(File).filter( + id=source_file_id, + source_type=FileSourceType.KNOWLEDGE, + source_id=self.data.get('knowledge_id') + ).first() if not file: raise AppApiException(500, _('File not exist. Only manually uploaded documents are supported')) return FileSerializer.Operate(data={'id': file.id}).get(with_valid=True) @@ -620,7 +650,10 @@ class DocumentSerializers(serializers.Serializer): def one(self, with_valid=False): self.is_valid(raise_exception=True) query_set = QuerySet(model=Document) - query_set = query_set.filter(**{'id': self.data.get("document_id")}) + query_set = query_set.filter(**{ + 'id': self.data.get("document_id"), + 'knowledge_id': self.data.get("knowledge_id") + }) return native_search({ 'document_custom_sql': query_set, 'order_by_query': QuerySet(Document).order_by('-create_time', 'id') @@ -1034,6 +1067,26 @@ class DocumentSerializers(serializers.Serializer): file.source_id = self.data.get('knowledge_id') file.save(file_bytes) + def normalize_split_result(self, result, file, split_handle, file_id): + result_list = result if isinstance(result, list) else [result] + if len(result_list) == 0: + result_list = [{'content': []}] + for index, item in enumerate(result_list): + if item is None: + item = {'content': []} + result_list[index] = item + content = item.get('content') + if not content and hasattr(split_handle, 'get_content'): + try: + file.seek(0) + raw_content = split_handle.get_content(file, self.save_image) + if raw_content: + item['content'] = [{'title': '', 'content': raw_content}] + except Exception as e: + maxkb_logger.warning(f'Fallback split content failed: {e}') + item['source_file_id'] = file_id + return result_list + def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int): # 保存源文件 file_id = uuid.uuid7() @@ -1051,19 +1104,9 @@ class DocumentSerializers(serializers.Serializer): for split_handle in split_handles: if split_handle.support(file, get_buffer): result = split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image) - if isinstance(result, list): - for item in result: - item['source_file_id'] = file_id - return result - result['source_file_id'] = file_id - return [result] + return self.normalize_split_result(result, file, split_handle, file_id) result = default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image) - if isinstance(result, list): - for item in result: - item['source_file_id'] = file_id - return result - result['source_file_id'] = file_id - return [result] + return self.normalize_split_result(result, file, default_split_handle, file_id) class SplitPattern(serializers.Serializer): workspace_id = serializers.CharField(required=False, label=_('workspace id'), allow_null=True) @@ -1078,6 +1121,9 @@ class DocumentSerializers(serializers.Serializer): {'key': '####', 'value': "(?<=\\n)(?|]', '_', document.name or str(document.id)).strip() or str(document.id) + used_names[safe_name] += 1 + file_name = safe_name if used_names[safe_name] == 1 else f'{safe_name}_{used_names[safe_name]}' + workbook.save(os.path.join(tempdir, f'{file_name}.xlsx')) + zip_dir(tempdir, zip_buffer) + response.write(zip_buffer.getvalue()) + return response + @transaction.atomic def batch_delete(self, instance: Dict, with_valid=True): if with_valid: @@ -1539,18 +1623,28 @@ class DocumentSerializers(serializers.Serializer): def replace(self): self.is_valid(raise_exception=True) file = self.data.get('file') - source_file = QuerySet(File).filter(source_id=self.data.get('document_id')).first() + document = QuerySet(Document).filter( + id=self.data.get('document_id'), + knowledge_id=self.data.get('knowledge_id') + ).first() + source_file = QuerySet(File).filter( + source_type=FileSourceType.DOCUMENT, + source_id=self.data.get('document_id') + ).first() + file_content = file.read() if not source_file: + new_source_file_id = uuid.uuid7() # 不存在手动关联一个文档 new_source_file_id = uuid.uuid7() new_source_file = File( id=new_source_file_id, file_name=file.name, + file_size=file.size, source_type=FileSourceType.DOCUMENT, source_id=self.data.get('document_id'), ) - new_source_file.save(file.read()) + new_source_file.save(file_content) # 更新Document的meta字段 QuerySet(Document).filter(id=self.data.get('document_id')).update( meta=Func( @@ -1563,21 +1657,44 @@ class DocumentSerializers(serializers.Serializer): ) ) else: - # 获取原文件的sha256_hash - original_hash = source_file.sha256_hash + source_file.file_name = file.name + source_file.file_size = file.size + source_file.save(file_content) - # 读取新文件内容 - file_content = file.read() + file.seek(0) + parsed_documents = DocumentSerializers.Split( + data={'workspace_id': self.data.get('workspace_id'), 'knowledge_id': self.data.get('knowledge_id')} + ).file_to_paragraph(file, None, None, 4096) + paragraphs = [] + for parsed_document in parsed_documents: + paragraphs.extend(parsed_document.get('content', [])) - # 查找所有具有相同sha256_hash的文件 - files_to_update = QuerySet(File).filter( - sha256_hash=original_hash, - source_id__in=[self.data.get('knowledge_id'), self.data.get('document_id')] - ) + paragraph_ids = QuerySet(Paragraph).filter(document_id=document.id).values_list("id", flat=True) + delete_problems_and_mappings(paragraph_ids) + QuerySet(Paragraph).filter(document_id=document.id).delete() + delete_embedding_by_document(document.id) - # 更新所有相同hash的文件 - for file_obj in files_to_update: - file_obj.save(file_content) + document.char_length = reduce(lambda x, y: x + y, [len(p.get('content', '')) for p in paragraphs], 0) + document.name = file.name[:150] + document.save() + + document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs) + paragraph_model_list = document_paragraph_model.get('paragraph_model_list') + problem_paragraph_object_list = document_paragraph_model.get('problem_paragraph_object_list') + problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage( + problem_paragraph_object_list, document.knowledge_id + ).to_problem_model_list() + if len(paragraph_model_list) > 0: + for i, paragraph in enumerate(paragraph_model_list): + paragraph.position = i + 1 + QuerySet(Paragraph).bulk_create(paragraph_model_list) + bulk_create_in_batches(Problem, problem_model_list, batch_size=1000) + bulk_create_in_batches(ProblemParagraphMapping, problem_paragraph_mapping_list, batch_size=1000) + DocumentSerializers.Operate(data={ + 'workspace_id': self.data.get('workspace_id'), + 'knowledge_id': self.data.get('knowledge_id'), + 'document_id': self.data.get('document_id') + }).refresh(with_valid=True) return True diff --git a/apps/knowledge/urls.py b/apps/knowledge/urls.py index 3b41be1e9..c2cfb2285 100644 --- a/apps/knowledge/urls.py +++ b/apps/knowledge/urls.py @@ -31,6 +31,7 @@ urlpatterns = [ path('workspace//knowledge//document/batch_sync', views.DocumentView.BatchSync.as_view()), path('workspace//knowledge//document/batch_delete', views.DocumentView.BatchDelete.as_view()), path('workspace//knowledge//document/batch_refresh', views.DocumentView.BatchRefresh.as_view()), + path('workspace//knowledge//document/batch_export_zip', views.DocumentView.BatchExportZip.as_view()), path('workspace//knowledge//document/batch_generate_related', views.DocumentView.BatchGenerateRelated.as_view()), path('workspace//knowledge//document/web', views.WebDocumentView.as_view()), path('workspace//knowledge//document/qa', views.QaDocumentView.as_view()), diff --git a/apps/knowledge/views/document.py b/apps/knowledge/views/document.py index 1b3301d69..3c5afa5f3 100644 --- a/apps/knowledge/views/document.py +++ b/apps/knowledge/views/document.py @@ -665,6 +665,29 @@ class DocumentView(APIView): 'workspace_id': workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id }).export_zip() + class BatchExportZip(APIView): + authentication_classes = [TokenAuth] + + @has_permissions( + PermissionConstants.KNOWLEDGE_DOCUMENT_EXPORT.get_workspace_knowledge_permission(), + PermissionConstants.KNOWLEDGE_DOCUMENT_EXPORT.get_workspace_permission_workspace_manage_role(), + RoleConstants.WORKSPACE_MANAGE.get_workspace_role(), + ViewPermission([RoleConstants.USER.get_workspace_role()], + [PermissionConstants.KNOWLEDGE.get_workspace_knowledge_permission()], CompareConstants.AND), + ) + @log( + menu='document', operate="Batch export documents", + get_operation_object=lambda r, keywords: get_knowledge_document_operation_object( + get_knowledge_operation_object(keywords.get('knowledge_id')), + get_document_operation_object_batch(r.data.get('document_id_list')) + ), + ) + def post(self, request: Request, workspace_id: str, knowledge_id: str): + return DocumentSerializers.Batch(data={ + 'workspace_id': workspace_id, + 'knowledge_id': knowledge_id + }).batch_export_zip(request.data) + class DownloadSourceFile(APIView): authentication_classes = [TokenAuth] diff --git a/ui/src/api/knowledge/document.ts b/ui/src/api/knowledge/document.ts index 86587b8b1..57d3d9647 100644 --- a/ui/src/api/knowledge/document.ts +++ b/ui/src/api/knowledge/document.ts @@ -1,5 +1,5 @@ import { Result } from '@/request/Result' -import { del, exportExcel, exportFile, get, post, put } from '@/request/index' +import { del, exportExcel, exportExcelPost, exportFile, get, post, put } from '@/request/index' import type { Ref } from 'vue' import type { KeyValue, pageRequest } from '@/api/type/common' @@ -195,6 +195,21 @@ const exportDocumentZip: ( ) } +const exportMulDocumentZip: ( + document_name: string, + knowledge_id: string, + document_id_list: string[], + loading?: Ref, +) => Promise = (document_name, knowledge_id, document_id_list, loading) => { + return exportExcelPost( + document_name.trim() + '.zip', + `${prefix.value}/${knowledge_id}/document/batch_export_zip`, + {}, + { document_id_list }, + loading, + ) +} + /** * 刷新文档向量库 * @param 参数 @@ -619,6 +634,7 @@ export default { postReplaceSourceFile, exportDocument, exportDocumentZip, + exportMulDocumentZip, putDocumentRefresh, putDocumentSync, putMulDocument, diff --git a/ui/src/components/resource-authorization-drawer/index.vue b/ui/src/components/resource-authorization-drawer/index.vue index 0eb8e76e1..926550ecb 100644 --- a/ui/src/components/resource-authorization-drawer/index.vue +++ b/ui/src/components/resource-authorization-drawer/index.vue @@ -128,7 +128,7 @@ @close="closeSingleSelectDialog" width="500px" > - +

{{ $t('views.system.resourceAuthorization.setting.currentOnly') }} @@ -140,6 +140,9 @@

+

+ {{ $t('common.confirm') }} +