fix: 修复知识库的1、3、5、6、7、8、10

2026-06-11 11:25:45 +08:00 · 2026-06-11 11:25:45 +08:00 · 9bddff2914
parent 29d3110799
commit 9bddff2914
6 changed files with 218 additions and 49 deletions
--- a/apps/knowledge/serializers/document.py
+++ b/apps/knowledge/serializers/document.py
@ -103,6 +103,11 @@ class BatchCancelInstanceSerializer(serializers.Serializer):
            raise AppApiException(500, _('task type not support'))


+class BatchDocumentExportSerializer(serializers.Serializer):
+    """Validates the batch-export request body: a required, non-empty list of document UUIDs."""
+    document_id_list = serializers.ListField(required=True, allow_empty=False, child=serializers.UUIDField(required=True),
+                                             label=_('document id list'))
+
+
 class DocumentInstanceSerializer(serializers.Serializer):
    name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1,
                                 source=_('document name'))
@ -560,19 +565,25 @@ class DocumentSerializers(serializers.Serializer):
            if not query_set.exists():
                raise AppApiException(500, _('Knowledge id does not exist'))
            document_id = self.data.get('document_id')
-            if not QuerySet(Document).filter(id=document_id).exists():
+            if not QuerySet(Document).filter(id=document_id, knowledge_id=self.data.get('knowledge_id')).exists():
                raise AppApiException(500, _('document id not exist'))

        def export(self, with_valid=True):
            if with_valid:
                self.is_valid(raise_exception=True)
-            document = QuerySet(Document).filter(id=self.data.get("document_id")).first()
-            paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")),
+            document = QuerySet(Document).filter(
+                id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
+            ).first()
+            paragraph_list = native_search(QuerySet(Paragraph).filter(
+                document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
+            ),
                                           get_file_content(
                                               os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
                                                            'list_paragraph_document_name.sql')))
            problem_mapping_list = native_search(
-                QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content(
+                QuerySet(ProblemParagraphMapping).filter(
+                    document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
+                ), get_file_content(
                    os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
                with_table_name=True)
            data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document])
@ -585,13 +596,19 @@ class DocumentSerializers(serializers.Serializer):
        def export_zip(self, with_valid=True):
            if with_valid:
                self.is_valid(raise_exception=True)
-            document = QuerySet(Document).filter(id=self.data.get("document_id")).first()
-            paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")),
+            document = QuerySet(Document).filter(
+                id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
+            ).first()
+            paragraph_list = native_search(QuerySet(Paragraph).filter(
+                document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
+            ),
                                           get_file_content(
                                               os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
                                                            'list_paragraph_document_name.sql')))
            problem_mapping_list = native_search(
-                QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content(
+                QuerySet(ProblemParagraphMapping).filter(
+                    document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
+                ), get_file_content(
                    os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
                with_table_name=True)
            data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document])
@ -612,7 +629,20 @@ class DocumentSerializers(serializers.Serializer):

        def download_source_file(self):
+            """Return the stored source file of the document.
+
+            Lookup order:
+              1. the DOCUMENT-type file of this document whose id equals
+                 ``document.meta['source_file_id']``;
+              2. any DOCUMENT-type file attached to this document;
+              3. the KNOWLEDGE-type file with that id attached to the knowledge base.
+
+            :return: the result of ``FileSerializer.Operate(...).get(with_valid=True)``
+            :raises AppApiException: 500 when no file is found by any of the lookups
+            """
            self.is_valid(raise_exception=True)
-            file = QuerySet(File).filter(source_id=self.data.get('document_id')).first()
+            # The document is looked up inside the requested knowledge base only.
+            document = QuerySet(Document).filter(
+                id=self.data.get('document_id'), knowledge_id=self.data.get('knowledge_id')
+            ).first()
+            # meta may be empty/None, in which case no preferred file id is known.
+            source_file_id = document.meta.get('source_file_id') if document and document.meta else None
+            file_query_set = QuerySet(File).filter(source_type=FileSourceType.DOCUMENT, source_id=self.data.get('document_id'))
+            file = file_query_set.filter(id=source_file_id).first() if source_file_id else None
+            if not file:
+                file = file_query_set.first()
+            # Last resort: the file recorded in meta but stored against the knowledge base.
+            if not file and source_file_id:
+                file = QuerySet(File).filter(
+                    id=source_file_id,
+                    source_type=FileSourceType.KNOWLEDGE,
+                    source_id=self.data.get('knowledge_id')
+                ).first()
            if not file:
                raise AppApiException(500, _('File not exist. Only manually uploaded documents are supported'))
            return FileSerializer.Operate(data={'id': file.id}).get(with_valid=True)
@ -620,7 +650,10 @@ class DocumentSerializers(serializers.Serializer):
        def one(self, with_valid=False):
            self.is_valid(raise_exception=True)
            query_set = QuerySet(model=Document)
-            query_set = query_set.filter(**{'id': self.data.get("document_id")})
+            query_set = query_set.filter(**{
+                'id': self.data.get("document_id"),
+                'knowledge_id': self.data.get("knowledge_id")
+            })
            return native_search({
                'document_custom_sql': query_set,
                'order_by_query': QuerySet(Document).order_by('-create_time', 'id')
@ -1034,6 +1067,26 @@ class DocumentSerializers(serializers.Serializer):
                    file.source_id = self.data.get('knowledge_id')
                    file.save(file_bytes)

+        def normalize_split_result(self, result, file, split_handle, file_id):
+            """Normalize a split handler's output to a non-empty list of items tagged with the source file id.
+
+            :param result: return value of ``split_handle.handle(...)``; a single item or a list of items
+            :param file: the uploaded file; rewound with ``seek(0)`` before the fallback read
+            :param split_handle: the handler that produced ``result``; its optional
+                ``get_content`` method is used as a fallback when an item has no content
+            :param file_id: value stored on every item under ``source_file_id``
+            :return: list of items (a list passed in as ``result`` is modified in place)
+            """
+            result_list = result if isinstance(result, list) else [result]
+            # An empty list still yields one item with empty content.
+            if len(result_list) == 0:
+                result_list = [{'content': []}]
+            for index, item in enumerate(result_list):
+                # A missing item is replaced by an empty placeholder.
+                if item is None:
+                    item = {'content': []}
+                    result_list[index] = item
+                content = item.get('content')
+                # No content was produced: fall back to the handler's raw content,
+                # stored as a single entry with an empty title.
+                if not content and hasattr(split_handle, 'get_content'):
+                    try:
+                        file.seek(0)
+                        raw_content = split_handle.get_content(file, self.save_image)
+                        if raw_content:
+                            item['content'] = [{'title': '', 'content': raw_content}]
+                    except Exception as e:
+                        # Best effort: a failed fallback is only logged; the item keeps its empty content.
+                        maxkb_logger.warning(f'Fallback split content failed: {e}')
+                item['source_file_id'] = file_id
+            return result_list
+
        def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int):
            # 保存源文件
            file_id = uuid.uuid7()
@ -1051,19 +1104,9 @@ class DocumentSerializers(serializers.Serializer):
            for split_handle in split_handles:
                if split_handle.support(file, get_buffer):
                    result = split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
-                    if isinstance(result, list):
-                        for item in result:
-                            item['source_file_id'] = file_id
-                        return result
-                    result['source_file_id'] = file_id
-                    return [result]
+                    return self.normalize_split_result(result, file, split_handle, file_id)
            result = default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
-            if isinstance(result, list):
-                for item in result:
-                    item['source_file_id'] = file_id
-                return result
-            result['source_file_id'] = file_id
-            return [result]
+            return self.normalize_split_result(result, file, default_split_handle, file_id)

    class SplitPattern(serializers.Serializer):
        workspace_id = serializers.CharField(required=False, label=_('workspace id'), allow_null=True)
@ -1078,6 +1121,9 @@ class DocumentSerializers(serializers.Serializer):
                {'key': '####', 'value': "(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"},
                {'key': '#####', 'value': "(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"},
                {'key': '######', 'value': "(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"},
+                {'key': _('chapter'), 'value': '(?<=^)第[一二三四五六七八九十百千万0-9]+[章节篇部分].*|(?<=\\n)第[一二三四五六七八九十百千万0-9]+[章节篇部分].*'},
+                {'key': _('numbered heading'), 'value': '(?<=^)[0-9]+(\\.[0-9]+)*[、.．]\\s*.*|(?<=\\n)[0-9]+(\\.[0-9]+)*[、.．]\\s*.*'},
+                {'key': _('chinese numbered heading'), 'value': '(?<=^)[一二三四五六七八九十]+[、.．]\\s*.*|(?<=\\n)[一二三四五六七八九十]+[、.．]\\s*.*'},
                {'key': '-', 'value': '(?<! )- .*'},
                {'key': _('space'), 'value': '(?<! ) (?! )'},
                {'key': _('semicolon'), 'value': '(?<!；)；(?!；)'}, {'key': _('comma'), 'value': '(?<!，)，(?!，)'},
@ -1208,6 +1254,44 @@ class DocumentSerializers(serializers.Serializer):
            )
            return True

+        def batch_export_zip(self, instance: Dict, with_valid=True):
+            """Export several documents of one knowledge base as a zip of .xlsx workbooks.
+
+            :param instance: request payload; ``document_id_list`` holds the ids to export
+            :param with_valid: when true, validate both this serializer and ``instance``
+            :return: an ``HttpResponse`` of type ``application/zip`` with one workbook per document
+            :raises AppApiException: 500 when any requested id is not a document of this knowledge base
+            """
+            if with_valid:
+                self.is_valid(raise_exception=True)
+                BatchDocumentExportSerializer(data=instance).is_valid(raise_exception=True)
+            knowledge_id = self.data.get('knowledge_id')
+            # Tolerate a missing list when validation was skipped by the caller.
+            document_id_list = instance.get('document_id_list') or []
+            document_list = list(QuerySet(Document).filter(knowledge_id=knowledge_id, id__in=document_id_list))
+            # Duplicated ids in the request must not count as missing documents.
+            if len(document_list) != len({str(document_id) for document_id in document_id_list}):
+                raise AppApiException(500, _('Document id does not exist'))
+
+            zip_buffer = io.BytesIO()
+            with TemporaryDirectory() as tempdir:
+                # File names already written, case-folded so that names differing only by
+                # case do not overwrite each other on case-insensitive filesystems.
+                used_names = set()
+                for document in document_list:
+                    paragraph_list = native_search(QuerySet(Paragraph).filter(
+                        knowledge_id=knowledge_id, document_id=document.id
+                    ), get_file_content(
+                        os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql')
+                    ))
+                    problem_mapping_list = native_search(
+                        QuerySet(ProblemParagraphMapping).filter(knowledge_id=knowledge_id, document_id=document.id),
+                        get_file_content(os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
+                                                      'list_problem_mapping.sql')),
+                        with_table_name=True)
+                    data_dict, document_dict = DocumentSerializers.Operate.merge_problem(
+                        paragraph_list, problem_mapping_list, [document]
+                    )
+                    workbook = DocumentSerializers.Operate.get_workbook(data_dict, document_dict)
+                    # Replace characters that are not allowed in file names; fall back to the id.
+                    safe_name = re.sub(r'[\\/:*?"<>|]', '_', document.name or str(document.id)).strip() or str(document.id)
+                    # Keep appending _2, _3, ... until the name is really unused. A plain
+                    # per-name counter would emit "a_2" twice for documents "a", "a", "a_2".
+                    file_name, suffix = safe_name, 1
+                    while file_name.casefold() in used_names:
+                        suffix += 1
+                        file_name = f'{safe_name}_{suffix}'
+                    used_names.add(file_name.casefold())
+                    workbook.save(os.path.join(tempdir, f'{file_name}.xlsx'))
+                zip_dir(tempdir, zip_buffer)
+            response = HttpResponse(content_type='application/zip')
+            response['Content-Disposition'] = 'attachment; filename="documents.zip"'
+            response.write(zip_buffer.getvalue())
+            return response
+
        @transaction.atomic
        def batch_delete(self, instance: Dict, with_valid=True):
            if with_valid:
@ -1539,18 +1623,28 @@ class DocumentSerializers(serializers.Serializer):
        def replace(self):
+            """Replace the document's source file with the uploaded file and rebuild its paragraphs.
+
+            Steps visible in this method: store the uploaded bytes (creating a DOCUMENT-type
+            ``File`` when none exists, otherwise overwriting the existing one), split the file
+            again via ``DocumentSerializers.Split.file_to_paragraph`` (limit 4096), delete the old
+            paragraphs, their problem mappings and embeddings, update the document's
+            ``char_length`` and ``name``, insert the new paragraphs/problems and trigger
+            ``Operate.refresh``.
+
+            :return: ``True`` on success
+            :raises AppApiException: 500 when the document does not belong to the knowledge base
+            """
            self.is_valid(raise_exception=True)
            file = self.data.get('file')
-            source_file = QuerySet(File).filter(source_id=self.data.get('document_id')).first()
+            document = QuerySet(Document).filter(
+                id=self.data.get('document_id'),
+                knowledge_id=self.data.get('knowledge_id')
+            ).first()
+            if document is None:
+                # Fail with an API error instead of an AttributeError on document.id below.
+                raise AppApiException(500, _('document id not exist'))
+            source_file = QuerySet(File).filter(
+                source_type=FileSourceType.DOCUMENT,
+                source_id=self.data.get('document_id')
+            ).first()
+            # Read once; the same bytes are used by whichever branch stores the file.
+            file_content = file.read()

            if not source_file:
-                # 不存在手动关联一个文档
+                # No source file is attached yet: create one and link it to the document.
                new_source_file_id = uuid.uuid7()
                new_source_file = File(
                    id=new_source_file_id,
                    file_name=file.name,
+                    file_size=file.size,
                    source_type=FileSourceType.DOCUMENT,
                    source_id=self.data.get('document_id'),
                )
-                new_source_file.save(file.read())
+                new_source_file.save(file_content)
-                # 更新Document的meta字段
+                # Update the Document's meta field.
                QuerySet(Document).filter(id=self.data.get('document_id')).update(
                    meta=Func(
@ -1563,21 +1657,44 @@ class DocumentSerializers(serializers.Serializer):
                    )
                )
            else:
-                # 获取原文件的sha256_hash
-                original_hash = source_file.sha256_hash
+                source_file.file_name = file.name
+                source_file.file_size = file.size
+                source_file.save(file_content)

-                # 读取新文件内容
-                file_content = file.read()
+            # Rewind: the upload was consumed by file.read() above.
+            file.seek(0)
+            parsed_documents = DocumentSerializers.Split(
+                data={'workspace_id': self.data.get('workspace_id'), 'knowledge_id': self.data.get('knowledge_id')}
+            ).file_to_paragraph(file, None, None, 4096)
+            paragraphs = []
+            for parsed_document in parsed_documents:
+                paragraphs.extend(parsed_document.get('content', []))

-                # 查找所有具有相同sha256_hash的文件
-                files_to_update = QuerySet(File).filter(
-                    sha256_hash=original_hash,
-                    source_id__in=[self.data.get('knowledge_id'), self.data.get('document_id')]
-                )
+            paragraph_ids = QuerySet(Paragraph).filter(document_id=document.id).values_list("id", flat=True)
+            delete_problems_and_mappings(paragraph_ids)
+            QuerySet(Paragraph).filter(document_id=document.id).delete()
+            delete_embedding_by_document(document.id)

-                # 更新所有相同hash的文件
-                for file_obj in files_to_update:
-                    file_obj.save(file_content)
+            # `document` was loaded before the QuerySet.update(meta=...) above, so its in-memory
+            # meta may be stale; reload it so that save() does not write the old value back.
+            document.refresh_from_db(fields=['meta'])
+            document.char_length = sum(len(p.get('content', '')) for p in paragraphs)
+            document.name = file.name[:150]
+            document.save()
+
+            document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs)
+            paragraph_model_list = document_paragraph_model.get('paragraph_model_list')
+            problem_paragraph_object_list = document_paragraph_model.get('problem_paragraph_object_list')
+            problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage(
+                problem_paragraph_object_list, document.knowledge_id
+            ).to_problem_model_list()
+            if len(paragraph_model_list) > 0:
+                # Positions are 1-based and follow the split order.
+                for i, paragraph in enumerate(paragraph_model_list):
+                    paragraph.position = i + 1
+                QuerySet(Paragraph).bulk_create(paragraph_model_list)
+            bulk_create_in_batches(Problem, problem_model_list, batch_size=1000)
+            bulk_create_in_batches(ProblemParagraphMapping, problem_paragraph_mapping_list, batch_size=1000)
+            DocumentSerializers.Operate(data={
+                'workspace_id': self.data.get('workspace_id'),
+                'knowledge_id': self.data.get('knowledge_id'),
+                'document_id': self.data.get('document_id')
+            }).refresh(with_valid=True)

            return True

--- a/apps/knowledge/urls.py
+++ b/apps/knowledge/urls.py
@ -31,6 +31,7 @@ urlpatterns = [
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_sync', views.DocumentView.BatchSync.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_delete', views.DocumentView.BatchDelete.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_refresh', views.DocumentView.BatchRefresh.as_view()),
+    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_export_zip', views.DocumentView.BatchExportZip.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_generate_related', views.DocumentView.BatchGenerateRelated.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/web', views.WebDocumentView.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/qa', views.QaDocumentView.as_view()),
--- a/apps/knowledge/views/document.py
+++ b/apps/knowledge/views/document.py
@ -665,6 +665,29 @@ class DocumentView(APIView):
                'workspace_id': workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id
            }).export_zip()

+    class BatchExportZip(APIView):
+        """POST endpoint that exports the selected documents of a knowledge base through ``batch_export_zip``."""
+        authentication_classes = [TokenAuth]
+
+        @has_permissions(
+            PermissionConstants.KNOWLEDGE_DOCUMENT_EXPORT.get_workspace_knowledge_permission(),
+            PermissionConstants.KNOWLEDGE_DOCUMENT_EXPORT.get_workspace_permission_workspace_manage_role(),
+            RoleConstants.WORKSPACE_MANAGE.get_workspace_role(),
+            ViewPermission([RoleConstants.USER.get_workspace_role()],
+                           [PermissionConstants.KNOWLEDGE.get_workspace_knowledge_permission()], CompareConstants.AND),
+        )
+        @log(
+            menu='document', operate="Batch export documents",
+            get_operation_object=lambda r, keywords: get_knowledge_document_operation_object(
+                get_knowledge_operation_object(keywords.get('knowledge_id')),
+                get_document_operation_object_batch(r.data.get('document_id_list'))
+            ),
+        )
+        def post(self, request: Request, workspace_id: str, knowledge_id: str):
+            # request.data carries `document_id_list`; it is validated inside batch_export_zip.
+            return DocumentSerializers.Batch(data={
+                'workspace_id': workspace_id,
+                'knowledge_id': knowledge_id
+            }).batch_export_zip(request.data)
+
    class DownloadSourceFile(APIView):
        authentication_classes = [TokenAuth]

--- a/ui/src/api/knowledge/document.ts
+++ b/ui/src/api/knowledge/document.ts
@ -1,5 +1,5 @@
 import { Result } from '@/request/Result'
-import { del, exportExcel, exportFile, get, post, put } from '@/request/index'
+import { del, exportExcel, exportExcelPost, exportFile, get, post, put } from '@/request/index'
 import type { Ref } from 'vue'
 import type { KeyValue, pageRequest } from '@/api/type/common'

@ -195,6 +195,21 @@ const exportDocumentZip: (
  )
 }

+/**
+ * Export several documents of one knowledge base as a single zip download.
+ * @param document_name base name of the downloaded file; trimmed and suffixed with `.zip`
+ * @param knowledge_id id of the knowledge base that owns the documents
+ * @param document_id_list ids of the documents to export, passed to exportExcelPost as `{ document_id_list }`
+ * @param loading optional loading flag handed through to exportExcelPost
+ */
+const exportMulDocumentZip: (
+  document_name: string,
+  knowledge_id: string,
+  document_id_list: string[],
+  loading?: Ref<boolean>,
+) => Promise<any> = (document_name, knowledge_id, document_id_list, loading) => {
+  return exportExcelPost(
+    document_name.trim() + '.zip',
+    `${prefix.value}/${knowledge_id}/document/batch_export_zip`,
+    {},
+    { document_id_list },
+    loading,
+  )
+}
+
 /**
 * 刷新文档向量库
 * @param 参数
@ -619,6 +634,7 @@ export default {
  postReplaceSourceFile,
  exportDocument,
  exportDocumentZip,
+  exportMulDocumentZip,
  putDocumentRefresh,
  putDocumentSync,
  putMulDocument,
--- a/ui/src/components/resource-authorization-drawer/index.vue
+++ b/ui/src/components/resource-authorization-drawer/index.vue
@ -128,7 +128,7 @@
      @close="closeSingleSelectDialog"
      width="500px"
    >
-      <el-radio-group v-model="authAllChildren" class="radio-block">
+      <el-radio-group v-if="isFolder" v-model="authAllChildren" class="radio-block">
        <el-radio :value="false">
          <p class="color-text-primary lighter">
            {{ $t('views.system.resourceAuthorization.setting.currentOnly') }}
@ -140,6 +140,9 @@
          </p>
        </el-radio>
      </el-radio-group>
+      <p v-else class="color-text-primary lighter">
+        {{ $t('common.confirm') }}
+      </p>
      <template #footer>
        <div class="dialog-footer mt-24">
          <el-button @click="closeSingleSelectDialog">{{ $t('common.cancel') }}</el-button>
@ -415,18 +418,8 @@ function closeDialog() {
 }

 function permissionsHandle(val: any, row: any) {
+  // Every permission change is now staged in pendingPermissionChange and the
+  // dialog is opened; the direct submitPermissions call for non-folders is removed.
-  if (props.isFolder) {
  singleSelectDialogVisible.value = true
  pendingPermissionChange.value = { val, row }
-    return
-  }
-  const obj = [
-    {
-      user_id: row.id,
-      permission: val,
-    },
-  ]
-  submitPermissions(obj)
 }

 function submitPermissions(obj: any) {
--- a/ui/src/views/document/index.vue
+++ b/ui/src/views/document/index.vue
@ -77,6 +77,14 @@
                        v-if="permissionPrecise.doc_tag(id)"
                        >{{ $t('views.document.tag.addTag') }}
                      </el-dropdown-item>
+                      <el-dropdown-item
+                        divided
+                        @click="exportMulDocumentZip"
+                        :disabled="multipleSelection.length === 0"
+                        v-if="permissionPrecise.doc_export(id)"
+                      >
+                        {{ $t('views.document.setting.export') }} Zip
+                      </el-dropdown-item>
                      <el-dropdown-item
                        divided
                        @click="syncMulDocument"
@ -783,6 +791,7 @@ const MoreFilledPermission0 = (id: string) => {
    permissionPrecise.value.doc_migrate(id) ||
    (knowledgeDetail?.value.type === 1 && permissionPrecise.value.doc_sync(id)) ||
    (knowledgeDetail?.value.type === 2 && permissionPrecise.value.doc_sync(id)) ||
+    permissionPrecise.value.doc_export(id) ||
    permissionPrecise.value.doc_delete(id) || permissionPrecise.value.doc_tag(id)
  )
 }
@ -1112,6 +1121,16 @@ function deleteMulDocument() {
    .catch(() => {})
 }

+// Export the currently selected documents as one zip, named after the knowledge base
+// (or 'documents' when its name is unavailable); the selection is cleared on success.
+function exportMulDocumentZip() {
+  const arr: string[] = multipleSelection.value.map((v) => v.id)
+  loadSharedApi({ type: 'document', systemType: apiType.value })
+    .exportMulDocumentZip(knowledgeDetail.value?.name || 'documents', id, arr, loading)
+    .then(() => {
+      MsgSuccess(t('common.exportSuccess'))
+      multipleTableRef.value?.clearSelection()
+    })
+}
+
 function batchRefresh() {
  const arr: string[] = multipleSelection.value.map((v) => v.id)
  const embeddingBatchDocument = (stateList: Array<string>) => {