fix: 修复知识库的1、3、5、6、7、8、10

v3.2
panyy 2026-06-11 11:25:45 +08:00
parent 29d3110799
commit 9bddff2914
6 changed files with 218 additions and 49 deletions

View File

@ -103,6 +103,11 @@ class BatchCancelInstanceSerializer(serializers.Serializer):
raise AppApiException(500, _('task type not support'))
class BatchDocumentExportSerializer(serializers.Serializer):
    """Request payload for a batch document export: a non-empty list of document UUIDs."""
    document_id_list = serializers.ListField(
        required=True,
        allow_empty=False,
        label=_('document id list'),
        child=serializers.UUIDField(required=True),
    )
class DocumentInstanceSerializer(serializers.Serializer):
name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1,
source=_('document name'))
@ -560,19 +565,25 @@ class DocumentSerializers(serializers.Serializer):
if not query_set.exists():
raise AppApiException(500, _('Knowledge id does not exist'))
document_id = self.data.get('document_id')
if not QuerySet(Document).filter(id=document_id).exists():
if not QuerySet(Document).filter(id=document_id, knowledge_id=self.data.get('knowledge_id')).exists():
raise AppApiException(500, _('document id not exist'))
def export(self, with_valid=True):
if with_valid:
self.is_valid(raise_exception=True)
document = QuerySet(Document).filter(id=self.data.get("document_id")).first()
paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")),
document = QuerySet(Document).filter(
id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
).first()
paragraph_list = native_search(QuerySet(Paragraph).filter(
document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
),
get_file_content(
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'list_paragraph_document_name.sql')))
problem_mapping_list = native_search(
QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content(
QuerySet(ProblemParagraphMapping).filter(
document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
), get_file_content(
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
with_table_name=True)
data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document])
@ -585,13 +596,19 @@ class DocumentSerializers(serializers.Serializer):
def export_zip(self, with_valid=True):
if with_valid:
self.is_valid(raise_exception=True)
document = QuerySet(Document).filter(id=self.data.get("document_id")).first()
paragraph_list = native_search(QuerySet(Paragraph).filter(document_id=self.data.get("document_id")),
document = QuerySet(Document).filter(
id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
).first()
paragraph_list = native_search(QuerySet(Paragraph).filter(
document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
),
get_file_content(
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'list_paragraph_document_name.sql')))
problem_mapping_list = native_search(
QuerySet(ProblemParagraphMapping).filter(document_id=self.data.get("document_id")), get_file_content(
QuerySet(ProblemParagraphMapping).filter(
document_id=self.data.get("document_id"), knowledge_id=self.data.get("knowledge_id")
), get_file_content(
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
with_table_name=True)
data_dict, document_dict = self.merge_problem(paragraph_list, problem_mapping_list, [document])
@ -612,7 +629,20 @@ class DocumentSerializers(serializers.Serializer):
def download_source_file(self):
    """
    Locate the source file of the current document and hand it to FileSerializer.

    Lookup order:
      1. the DOCUMENT-typed file attached to this document whose id equals
         ``meta['source_file_id']`` (when the document records one);
      2. the first DOCUMENT-typed file attached to this document;
      3. the KNOWLEDGE-typed file with that recorded id attached to the knowledge base.

    :return: the result of ``FileSerializer.Operate(...).get(with_valid=True)``
    :raises AppApiException: code 500 when none of the lookups finds a file
    """
    self.is_valid(raise_exception=True)
    document_id = self.data.get('document_id')
    knowledge_id = self.data.get('knowledge_id')
    # Scope the document lookup to the knowledge base taken from the request.
    document = QuerySet(Document).filter(id=document_id, knowledge_id=knowledge_id).first()
    source_file_id = document.meta.get('source_file_id') if document and document.meta else None
    # No unscoped File lookup up front: its result was always overwritten below,
    # so it only cost an extra query and ignored source_type.
    file_query_set = QuerySet(File).filter(source_type=FileSourceType.DOCUMENT, source_id=document_id)
    file = file_query_set.filter(id=source_file_id).first() if source_file_id else None
    if not file:
        file = file_query_set.first()
    if not file and source_file_id:
        file = QuerySet(File).filter(
            id=source_file_id,
            source_type=FileSourceType.KNOWLEDGE,
            source_id=knowledge_id
        ).first()
    if not file:
        raise AppApiException(500, _('File not exist. Only manually uploaded documents are supported'))
    return FileSerializer.Operate(data={'id': file.id}).get(with_valid=True)
@ -620,7 +650,10 @@ class DocumentSerializers(serializers.Serializer):
def one(self, with_valid=False):
self.is_valid(raise_exception=True)
query_set = QuerySet(model=Document)
query_set = query_set.filter(**{'id': self.data.get("document_id")})
query_set = query_set.filter(**{
'id': self.data.get("document_id"),
'knowledge_id': self.data.get("knowledge_id")
})
return native_search({
'document_custom_sql': query_set,
'order_by_query': QuerySet(Document).order_by('-create_time', 'id')
@ -1034,6 +1067,26 @@ class DocumentSerializers(serializers.Serializer):
file.source_id = self.data.get('knowledge_id')
file.save(file_bytes)
def normalize_split_result(self, result, file, split_handle, file_id):
    """
    Normalize the output of a split handler into a list of document dicts.

    - A non-list ``result`` is wrapped in a list; an empty list becomes one
      placeholder item with empty content.
    - ``None`` items are replaced by ``{'content': []}``.
    - Items with empty content are filled from ``split_handle.get_content`` when
      the handler provides that method.
    - Every item is tagged with ``source_file_id``.

    :param result: value returned by ``split_handle.handle`` (dict, list of dicts, or None)
    :param file: uploaded file object; rewound with ``seek(0)`` before the fallback read
    :param split_handle: handler that produced ``result``
    :param file_id: id stored under ``source_file_id`` on every item
    :return: list of document dicts (the incoming list is updated in place when non-empty)
    """
    result_list = result if isinstance(result, list) else [result]
    if len(result_list) == 0:
        result_list = [{'content': []}]

    fallback_loaded = False
    fallback_content = None

    def load_fallback_content():
        # The raw content depends only on the file, so read it at most once no
        # matter how many items are empty. This avoids re-parsing the file,
        # re-invoking the save_image callback and repeating the same warning.
        nonlocal fallback_loaded, fallback_content
        if not fallback_loaded:
            fallback_loaded = True
            try:
                file.seek(0)
                fallback_content = split_handle.get_content(file, self.save_image)
            except Exception as e:
                # Best effort: a failed fallback leaves the item content empty.
                maxkb_logger.warning(f'Fallback split content failed: {e}')
        return fallback_content

    can_fallback = hasattr(split_handle, 'get_content')
    for index, item in enumerate(result_list):
        if item is None:
            item = {'content': []}
            result_list[index] = item
        if not item.get('content') and can_fallback:
            raw_content = load_fallback_content()
            if raw_content:
                item['content'] = [{'title': '', 'content': raw_content}]
        item['source_file_id'] = file_id
    return result_list
def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int):
# 保存源文件
file_id = uuid.uuid7()
@ -1051,19 +1104,9 @@ class DocumentSerializers(serializers.Serializer):
for split_handle in split_handles:
if split_handle.support(file, get_buffer):
result = split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
if isinstance(result, list):
for item in result:
item['source_file_id'] = file_id
return result
result['source_file_id'] = file_id
return [result]
return self.normalize_split_result(result, file, split_handle, file_id)
result = default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
if isinstance(result, list):
for item in result:
item['source_file_id'] = file_id
return result
result['source_file_id'] = file_id
return [result]
return self.normalize_split_result(result, file, default_split_handle, file_id)
class SplitPattern(serializers.Serializer):
workspace_id = serializers.CharField(required=False, label=_('workspace id'), allow_null=True)
@ -1078,6 +1121,9 @@ class DocumentSerializers(serializers.Serializer):
{'key': '####', 'value': "(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"},
{'key': '#####', 'value': "(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"},
{'key': '######', 'value': "(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"},
{'key': _('chapter'), 'value': '(?<=^)第[一二三四五六七八九十百千万0-9]+[章节篇部分].*|(?<=\\n)第[一二三四五六七八九十百千万0-9]+[章节篇部分].*'},
{'key': _('numbered heading'), 'value': '(?<=^)[0-9]+(\\.[0-9]+)*[、.]\\s*.*|(?<=\\n)[0-9]+(\\.[0-9]+)*[、.]\\s*.*'},
{'key': _('chinese numbered heading'), 'value': '(?<=^)[一二三四五六七八九十]+[、.]\\s*.*|(?<=\\n)[一二三四五六七八九十]+[、.]\\s*.*'},
{'key': '-', 'value': '(?<! )- .*'},
{'key': _('space'), 'value': '(?<! ) (?! )'},
{'key': _('semicolon'), 'value': '(?<!)(?!)'}, {'key': _('comma'), 'value': '(?<!)(?!)'},
@ -1208,6 +1254,44 @@ class DocumentSerializers(serializers.Serializer):
)
return True
def batch_export_zip(self, instance: Dict, with_valid=True):
    """
    Export several documents of one knowledge base as a zip of .xlsx workbooks.

    One workbook is written per document into a temporary directory, which is
    then zipped into the HTTP response body.

    :param instance: request payload; must carry ``document_id_list``
    :param with_valid: validate the serializer data and the payload before exporting
    :return: ``HttpResponse`` with content type ``application/zip``
    :raises AppApiException: code 500 when a requested id is not a document of
        this knowledge base
    """
    if with_valid:
        self.is_valid(raise_exception=True)
        BatchDocumentExportSerializer(data=instance).is_valid(raise_exception=True)
    knowledge_id = self.data.get('knowledge_id')
    document_id_list = instance.get('document_id_list')
    document_list = list(QuerySet(Document).filter(knowledge_id=knowledge_id, id__in=document_id_list))
    # Compare against the de-duplicated request so repeated ids are not reported as missing.
    if len(document_list) != len({str(document_id) for document_id in document_id_list}):
        raise AppApiException(500, _('Document id does not exist'))

    response = HttpResponse(content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename="documents.zip"'
    zip_buffer = io.BytesIO()
    paragraph_sql = get_file_content(
        os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql'))
    problem_mapping_sql = get_file_content(
        os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql'))
    with TemporaryDirectory() as tempdir:
        # Names already written, casefolded so that 'Report' and 'report' do not
        # overwrite each other on case-insensitive filesystems.
        used_names = set()
        for document in document_list:
            paragraph_list = native_search(
                QuerySet(Paragraph).filter(knowledge_id=knowledge_id, document_id=document.id),
                paragraph_sql)
            problem_mapping_list = native_search(
                QuerySet(ProblemParagraphMapping).filter(knowledge_id=knowledge_id, document_id=document.id),
                problem_mapping_sql,
                with_table_name=True)
            data_dict, document_dict = DocumentSerializers.Operate.merge_problem(
                paragraph_list, problem_mapping_list, [document]
            )
            workbook = DocumentSerializers.Operate.get_workbook(data_dict, document_dict)
            # Replace characters that are invalid in file names on common filesystems.
            safe_name = re.sub(r'[\\/:*?"<>|]', '_', document.name or str(document.id)).strip() or str(document.id)
            # Probe until the name is unused: a plain per-name counter would give the
            # second 'a' the name 'a_2' and clash with a document really named 'a_2'.
            file_name = safe_name
            suffix = 1
            while file_name.casefold() in used_names:
                suffix += 1
                file_name = f'{safe_name}_{suffix}'
            used_names.add(file_name.casefold())
            workbook.save(os.path.join(tempdir, f'{file_name}.xlsx'))
        # Zip while the temporary directory still exists.
        zip_dir(tempdir, zip_buffer)
    response.write(zip_buffer.getvalue())
    return response
@transaction.atomic
def batch_delete(self, instance: Dict, with_valid=True):
if with_valid:
@ -1539,18 +1623,28 @@ class DocumentSerializers(serializers.Serializer):
def replace(self):
self.is_valid(raise_exception=True)
file = self.data.get('file')
source_file = QuerySet(File).filter(source_id=self.data.get('document_id')).first()
document = QuerySet(Document).filter(
id=self.data.get('document_id'),
knowledge_id=self.data.get('knowledge_id')
).first()
source_file = QuerySet(File).filter(
source_type=FileSourceType.DOCUMENT,
source_id=self.data.get('document_id')
).first()
file_content = file.read()
if not source_file:
new_source_file_id = uuid.uuid7()
# 不存在手动关联一个文档
new_source_file_id = uuid.uuid7()
new_source_file = File(
id=new_source_file_id,
file_name=file.name,
file_size=file.size,
source_type=FileSourceType.DOCUMENT,
source_id=self.data.get('document_id'),
)
new_source_file.save(file.read())
new_source_file.save(file_content)
# 更新Document的meta字段
QuerySet(Document).filter(id=self.data.get('document_id')).update(
meta=Func(
@ -1563,21 +1657,44 @@ class DocumentSerializers(serializers.Serializer):
)
)
else:
# 获取原文件的sha256_hash
original_hash = source_file.sha256_hash
source_file.file_name = file.name
source_file.file_size = file.size
source_file.save(file_content)
# 读取新文件内容
file_content = file.read()
file.seek(0)
parsed_documents = DocumentSerializers.Split(
data={'workspace_id': self.data.get('workspace_id'), 'knowledge_id': self.data.get('knowledge_id')}
).file_to_paragraph(file, None, None, 4096)
paragraphs = []
for parsed_document in parsed_documents:
paragraphs.extend(parsed_document.get('content', []))
# 查找所有具有相同sha256_hash的文件
files_to_update = QuerySet(File).filter(
sha256_hash=original_hash,
source_id__in=[self.data.get('knowledge_id'), self.data.get('document_id')]
)
paragraph_ids = QuerySet(Paragraph).filter(document_id=document.id).values_list("id", flat=True)
delete_problems_and_mappings(paragraph_ids)
QuerySet(Paragraph).filter(document_id=document.id).delete()
delete_embedding_by_document(document.id)
# 更新所有相同hash的文件
for file_obj in files_to_update:
file_obj.save(file_content)
document.char_length = reduce(lambda x, y: x + y, [len(p.get('content', '')) for p in paragraphs], 0)
document.name = file.name[:150]
document.save()
document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs)
paragraph_model_list = document_paragraph_model.get('paragraph_model_list')
problem_paragraph_object_list = document_paragraph_model.get('problem_paragraph_object_list')
problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage(
problem_paragraph_object_list, document.knowledge_id
).to_problem_model_list()
if len(paragraph_model_list) > 0:
for i, paragraph in enumerate(paragraph_model_list):
paragraph.position = i + 1
QuerySet(Paragraph).bulk_create(paragraph_model_list)
bulk_create_in_batches(Problem, problem_model_list, batch_size=1000)
bulk_create_in_batches(ProblemParagraphMapping, problem_paragraph_mapping_list, batch_size=1000)
DocumentSerializers.Operate(data={
'workspace_id': self.data.get('workspace_id'),
'knowledge_id': self.data.get('knowledge_id'),
'document_id': self.data.get('document_id')
}).refresh(with_valid=True)
return True

View File

@ -31,6 +31,7 @@ urlpatterns = [
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_sync', views.DocumentView.BatchSync.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_delete', views.DocumentView.BatchDelete.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_refresh', views.DocumentView.BatchRefresh.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_export_zip', views.DocumentView.BatchExportZip.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_generate_related', views.DocumentView.BatchGenerateRelated.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/web', views.WebDocumentView.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/qa', views.QaDocumentView.as_view()),

View File

@ -665,6 +665,29 @@ class DocumentView(APIView):
'workspace_id': workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id
}).export_zip()
class BatchExportZip(APIView):
    """POST endpoint that exports the documents listed in the request body as one zip."""
    authentication_classes = [TokenAuth]

    @has_permissions(
        PermissionConstants.KNOWLEDGE_DOCUMENT_EXPORT.get_workspace_knowledge_permission(),
        PermissionConstants.KNOWLEDGE_DOCUMENT_EXPORT.get_workspace_permission_workspace_manage_role(),
        RoleConstants.WORKSPACE_MANAGE.get_workspace_role(),
        ViewPermission(
            [RoleConstants.USER.get_workspace_role()],
            [PermissionConstants.KNOWLEDGE.get_workspace_knowledge_permission()],
            CompareConstants.AND,
        ),
    )
    @log(
        menu='document',
        operate="Batch export documents",
        get_operation_object=lambda r, keywords: get_knowledge_document_operation_object(
            get_knowledge_operation_object(keywords.get('knowledge_id')),
            get_document_operation_object_batch(r.data.get('document_id_list'))
        ),
    )
    def post(self, request: Request, workspace_id: str, knowledge_id: str):
        # Path parameters select the knowledge base; the request body names the documents.
        batch_serializer = DocumentSerializers.Batch(data={
            'workspace_id': workspace_id,
            'knowledge_id': knowledge_id
        })
        return batch_serializer.batch_export_zip(request.data)
class DownloadSourceFile(APIView):
authentication_classes = [TokenAuth]

View File

@ -1,5 +1,5 @@
import { Result } from '@/request/Result'
import { del, exportExcel, exportFile, get, post, put } from '@/request/index'
import { del, exportExcel, exportExcelPost, exportFile, get, post, put } from '@/request/index'
import type { Ref } from 'vue'
import type { KeyValue, pageRequest } from '@/api/type/common'
@ -195,6 +195,21 @@ const exportDocumentZip: (
)
}
/**
 * Export several documents of a knowledge base as a single zip download.
 *
 * @param document_name base name of the download; trimmed and suffixed with `.zip`
 * @param knowledge_id knowledge base id used in the request path
 * @param document_id_list ids of the documents to export
 * @param loading optional loading ref forwarded to the request helper
 */
const exportMulDocumentZip: (
  document_name: string,
  knowledge_id: string,
  document_id_list: string[],
  loading?: Ref<boolean>,
) => Promise<any> = (document_name, knowledge_id, document_id_list, loading) => {
  const fileName = `${document_name.trim()}.zip`
  const url = `${prefix.value}/${knowledge_id}/document/batch_export_zip`
  // Fourth argument of exportExcelPost — presumably the POST body; confirm against the helper.
  const payload = { document_id_list }
  return exportExcelPost(fileName, url, {}, payload, loading)
}
/**
*
* @param
@ -619,6 +634,7 @@ export default {
postReplaceSourceFile,
exportDocument,
exportDocumentZip,
exportMulDocumentZip,
putDocumentRefresh,
putDocumentSync,
putMulDocument,

View File

@ -128,7 +128,7 @@
@close="closeSingleSelectDialog"
width="500px"
>
<el-radio-group v-model="authAllChildren" class="radio-block">
<el-radio-group v-if="isFolder" v-model="authAllChildren" class="radio-block">
<el-radio :value="false">
<p class="color-text-primary lighter">
{{ $t('views.system.resourceAuthorization.setting.currentOnly') }}
@ -140,6 +140,9 @@
</p>
</el-radio>
</el-radio-group>
<p v-else class="color-text-primary lighter">
{{ $t('common.confirm') }}
</p>
<template #footer>
<div class="dialog-footer mt-24">
<el-button @click="closeSingleSelectDialog">{{ $t('common.cancel') }}</el-button>
@ -415,18 +418,8 @@ function closeDialog() {
}
function permissionsHandle(val: any, row: any) {
if (props.isFolder) {
singleSelectDialogVisible.value = true
pendingPermissionChange.value = { val, row }
return
}
const obj = [
{
user_id: row.id,
permission: val,
},
]
submitPermissions(obj)
}
function submitPermissions(obj: any) {

View File

@ -77,6 +77,14 @@
v-if="permissionPrecise.doc_tag(id)"
>{{ $t('views.document.tag.addTag') }}
</el-dropdown-item>
<el-dropdown-item
divided
@click="exportMulDocumentZip"
:disabled="multipleSelection.length === 0"
v-if="permissionPrecise.doc_export(id)"
>
{{ $t('views.document.setting.export') }} Zip
</el-dropdown-item>
<el-dropdown-item
divided
@click="syncMulDocument"
@ -783,6 +791,7 @@ const MoreFilledPermission0 = (id: string) => {
permissionPrecise.value.doc_migrate(id) ||
(knowledgeDetail?.value.type === 1 && permissionPrecise.value.doc_sync(id)) ||
(knowledgeDetail?.value.type === 2 && permissionPrecise.value.doc_sync(id)) ||
permissionPrecise.value.doc_export(id) ||
permissionPrecise.value.doc_delete(id) || permissionPrecise.value.doc_tag(id)
)
}
@ -1112,6 +1121,16 @@ function deleteMulDocument() {
.catch(() => {})
}
/**
 * Export every selected document as one zip, named after the knowledge base
 * (or `documents` when the name is unavailable), then clear the selection.
 */
function exportMulDocumentZip() {
  const arr: string[] = multipleSelection.value.map((v) => v.id)
  // Mirrors the menu item's disabled state: nothing selected, nothing to export.
  if (arr.length === 0) {
    return
  }
  loadSharedApi({ type: 'document', systemType: apiType.value })
    .exportMulDocumentZip(knowledgeDetail.value?.name || 'documents', id, arr, loading)
    .then(() => {
      MsgSuccess(t('common.exportSuccess'))
      multipleTableRef.value?.clearSelection()
    })
    // Same convention as the other batch handlers in this file: avoid an
    // unhandled rejection. NOTE(review): assumes the request layer already
    // reports the failure to the user — confirm.
    .catch(() => {})
}
function batchRefresh() {
const arr: string[] = multipleSelection.value.map((v) => v.id)
const embeddingBatchDocument = (stateList: Array<string>) => {