From a9443a638c8c421930df9a6559824abc763aa7a8 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Tue, 27 Aug 2024 14:14:51 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=B8=8A=E4=BC=A0?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E4=B8=AD=E5=90=8E=E7=BC=80=E4=B8=BAPDF=20?= =?UTF-8?q?=E4=B8=8D=E8=AF=86=E5=88=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/doc_split_handle.py | 3 ++- apps/common/handle/impl/html_split_handle.py | 2 +- apps/common/handle/impl/pdf_split_handle.py | 5 +++-- apps/common/handle/impl/text_split_handle.py | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index 25f5d694a..8cd08d7ee 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -149,6 +149,7 @@ class DocSplitHandle(BaseSplitHandle): def support(self, file, get_buffer): file_name: str = file.name.lower() - if file_name.endswith(".docx") or file_name.endswith(".doc"): + if file_name.endswith(".docx") or file_name.endswith(".doc") or file_name.endswith( + ".DOC") or file_name.endswith(".DOCX"): return True return False diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py index 3116aabfd..878d9edda 100644 --- a/apps/common/handle/impl/html_split_handle.py +++ b/apps/common/handle/impl/html_split_handle.py @@ -38,7 +38,7 @@ def get_encoding(buffer): class HTMLSplitHandle(BaseSplitHandle): def support(self, file, get_buffer): file_name: str = file.name.lower() - if file_name.endswith(".html"): + if file_name.endswith(".html") or file_name.endswith(".HTML"): return True return False diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index ddf6cb2aa..b242292ff 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -74,7 +74,8 @@ class PdfSplitHandle(BaseSplitHandle): elapsed_time = time.time() - start_time # todo 实现进度条代替下面的普通输出 - max_kb.debug(f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}") + max_kb.debug( + f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}") if pattern_list is not None and len(pattern_list) > 0: split_model = SplitModel(pattern_list, with_filter, limit) else: @@ -96,6 +97,6 @@ class PdfSplitHandle(BaseSplitHandle): def support(self, file, get_buffer): file_name: str = file.name.lower() - if file_name.endswith(".pdf"): + if file_name.endswith(".pdf") or file_name.endswith(".PDF"): return True return False diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py index a773b3bbb..467607ff5 100644 --- a/apps/common/handle/impl/text_split_handle.py +++ b/apps/common/handle/impl/text_split_handle.py @@ -26,7 +26,8 @@ class TextSplitHandle(BaseSplitHandle): def support(self, file, get_buffer): buffer = get_buffer(file) file_name: str = file.name.lower() - if file_name.endswith(".md") or file_name.endswith('.txt'): + if file_name.endswith(".md") or file_name.endswith('.txt') or file_name.endswith('.TXT') or file_name.endswith( + '.MD'): return True result = detect(buffer) if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \