From a9443a638c8c421930df9a6559824abc763aa7a8 Mon Sep 17 00:00:00 2001
From: shaohuzhang1 <shaohu.zhang@fit2cloud.com>
Date: Tue, 27 Aug 2024 14:14:51 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=B8=8A=E4=BC=A0?=
 =?UTF-8?q?=E6=96=87=E6=A1=A3=E4=B8=AD=E5=90=8E=E7=BC=80=E4=B8=BAPDF=20?=
 =?UTF-8?q?=E4=B8=8D=E8=AF=86=E5=88=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 apps/common/handle/impl/doc_split_handle.py  | 3 ++-
 apps/common/handle/impl/html_split_handle.py | 2 +-
 apps/common/handle/impl/pdf_split_handle.py  | 5 +++--
 apps/common/handle/impl/text_split_handle.py | 3 ++-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py
index 25f5d694a..8cd08d7ee 100644
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -149,6 +149,7 @@ class DocSplitHandle(BaseSplitHandle):
 
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith(".docx") or file_name.endswith(".doc"):
+        # file_name is lower-cased above, so ".DOC"/".DOCX" uploads already match these suffixes.
+        if file_name.endswith((".docx", ".doc")):
             return True
         return False
diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py
index 3116aabfd..878d9edda 100644
--- a/apps/common/handle/impl/html_split_handle.py
+++ b/apps/common/handle/impl/html_split_handle.py
@@ -38,7 +38,7 @@ def get_encoding(buffer):
 class HTMLSplitHandle(BaseSplitHandle):
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith(".html"):
+        if file_name.endswith(".html"):  # name is lower-cased above, so ".HTML" matches too
             return True
         return False
 
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
index ddf6cb2aa..b242292ff 100644
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -74,7 +74,8 @@ class PdfSplitHandle(BaseSplitHandle):
 
                 elapsed_time = time.time() - start_time
                 # todo 实现进度条代替下面的普通输出
-                max_kb.debug(f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s,   content-length: {len(page_content)}")
+                max_kb.debug(
+                    f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s,   content-length: {len(page_content)}")
             if pattern_list is not None and len(pattern_list) > 0:
                 split_model = SplitModel(pattern_list, with_filter, limit)
             else:
@@ -96,6 +97,6 @@ class PdfSplitHandle(BaseSplitHandle):
 
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith(".pdf"):
+        if file_name.endswith(".pdf"):  # name is lower-cased above, so ".PDF" matches too
             return True
         return False
diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
index a773b3bbb..467607ff5 100644
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@@ -26,7 +26,8 @@ class TextSplitHandle(BaseSplitHandle):
     def support(self, file, get_buffer):
         buffer = get_buffer(file)
         file_name: str = file.name.lower()
-        if file_name.endswith(".md") or file_name.endswith('.txt'):
+        # file_name is lower-cased above, so ".MD"/".TXT" uploads already match these suffixes.
+        if file_name.endswith((".md", ".txt")):
             return True
         result = detect(buffer)
         if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \