Note: this patch was re-flowed from a whitespace-collapsed copy. Leading
indentation inside hunks is reconstructed (assumed 4-space nesting) and should
be confirmed against the tree. The upper-case suffix checks of the original
patch were dropped: file_name is lower-cased before the comparison, so they
could never match. The post-image ids on the `index` lines are carried over
from the original patch and are stale for the edited files; regenerate the
patch with `git diff` before relying on them (e.g. for --3way).

diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py
index 25f5d694a..8cd08d7ee 100644
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -149,6 +149,5 @@ class DocSplitHandle(BaseSplitHandle):
 
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith(".docx") or file_name.endswith(".doc"):
-            return True
-        return False
+        # file_name is already lower-cased, so this also accepts .DOC / .DOCX.
+        return file_name.endswith((".docx", ".doc"))
diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py
index 3116aabfd..878d9edda 100644
--- a/apps/common/handle/impl/html_split_handle.py
+++ b/apps/common/handle/impl/html_split_handle.py
@@ -38,7 +38,6 @@ def get_encoding(buffer):
 class HTMLSplitHandle(BaseSplitHandle):
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith(".html"):
-            return True
-        return False
+        # file_name is already lower-cased, so this also accepts .HTML.
+        return file_name.endswith(".html")
 
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
index ddf6cb2aa..b242292ff 100644
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -74,7 +74,8 @@ class PdfSplitHandle(BaseSplitHandle):
 
                 elapsed_time = time.time() - start_time
                 # todo 实现进度条代替下面的普通输出
-                max_kb.debug(f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
+                max_kb.debug(
+                    f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")
             if pattern_list is not None and len(pattern_list) > 0:
                 split_model = SplitModel(pattern_list, with_filter, limit)
             else:
@@ -96,6 +97,5 @@ class PdfSplitHandle(BaseSplitHandle):
 
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
-        if file_name.endswith(".pdf"):
-            return True
-        return False
+        # file_name is already lower-cased, so this also accepts .PDF.
+        return file_name.endswith(".pdf")
diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
index a773b3bbb..467607ff5 100644
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@@ -26,7 +26,8 @@ class TextSplitHandle(BaseSplitHandle):
     def support(self, file, get_buffer):
         buffer = get_buffer(file)
         file_name: str = file.name.lower()
-        if file_name.endswith(".md") or file_name.endswith('.txt'):
+        # file_name is already lower-cased, so this also accepts .MD / .TXT.
+        if file_name.endswith((".md", ".txt")):
             return True
         result = detect(buffer)
         if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \