UnisKB/apps/common/handle/impl/text/html_split_handle.py

82 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# coding=utf-8
"""
@project: maxkb
@Author
@file html_split_handle.py
@date2024/5/23 10:58
@desc:
"""
import re
import traceback
from typing import List
from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
def get_encoding(buffer):
beautiful_soup = BeautifulSoup(buffer, "html.parser")
meta_list = beautiful_soup.find_all('meta')
charset_list = [meta.attrs.get('charset') for meta in meta_list if
meta.attrs is not None and 'charset' in meta.attrs]
if len(charset_list) > 0:
charset = charset_list[0]
return charset
return detect(buffer)['encoding']
class HTMLSplitHandle(BaseSplitHandle):
def support(self, file, get_buffer):
file_name: str = file.name.lower()
if file_name.endswith(".html") or file_name.endswith(".HTML"):
return True
return False
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
if type(limit) is str:
limit = int(limit)
if type(with_filter) is str:
with_filter = with_filter.lower() == 'true'
if pattern_list is not None and len(pattern_list) > 0:
split_model = SplitModel(pattern_list, with_filter, limit)
else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
content = html2text(content)
except BaseException as e:
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
return {
'name': file.name, 'content': []
}
return {
'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file, save_image):
buffer = file.read()
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
return html2text(content)
except BaseException as e:
traceback.print_exception(e)
return f'{e}'