UnisMindMap/mineru/utils/char_utils.py

55 lines
1.7 KiB
Python

# Copyright (c) Opendatalab. All rights reserved.
import re
def is_hyphen_at_line_end(line):
    """Tell whether a line finishes with a hyphenated word fragment.

    Args:
        line (str): The line of text to inspect.

    Returns:
        bool: True when the tail of the line is a run of ASCII letters,
            then a hyphen, then optional trailing whitespace; False
            otherwise.
    """
    # Anchor at the end of the line: letters, a hyphen, then only whitespace.
    trailing_hyphen = re.search(r'[A-Za-z]+-\s*$', line)
    return trailing_hyphen is not None
def full_to_half_exclude_marks(text: str) -> str:
    """Convert full-width letters and digits to half-width, leaving marks alone.

    Only full-width A-Z (U+FF21-U+FF3A), a-z (U+FF41-U+FF5A) and 0-9
    (U+FF10-U+FF19) are converted. Every other character, including
    full-width punctuation, is passed through unchanged.

    Args:
        text: String that may contain full-width characters.

    Returns:
        String with full-width letters and digits replaced by their ASCII
        counterparts.
    """

    def _narrow(ch: str) -> str:
        # Map one character to its half-width form when it is a letter or digit.
        cp = ord(ch)
        is_upper = 0xFF21 <= cp <= 0xFF3A
        is_lower = 0xFF41 <= cp <= 0xFF5A
        is_digit = 0xFF10 <= cp <= 0xFF19
        if is_upper or is_lower or is_digit:
            # Subtracting 0xFEE0 shifts the code point into the ASCII range.
            return chr(cp - 0xFEE0)
        return ch

    return ''.join(map(_narrow, text))
def full_to_half(text: str) -> str:
    """Convert full-width letters, digits and punctuation to half-width.

    Every character in the range U+FF01-U+FF5E is shifted down by 0xFEE0,
    which lands it on the matching ASCII character ('!' through '~').
    Characters outside that range are returned unchanged.

    Args:
        text: String that may contain full-width characters.

    Returns:
        String with the full-width characters in U+FF01-U+FF5E replaced by
        their ASCII counterparts.
    """
    ascii_shift = 0xFEE0  # Distance between a full-width form and its ASCII twin.
    return ''.join(
        chr(ord(ch) - ascii_shift) if 0xFF01 <= ord(ch) <= 0xFF5E else ch
        for ch in text
    )