UnisMindMap/mineru/utils/pdf_text_tool.py

40 lines
1.3 KiB
Python

from typing import List
import math
import pypdfium2 as pdfium
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
def get_page(
page: pdfium.PdfPage,
quote_loosebox: bool =True,
superscript_height_threshold: float = 0.7,
line_distance_threshold: float = 0.1,
) -> dict:
textpage = page.get_textpage()
page_bbox: List[float] = page.get_bbox()
page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
page_rotation = 0
try:
page_rotation = page.get_rotation()
except:
pass
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
lines = get_lines(spans)
assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
blocks = get_blocks(lines)
page = {
"bbox": page_bbox,
"width": page_width,
"height": page_height,
"rotation": page_rotation,
"blocks": blocks
}
return page