# UnisMindMap/mineru/utils/draw_bbox.py
#
# 490 lines
# 20 KiB
# Python

import json
from io import BytesIO
from loguru import logger
from pypdf import PdfReader, PdfWriter, PageObject
from reportlab.pdfgen import canvas
from .enum_class import BlockType, ContentType, SplitFlag
def cal_canvas_rect(page, bbox):
    """
    Translate a bounding box into the rectangle to draw on the overlay canvas.

    The page's ``/Rotate`` entry (normalised modulo 360) selects the mapping.
    Any value other than 90, 180 or 270 is treated like an unrotated page.

    Args:
        page: Page object (the file imports pypdf) exposing ``cropbox`` and a
            dict-style ``get`` for the ``/Rotate`` entry.
        bbox: [x0, y0, x1, y1] bounding box coordinates.

    Returns:
        list: [x, y, width, height] of the rectangle on the canvas.
    """
    crop_w = float(page.cropbox[2])
    crop_h = float(page.cropbox[3])

    rotation_obj = page.get("/Rotate", 0)
    try:
        # int() is applied so that non-plain-int values (e.g. IndirectObject) are handled
        rotation = int(rotation_obj) % 360
    except (ValueError, TypeError) as e:
        logger.warning(f"Invalid /Rotate value {rotation_obj!r} on page; defaulting to 0. Error: {e}")
        rotation = 0

    left, top, right, bottom = bbox
    width = abs(right - left)
    height = abs(bottom - top)

    if rotation == 90:
        # Axes are exchanged: the box's y becomes the canvas x and vice versa.
        return [top, left, height, width]
    if rotation == 180:
        # Mirrored horizontally; the vertical coordinate is used unchanged.
        return [crop_w - right, top, width, height]
    if rotation == 270:
        # Axes are exchanged and both measured from the far edges.
        return [crop_w - bottom, crop_h - right, height, width]

    # Unrotated page: only the y axis is flipped (canvas origin is bottom-left).
    return [left, crop_h - bottom, width, height]
def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
    """
    Draw the bounding boxes of page ``i`` onto canvas ``c`` without any labels.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page collections of [x0, y0, x1, y1] boxes.
        page: Page object forwarded to ``cal_canvas_rect``.
        c: Canvas providing ``setFillColorRGB``, ``setStrokeColorRGB`` and ``rect``.
        rgb_config: Colour components on a 0-255 scale (the first three are used).
        fill_config: Truthy draws filled rectangles with alpha 0.3; falsy
            draws outlines only.

    Returns:
        The canvas ``c`` that was passed in.
    """
    # The canvas colour setters are fed components on a 0-1 scale.
    scaled = [float(channel) / 255 for channel in rgb_config]

    for box in bbox_list[i]:
        x, y, w, h = cal_canvas_rect(page, box)
        if not fill_config:
            # Outline only.
            c.setStrokeColorRGB(scaled[0], scaled[1], scaled[2])
            c.rect(x, y, w, h, stroke=1, fill=0)
            continue
        # Translucent filled rectangle, no border.
        c.setFillColorRGB(scaled[0], scaled[1], scaled[2], 0.3)
        c.rect(x, y, w, h, stroke=0, fill=1)

    return c
def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
    """
    Draw the boxes of page ``i`` and label each with its 1-based position.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page iterables of [x0, y0, x1, y1] boxes; the order of
            the boxes determines the numbers written next to them.
        page: Page object; its ``/Rotate`` entry is read here and it is
            forwarded to ``cal_canvas_rect``.
        c: Canvas to draw on.
        rgb_config: Three colour components on a 0-255 scale.
        fill_config: Truthy draws filled rectangles with alpha 0.3; falsy
            draws outlines. Only relevant when ``draw_bbox`` is true.
        draw_bbox: When false, only the numbers are drawn.

    Returns:
        The canvas ``c`` that was passed in.
    """
    new_rgb = [float(color) / 255 for color in rgb_config]
    page_data = bbox_list[i]

    # /Rotate belongs to the page, not to a box: resolve it once so an invalid
    # value is reported a single time instead of once per bounding box.
    rotation_obj = page.get("/Rotate", 0)
    try:
        rotation = int(rotation_obj) % 360  # cast to int to handle IndirectObject
    except (ValueError, TypeError):
        logger.warning(f"Invalid /Rotate value: {rotation_obj!r}, defaulting to 0")
        rotation = 0

    for j, bbox in enumerate(page_data):
        x, y, w, h = cal_canvas_rect(page, bbox)

        if draw_bbox:
            if fill_config:
                c.setFillColorRGB(*new_rgb, 0.3)
                c.rect(x, y, w, h, stroke=0, fill=1)
            else:
                c.setStrokeColorRGB(*new_rgb)
                c.rect(x, y, w, h, stroke=1, fill=0)

        # The label is drawn fully opaque in the same colour.
        c.setFillColorRGB(*new_rgb, 1.0)
        c.setFontSize(size=10)

        # Move the origin next to the box and rotate the text with the page;
        # saveState/restoreState keeps the transform local to this label.
        c.saveState()
        if rotation == 0:
            c.translate(x + w + 2, y + h - 10)
        elif rotation == 90:
            c.translate(x + 10, y + h + 2)
        elif rotation == 180:
            c.translate(x - 2, y + 10)
        elif rotation == 270:
            c.translate(x + w - 10, y - 2)
        c.rotate(rotation)
        c.drawString(0, 0, str(j + 1))
        c.restoreState()

    return c
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """
    Write a copy of the PDF with colour-coded layout blocks and order numbers.

    For every page the block bounding boxes found in ``pdf_info`` are grouped
    by category and painted as overlays, each category in its own colour.
    The para blocks are then numbered in the order they appear.

    Args:
        pdf_info: Re-iterable sequence of per-page dicts; the keys
            ``'discarded_blocks'`` and ``'para_blocks'`` are read.
        pdf_bytes: Raw bytes of the source PDF.
        out_path: Directory part of the output location.
        filename: File name part; the result is written to
            ``f"{out_path}/{filename}"``.
    """
    # (category, RGB colour, filled?) in the exact order the overlays are painted.
    paint_plan = [
        ("code_body", [102, 0, 204], True),
        ("code_caption", [204, 153, 255], True),
        ("dropped", [158, 158, 158], True),
        ("table_body", [204, 204, 0], True),
        ("table_caption", [255, 255, 102], True),
        ("table_footnote", [229, 255, 204], True),
        ("image_body", [153, 255, 51], True),
        ("image_caption", [102, 178, 255], True),
        ("image_footnote", [255, 178, 102], True),
        ("title", [102, 102, 255], True),
        ("text", [153, 0, 76], True),
        ("interline_equation", [0, 255, 0], True),
        ("list", [40, 169, 92], True),
        ("list_item", [40, 169, 92], False),
        ("index", [40, 169, 92], True),
    ]
    # category -> list with one entry (a list of bboxes) per page
    boxes_by_category = {category: [] for category, _, _ in paint_plan}

    # Pass 1: sort every block's bbox into its category, page by page.
    for page_info in pdf_info:
        found = {category: [] for category in boxes_by_category}

        for dropped in page_info['discarded_blocks']:
            found["dropped"].append(dropped['bbox'])

        for block in page_info["para_blocks"]:
            block_bbox = block["bbox"]
            block_type = block["type"]

            if block_type == BlockType.TABLE:
                for child in block["blocks"]:
                    child_bbox = child["bbox"]
                    child_type = child["type"]
                    if child_type == BlockType.TABLE_BODY:
                        found["table_body"].append(child_bbox)
                    elif child_type == BlockType.TABLE_CAPTION:
                        found["table_caption"].append(child_bbox)
                    elif child_type == BlockType.TABLE_FOOTNOTE:
                        # Footnotes flagged as cross-page are not drawn here.
                        if not child.get(SplitFlag.CROSS_PAGE, False):
                            found["table_footnote"].append(child_bbox)
            elif block_type == BlockType.IMAGE:
                for child in block["blocks"]:
                    child_bbox = child["bbox"]
                    child_type = child["type"]
                    if child_type == BlockType.IMAGE_BODY:
                        found["image_body"].append(child_bbox)
                    elif child_type == BlockType.IMAGE_CAPTION:
                        found["image_caption"].append(child_bbox)
                    elif child_type == BlockType.IMAGE_FOOTNOTE:
                        found["image_footnote"].append(child_bbox)
            elif block_type == BlockType.CODE:
                for child in block["blocks"]:
                    child_type = child["type"]
                    if child_type == BlockType.CODE_BODY:
                        found["code_body"].append(child["bbox"])
                    elif child_type == BlockType.CODE_CAPTION:
                        found["code_caption"].append(child["bbox"])
            elif block_type == BlockType.TITLE:
                found["title"].append(block_bbox)
            elif block_type in (BlockType.TEXT, BlockType.REF_TEXT):
                found["text"].append(block_bbox)
            elif block_type == BlockType.INTERLINE_EQUATION:
                found["interline_equation"].append(block_bbox)
            elif block_type == BlockType.LIST:
                found["list"].append(block_bbox)
                # List items are outlined individually on top of the list fill.
                if "blocks" in block:
                    for item in block["blocks"]:
                        found["list_item"].append(item["bbox"])
            elif block_type == BlockType.INDEX:
                found["index"].append(block_bbox)

        for category, page_boxes in found.items():
            boxes_by_category[category].append(page_boxes)

    # Pass 2: flatten the para blocks of each page into the numbering order.
    layout_bbox_list = []
    table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
    single_box_types = [
        BlockType.TEXT,
        BlockType.REF_TEXT,
        BlockType.TITLE,
        BlockType.INTERLINE_EQUATION,
        BlockType.LIST,
        BlockType.INDEX,
    ]
    for page_info in pdf_info:
        numbered = []
        for block in page_info["para_blocks"]:
            block_type = block["type"]
            if block_type in single_box_types:
                numbered.append(block["bbox"])
            elif block_type == BlockType.IMAGE:
                for child in block["blocks"]:
                    numbered.append(child["bbox"])
            elif block_type == BlockType.TABLE:
                # Within a table the parts are numbered caption, body, footnote.
                children = list(block["blocks"])
                children.sort(key=lambda child: table_type_order[child["type"]])
                for child in children:
                    if not child.get(SplitFlag.CROSS_PAGE, False):
                        numbered.append(child["bbox"])
            elif block_type == BlockType.CODE:
                for child in block["blocks"]:
                    numbered.append(child["bbox"])
        layout_bbox_list.append(numbered)

    # Pass 3: render one overlay per page and merge it onto the source page.
    reader = PdfReader(BytesIO(pdf_bytes))
    writer = PdfWriter()
    for page_no, pdf_page in enumerate(reader.pages):
        # The overlay canvas uses the size taken from the page's cropbox.
        canvas_size = (float(pdf_page.cropbox[2]), float(pdf_page.cropbox[3]))
        buffer = BytesIO()
        overlay = canvas.Canvas(buffer, pagesize=canvas_size)

        for category, colour, filled in paint_plan:
            overlay = draw_bbox_without_number(
                page_no, boxes_by_category[category], pdf_page, overlay, colour, filled
            )
        overlay = draw_bbox_with_number(
            page_no, layout_bbox_list, pdf_page, overlay, [255, 0, 0], False, draw_bbox=False
        )
        overlay.save()
        buffer.seek(0)

        overlay_pages = PdfReader(buffer).pages
        if len(overlay_pages) > 0:
            # Merge onto a copy of the page object rather than the reader's page.
            merged = PageObject(pdf=None)
            merged.update(pdf_page)
            merged.merge_page(overlay_pages[0])
            pdf_page = merged
        # If no overlay page was produced, the source page is added unchanged.
        writer.add_page(pdf_page)

    # Save the result.
    with open(f"{out_path}/{filename}", "wb") as f:
        writer.write(f)
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """
    Write a copy of the PDF with every span outlined, coloured by span type.

    Spans are taken from each page's ``'preproc_blocks'`` (the blocks before
    paragraph merging), plus the spans of discarded blocks.

    Args:
        pdf_info: Sequence of per-page dicts; the keys ``'discarded_blocks'``
            and ``'preproc_blocks'`` are read.
        pdf_bytes: Raw bytes of the source PDF.
        out_path: Directory part of the output location.
        filename: File name part; the result is written to
            ``f"{out_path}/{filename}"``.
    """
    # Span kinds in the order they are matched and drawn, with their outline colours.
    span_kinds = [
        ContentType.TEXT,
        ContentType.INLINE_EQUATION,
        ContentType.INTERLINE_EQUATION,
        ContentType.IMAGE,
        ContentType.TABLE,
    ]
    kind_colours = [
        [255, 0, 0],
        [0, 255, 0],
        [0, 0, 255],
        [255, 204, 0],
        [204, 0, 255],
    ]
    # One list per span kind, each holding one bbox list per page.
    pages_per_kind = [[] for _ in span_kinds]
    dropped_list = []

    # Blocks whose lines sit directly on the block vs. on nested sub-blocks.
    line_block_types = [
        BlockType.TEXT,
        BlockType.TITLE,
        BlockType.INTERLINE_EQUATION,
        BlockType.LIST,
        BlockType.INDEX,
    ]
    nested_block_types = [BlockType.IMAGE, BlockType.TABLE]

    def route_span(span, buckets):
        # Append the span's bbox to the bucket of the first matching kind.
        for kind, bucket in zip(span_kinds, buckets):
            if span['type'] == kind:
                bucket.append(span['bbox'])
                break

    for page_info in pdf_info:
        page_buckets = [[] for _ in span_kinds]

        # Spans of discarded blocks.
        dropped_list.append([
            span['bbox']
            for block in page_info['discarded_blocks']
            if block['type'] == BlockType.DISCARDED
            for line in block['lines']
            for span in line['spans']
        ])

        # Spans of the remaining blocks, read from the pre-merge blocks.
        for block in page_info['preproc_blocks']:
            if block['type'] in line_block_types:
                line_groups = [block['lines']]
            elif block['type'] in nested_block_types:
                line_groups = (sub_block['lines'] for sub_block in block['blocks'])
            else:
                continue
            for lines in line_groups:
                for line in lines:
                    for span in line['spans']:
                        route_span(span, page_buckets)

        for kind_pages, bucket in zip(pages_per_kind, page_buckets):
            kind_pages.append(bucket)

    reader = PdfReader(BytesIO(pdf_bytes))
    writer = PdfWriter()
    for page_no, pdf_page in enumerate(reader.pages):
        # The overlay canvas uses the size taken from the page's cropbox.
        canvas_size = (float(pdf_page.cropbox[2]), float(pdf_page.cropbox[3]))
        buffer = BytesIO()
        overlay = canvas.Canvas(buffer, pagesize=canvas_size)

        for kind_pages, colour in zip(pages_per_kind, kind_colours):
            draw_bbox_without_number(page_no, kind_pages, pdf_page, overlay, colour, False)
        draw_bbox_without_number(page_no, dropped_list, pdf_page, overlay, [158, 158, 158], False)
        overlay.save()
        buffer.seek(0)

        overlay_pages = PdfReader(buffer).pages
        if len(overlay_pages) > 0:
            # Merge onto a copy of the page object rather than the reader's page.
            merged = PageObject(pdf=None)
            merged.update(pdf_page)
            merged.merge_page(overlay_pages[0])
            pdf_page = merged
        # If no overlay page was produced, the source page is added unchanged.
        writer.add_page(pdf_page)

    # Save the PDF
    with open(f"{out_path}/{filename}", "wb") as f:
        writer.write(f)
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """
    Write a copy of the PDF with each line outlined and numbered by sort index.

    Lines are collected from every page's ``'preproc_blocks'``, ordered by
    their ``'index'`` value, and drawn with their 1-based rank next to them.

    Args:
        pdf_info: Sequence of per-page dicts; the key ``'preproc_blocks'`` is read.
        pdf_bytes: Raw bytes of the source PDF.
        out_path: Directory part of the output location.
        filename: File name part; the result is written to
            ``f"{out_path}/{filename}"``.
    """

    def indexed_entries(lines):
        # (index, bbox) pair for every line in ``lines``.
        return [(line['index'], line['bbox']) for line in lines]

    def virtual_lines_usable(virtual_lines):
        # Virtual lines are used only when present and carrying an index.
        return len(virtual_lines) > 0 and virtual_lines[0].get('index', None) is not None

    body_types = [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]
    caption_footnote_types = [
        BlockType.IMAGE_CAPTION,
        BlockType.TABLE_CAPTION,
        BlockType.IMAGE_FOOTNOTE,
        BlockType.TABLE_FOOTNOTE,
    ]

    layout_bbox_list = []
    for page in pdf_info:
        page_lines = []
        for block in page['preproc_blocks']:
            if block['type'] in [BlockType.TEXT]:
                page_lines.extend(indexed_entries(block['lines']))
            elif block['type'] in [BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
                if 'virtual_lines' in block:
                    # When the key exists but holds no indexed lines, the block
                    # contributes nothing (there is no fallback to 'lines').
                    if virtual_lines_usable(block['virtual_lines']):
                        page_lines.extend(indexed_entries(block['virtual_lines']))
                else:
                    page_lines.extend(indexed_entries(block['lines']))
            elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
                for sub_block in block['blocks']:
                    if sub_block['type'] in body_types:
                        # A missing 'virtual_lines' key is treated like an empty
                        # one, so the body falls back to its real lines instead
                        # of raising KeyError.
                        virtual_lines = sub_block.get('virtual_lines', [])
                        if virtual_lines_usable(virtual_lines):
                            page_lines.extend(indexed_entries(virtual_lines))
                        else:
                            page_lines.extend(indexed_entries(sub_block['lines']))
                    elif sub_block['type'] in caption_footnote_types:
                        page_lines.extend(indexed_entries(sub_block['lines']))

        # Stable sort on the index only, so ties keep their collection order.
        page_lines.sort(key=lambda entry: entry[0])
        # Store a real list (not a one-shot generator) so the per-page data can
        # be indexed, measured and iterated more than once.
        layout_bbox_list.append([bbox for _, bbox in page_lines])

    pdf_bytes_io = BytesIO(pdf_bytes)
    pdf_docs = PdfReader(pdf_bytes_io)
    output_pdf = PdfWriter()

    for i, page in enumerate(pdf_docs.pages):
        # The overlay canvas uses the size taken from the page's cropbox.
        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
        custom_page_size = (page_width, page_height)

        packet = BytesIO()
        c = canvas.Canvas(packet, pagesize=custom_page_size)

        # Draw this page's lines with their rank numbers.
        draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False)

        c.save()
        packet.seek(0)
        overlay_pdf = PdfReader(packet)

        if len(overlay_pdf.pages) > 0:
            # Merge onto a copy of the page object rather than the reader's page.
            new_page = PageObject(pdf=None)
            new_page.update(page)
            page = new_page
            page.merge_page(overlay_pdf.pages[0])
        # If no overlay page was produced, the source page is added unchanged.

        output_pdf.add_page(page)

    # Save the PDF
    with open(f"{out_path}/{filename}", "wb") as f:
        output_pdf.write(f)
if __name__ == "__main__":
    # Demo entry point: overlay the layout boxes on the bundled example PDF.
    pdf_path = "examples/demo1.pdf"
    json_path = "examples/demo1_1746005777.0863056_middle.json"

    # Read the PDF as raw bytes.
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Read the per-page info from the JSON file.
    with open(json_path, "r", encoding="utf-8") as json_file:
        pdf_info = json.load(json_file)["pdf_info"]

    # Run the visualisation; the result is written into the examples directory.
    draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")