126 lines
5.1 KiB
Python
126 lines
5.1 KiB
Python
"""PDF/UA generation."""
|
|
|
|
import pydyf
|
|
|
|
from .metadata import add_metadata
|
|
|
|
|
|
def _set_content_parents(parents, kids, parent_reference):
    """Record ``parent_reference`` as parent of every marked-content kid.

    Integer entries of ``kids`` are marked-content identifiers used as
    indexes into ``parents``; any other entry (a structure element
    reference) is skipped.
    """
    for kid in kids:
        if isinstance(kid, int):
            parents[kid] = parent_reference


def pdfua(pdf, metadata, document, page_streams, attachments, compress):
    """Set metadata for PDF/UA documents.

    Build the structure tree (``StructTreeRoot`` > ``Document`` > elements)
    from the marked content of each page stream, fill the parent tree, then
    add the common metadata stream and the catalog entries set below.

    :param pdf: PDF object exposing ``add_object``, ``catalog`` and
        ``page_references``.
    :param metadata: Metadata forwarded to ``add_metadata``.
    :param document: Object whose ``build_element_structure`` fills a
        mapping from elements to dictionaries holding a ``'parent'`` key.
    :param page_streams: Page streams exposing ``marked`` (a sequence of
        ``(key, box)`` pairs) and ``get_marked_content_tag``.
    :param attachments: Unused by this function.
    :param compress: Forwarded to ``add_metadata``.

    """
    # Structure for PDF tagging
    content_mapping = pydyf.Dictionary({})
    pdf.add_object(content_mapping)
    structure_root = pydyf.Dictionary({
        'Type': '/StructTreeRoot',
        'ParentTree': content_mapping.reference,
    })
    pdf.add_object(structure_root)
    structure_document = pydyf.Dictionary({
        'Type': '/StructElem',
        'S': '/Document',
        'P': structure_root.reference,
    })
    pdf.add_object(structure_document)
    structure_root['K'] = pydyf.Array([structure_document.reference])
    pdf.catalog['StructTreeRoot'] = structure_root.reference

    document_children = []
    content_mapping['Nums'] = pydyf.Array()
    links = []
    # First parent tree key available for links: one past the last page
    # number. Tracked explicitly so that an empty ``page_streams`` does not
    # leave the loop variable unbound.
    next_parent_key = 0
    for page_number, page_stream in enumerate(page_streams):
        structure = {}
        document.build_element_structure(structure)
        parents = [None] * len(page_stream.marked)
        for mcid, (key, box) in enumerate(page_stream.marked):
            # Build structure elements
            kids = [mcid]
            if key == 'Link':
                object_reference = pydyf.Dictionary({
                    'Type': '/OBJR',
                    'Obj': box.link_annotation.reference,
                    'Pg': pdf.page_references[page_number],
                })
                pdf.add_object(object_reference)
                links.append(
                    (object_reference.reference, box.link_annotation))
            etree_element = box.element
            child_structure_data_element = None
            # Walk up from the box's element to the root, creating missing
            # structure elements, until an existing one is reached.
            while True:
                if etree_element is None:
                    structure_data = structure.setdefault(
                        box, {'parent': None})
                else:
                    structure_data = structure[etree_element]
                new_element = 'element' not in structure_data
                if new_element:
                    child = structure_data['element'] = pydyf.Dictionary({
                        'Type': '/StructElem',
                        'S': f'/{key}',
                        'K': pydyf.Array(kids),
                        'Pg': pdf.page_references[page_number],
                    })
                    pdf.add_object(child)
                    # ``container`` is the element that directly holds
                    # ``kids``; ``child`` is the one linked to the parent.
                    container = child
                    if key == 'LI':
                        # List items get an intermediate Lbl/LBody element
                        # holding the kids. ``etree_element`` may be None
                        # (see the lookup above), hence ``getattr``.
                        tag = getattr(etree_element, 'tag', None)
                        sub_key = 'Lbl' if tag == 'dt' else 'LBody'
                        container = pydyf.Dictionary({
                            'Type': '/StructElem',
                            'S': f'/{sub_key}',
                            'K': pydyf.Array(kids),
                            'Pg': pdf.page_references[page_number],
                            'P': child.reference,
                        })
                        pdf.add_object(container)
                        child['K'] = pydyf.Array([container.reference])
                        # Later kids of this element go into the container.
                        structure_data['element'] = container
                else:
                    child = container = structure_data['element']
                    container['K'].extend(kids)
                _set_content_parents(parents, kids, container.reference)
                if child_structure_data_element is not None:
                    # The lower element is listed in the container's kids,
                    # so the container (not the outer LI) is its parent.
                    child_structure_data_element['P'] = container.reference
                if not new_element:
                    break
                kids = [child.reference]
                child_structure_data_element = child
                if structure_data['parent'] is None:
                    child['P'] = structure_document.reference
                    document_children.append(child.reference)
                    break
                etree_element = structure_data['parent']
                key = page_stream.get_marked_content_tag(etree_element.tag)
        content_mapping['Nums'].append(page_number)
        content_mapping['Nums'].append(pydyf.Array(parents))
        next_parent_key = page_number + 1
    structure_document['K'] = pydyf.Array(document_children)
    for i, (link, annotation) in enumerate(links, start=next_parent_key):
        content_mapping['Nums'].append(i)
        content_mapping['Nums'].append(link)
        annotation['StructParent'] = i
        # NOTE(review): this evaluates to 2, i.e. bit position 2 of the
        # annotation flags -- confirm that this is the intended flag.
        annotation['F'] = 2 ** (2 - 1)

    # Common PDF metadata stream
    add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress)

    # PDF document extra metadata
    if 'Lang' not in pdf.catalog:
        pdf.catalog['Lang'] = pydyf.String()
    pdf.catalog['ViewerPreferences'] = pydyf.Dictionary({
        'DisplayDocTitle': 'true',
    })
    pdf.catalog['MarkInfo'] = pydyf.Dictionary({'Marked': 'true'})
|
|
|
|
|
|
# Registry of the PDF variants handled by this module: each identifier maps
# to a pair made of the generating function and a dictionary of settings
# (presumably options read by the caller -- confirm against its usage).
VARIANTS = {
    'pdf/ua-1': (pdfua, {'mark': True}),
}
|