95 lines
3.5 KiB
Python
95 lines
3.5 KiB
Python
"""PDF/A generation."""
|
|
|
|
try:
    # ``files`` is only importable on Python 3.9+.
    from importlib.resources import files
except ImportError:
    # Older interpreters: use the stdlib helper directly (it is deprecated
    # in Python 3.11+, which is why it is not the first choice).
    from importlib.resources import read_binary
else:
    def read_binary(package, resource):
        """Return the content of ``resource`` in ``package`` as bytes."""
        resource_path = files(package) / resource
        return resource_path.read_bytes()
|
|
|
|
from functools import partial
|
|
|
|
import pydyf
|
|
|
|
from .metadata import add_metadata
|
|
|
|
|
|
def pdfa(pdf, metadata, document, page_streams, attachments, compress,
         version):
    """Set metadata for PDF/A documents.

    Adds the sRGB output intent, adapts attachments to the requested PDF/A
    part, sets the print flag on annotations and finally writes the common
    metadata stream.

    :param pdf: PDF being generated; its ``catalog`` and ``objects`` are
        modified in place.
    :param metadata: Passed unchanged to ``add_metadata``.
    :param document: Not used by this function.
    :param page_streams: Not used by this function.
    :param attachments: Iterable of attachments; only their ``md5`` and
        ``relationship`` attributes are read, and only when ``version >= 3``.
    :param compress: Passed as ``compress`` to the ICC profile stream and to
        ``add_metadata``.
    :param version: PDF/A part number, compared against 1, 2 and 3 to select
        how attachments are handled.

    """
    _add_srgb_output_intent(pdf, compress)

    # Handle attachments.
    if version == 1:
        # Remove embedded files dictionary.
        if 'Names' in pdf.catalog and 'EmbeddedFiles' in pdf.catalog['Names']:
            del pdf.catalog['Names']['EmbeddedFiles']
    if version <= 2:
        _remove_unsupported_attachments(pdf, version)
    if version >= 3:
        _add_associated_files(pdf, attachments)

    # Print annotations.
    for pdf_object in pdf.objects:
        if isinstance(pdf_object, dict) and pdf_object.get('Type') == '/Annot':
            # 2 ** (3 - 1) sets bit 3 only (the "Print" annotation flag in
            # the PDF specification); any previous flag value is replaced.
            pdf_object['F'] = 2 ** (3 - 1)

    # Common PDF metadata stream.
    add_metadata(pdf, metadata, 'a', version, 'B', compress)


def _add_srgb_output_intent(pdf, compress):
    """Embed the bundled sRGB ICC profile and set it as output intent."""
    profile = pydyf.Stream(
        [read_binary(__package__, 'sRGB2014.icc')],
        pydyf.Dictionary({'N': 3, 'Alternate': '/DeviceRGB'}),
        compress=compress)
    pdf.add_object(profile)
    pdf.catalog['OutputIntents'] = pydyf.Array([
        pydyf.Dictionary({
            'Type': '/OutputIntent',
            'S': '/GTS_PDFA1',
            'OutputConditionIdentifier': pydyf.String('sRGB IEC61966-2.1'),
            'DestOutputProfile': profile.reference,
        }),
    ])


def _iter_filespecs(pdf):
    """Yield each dictionary object of ``pdf`` whose type is ``/Filespec``."""
    for pdf_object in pdf.objects:
        if not isinstance(pdf_object, dict):
            continue
        if pdf_object.get('Type') == '/Filespec':
            yield pdf_object


def _embedded_file_stream(pdf, filespec):
    """Return the object referenced by the ``EF``/``F`` entry of ``filespec``."""
    # The first whitespace-separated token of the reference is used as the
    # index into ``pdf.objects``.
    reference = int(filespec['EF']['F'].split()[0])
    return pdf.objects[reference]


def _remove_unsupported_attachments(pdf, version):
    """Drop the ``EF`` entry of file specifications not kept for ``version``."""
    for filespec in _iter_filespecs(pdf):
        stream = _embedded_file_stream(pdf, filespec)
        # Remove all attachments for version 1.
        # Remove non-PDF attachments for version 2.
        # TODO: check that PDFs are actually PDF/A-2+ files.
        if version == 1 or stream.extra['Subtype'] != '/application#2fpdf':
            del filespec['EF']


def _add_associated_files(pdf, attachments):
    """Set ``AFRelationship`` on file specifications and fill catalog ``AF``."""
    # Keys use the same ``<md5>`` form as the ``CheckSum`` values they are
    # looked up with below.
    relationships = {
        f'<{attachment.md5}>': attachment.relationship
        for attachment in attachments if attachment.md5}
    associated_files = []
    if 'Names' in pdf.catalog and 'EmbeddedFiles' in pdf.catalog['Names']:
        reference = int(pdf.catalog['Names']['EmbeddedFiles'].split()[0])
        names = pdf.objects[reference]
        # Keep every second entry, starting with the second one (presumably
        # the values of the name tree's key/value pairs).
        associated_files.extend(names['Names'][1::2])
    for filespec in _iter_filespecs(pdf):
        stream = _embedded_file_stream(pdf, filespec)
        checksum = stream.extra['Params']['CheckSum']
        # Attachments without a known relationship fall back to Unspecified.
        relationship = relationships.get(checksum, 'Unspecified')
        filespec['AFRelationship'] = f'/{relationship}'
        associated_files.append(filespec.reference)
    if associated_files:
        if 'AF' not in pdf.catalog:
            pdf.catalog['AF'] = pydyf.Array()
        pdf.catalog['AF'].extend(associated_files)
|
|
|
|
|
|
# Maps each variant name ('pdf/a-1b' to 'pdf/a-4b') to a pair: ``pdfa`` bound
# to that part number, and an options dict holding the matching PDF version.
VARIANTS = {
    f'pdf/a-{part}b': (partial(pdfa, version=part), {'version': pdf_version})
    for part, pdf_version in zip(
        (1, 2, 3, 4), ('1.4', '1.7', '1.7', '2.0'))}
|