408 lines
16 KiB
Python
408 lines
16 KiB
Python
"""Document generation management."""
|
||
|
||
import functools
|
||
import io
|
||
from hashlib import md5
|
||
from pathlib import Path
|
||
|
||
from . import CSS, DEFAULT_OPTIONS
|
||
from .anchors import gather_anchors, make_page_bookmark_tree
|
||
from .css import get_all_computed_styles
|
||
from .css.counters import CounterStyle
|
||
from .css.targets import TargetCollector
|
||
from .draw import draw_page, stacked
|
||
from .formatting_structure.build import build_formatting_structure
|
||
from .html import get_html_metadata
|
||
from .images import get_image_from_uri as original_get_image_from_uri
|
||
from .layout import LayoutContext, layout_document
|
||
from .logger import PROGRESS_LOGGER
|
||
from .matrix import Matrix
|
||
from .pdf import generate_pdf
|
||
from .text.fonts import FontConfiguration
|
||
|
||
|
||
class Page:
    """A single page of a rendered document.

    Instances are meant to be read from :attr:`Document.pages`; they should
    not be created directly.

    """
    def __init__(self, page_box):
        #: Width of the page in CSS pixels, margins included.
        self.width = page_box.margin_width()

        #: Height of the page in CSS pixels, margins included.
        self.height = page_box.margin_height()

        #: Bleed widths of the page, as a :obj:`dict` whose keys are
        #: ``'top'``, ``'right'``, ``'bottom'`` and ``'left'`` and whose
        #: values are in CSS pixels.
        bleed = {}
        for side in ('top', 'right', 'bottom', 'left'):
            bleed[side] = page_box.style[f'bleed_{side}'].value
        self.bleed = bleed

        #: A :obj:`list` of ``(level, label, target, state)``
        #: :obj:`tuples <tuple>`. ``level`` is an :obj:`int` and ``label`` a
        #: :obj:`string <str>`, both based on the CSS properties of the same
        #: names. ``target`` is an ``(x, y)`` point in CSS pixels from the
        #: top-left of the page.
        self.bookmarks = []

        #: A :obj:`list` of ``(link_type, target, rectangle, box)``
        #: :obj:`tuples <tuple>`. A ``rectangle`` is ``(x, y, width, height)``,
        #: in CSS pixels from the top-left of the page. ``link_type`` is one of
        #: three strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurrence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: A :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        #: A :obj:`list` of ``(element, attributes, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is ``(x, y, width, height)``, in CSS
        #: pixels from the top-left of the page. ``attributes`` is a
        #: :obj:`dict` of HTML tag attributes and values.
        self.inputs = []

        # Kept so that ``paint`` can draw the page later on.
        self._page_box = page_box

        # Fill the containers created above from the page box.
        gather_anchors(
            page_box, self.anchors, self.links, self.bookmarks, self.inputs)

    def paint(self, stream, scale=1):
        """Paint the page into the PDF file.

        :type stream: ``document.Stream``
        :param stream:
            A document stream.
        :param float scale:
            Zoom scale, applied to both axes before drawing.

        """
        page_box = self._page_box
        with stacked(stream):
            # Uniform scaling: the same factor on the x and y axes.
            stream.transform(a=scale, d=scale)
            draw_page(page_box, stream)
|
||
|
||
|
||
class DocumentMetadata:
    """Document-wide meta-information attached to a :class:`Document`.

    Future versions of WeasyPrint may add new attributes.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None, lang=None, custom=None):
        #: Document title: a string, or :obj:`None`.
        #: Comes from the ``<title>`` element in HTML
        #: and goes to the ``/Title`` info field in PDF.
        self.title = title

        #: Document authors: a list of strings (an empty list by default).
        #: Comes from the ``<meta name=author>`` elements in HTML
        #: and goes to the ``/Author`` info field in PDF.
        self.authors = authors if authors else []

        #: Document description: a string, or :obj:`None`.
        #: Comes from the ``<meta name=description>`` element in HTML
        #: and goes to the ``/Subject`` info field in PDF.
        self.description = description

        #: Document keywords: a list of strings (an empty list by default).
        #: Comes from ``<meta name=keywords>`` elements in HTML
        #: and goes to the ``/Keywords`` info field in PDF.
        self.keywords = keywords if keywords else []

        #: Name of one of the software packages used to generate the
        #: document: a string, or :obj:`None`.
        #: Comes from the ``<meta name=generator>`` element in HTML
        #: and goes to the ``/Creator`` info field in PDF.
        self.generator = generator

        #: Creation date of the document: a string, or :obj:`None`.
        #: Dates use one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <https://www.w3.org/TR/NOTE-datetime>`_.
        #: Comes from the ``<meta name=dcterms.created>`` element in HTML
        #: and goes to the ``/CreationDate`` info field in PDF.
        self.created = created

        #: Modification date of the document: a string, or :obj:`None`.
        #: Dates use one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <https://www.w3.org/TR/NOTE-datetime>`_.
        #: Comes from the ``<meta name=dcterms.modified>`` element in HTML
        #: and goes to the ``/ModDate`` info field in PDF.
        self.modified = modified

        #: List of :class:`attachments <Attachment>` (empty by default).
        #: Comes from the ``<link rel=attachment>`` elements in HTML
        #: and goes to the ``/EmbeddedFiles`` dictionary in PDF.
        self.attachments = attachments if attachments else []

        #: Language of the document, as BCP 47 language tags.
        #: Comes from ``<html lang=lang>`` in HTML.
        self.lang = lang

        #: Custom metadata: a dict mapping metadata names to metadata
        #: values (empty by default).
        self.custom = custom if custom else {}
|
||
|
||
|
||
class DiskCache:
    """Dict-like storing images content on disk.

    Bytestring values are stored on disk. Other lightweight Python objects
    (i.e. RasterImage instances) are still stored in memory.

    Keys must be strings: the name of the file holding a value is the MD5
    hex digest of the encoded key.

    """
    def __init__(self, folder):
        """Create the cache, creating ``folder`` if it does not exist.

        :param folder:
            Path of the folder where bytestring values are written.

        """
        self._path = Path(folder)
        self._path.mkdir(parents=True, exist_ok=True)
        # Values that are not bytestrings, by key.
        self._memory_cache = {}
        # Paths of the files written by this instance, removed in __del__.
        self._disk_paths = set()

    def _path_from_key(self, key):
        """Return the path of the file storing the value for ``key``."""
        return self._path / md5(key.encode()).hexdigest()

    def __getitem__(self, key):
        """Return the value for ``key``, looking in memory first."""
        if key in self._memory_cache:
            return self._memory_cache[key]
        else:
            return self._path_from_key(key).read_bytes()

    def __setitem__(self, key, value):
        """Store ``value``: on disk if it is :obj:`bytes`, else in memory."""
        if isinstance(value, bytes):
            path = self._path_from_key(key)
            self._disk_paths.add(path)
            path.write_bytes(value)
            # Lookups check the memory cache first: drop any object
            # previously stored under this key, otherwise it would keep
            # shadowing the bytes that have just been written.
            self._memory_cache.pop(key, None)
        else:
            self._memory_cache[key] = value

    def __contains__(self, key):
        """Tell whether ``key`` is stored in memory or has a file on disk."""
        return (
            key in self._memory_cache or
            self._path_from_key(key).exists())

    def __del__(self):
        """Remove the files written by this instance, then the folder."""
        try:
            for path in self._disk_paths:
                path.unlink(missing_ok=True)
            # Only succeeds when the folder is empty.
            self._path.rmdir()
        except Exception:
            # Silently ignore errors while clearing cache
            pass
|
||
|
||
|
||
class Document:
    """A rendered document ready to be painted in a pydyf stream.

    Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
    can also be instantiated directly with a list of :class:`pages <Page>`, a
    set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
    <weasyprint.default_url_fetcher>` function, and a :class:`font_config
    <weasyprint.text.fonts.FontConfiguration>`.

    """

    @classmethod
    def _build_layout_context(cls, html, font_config, counter_style, options):
        """Compute styles for ``html`` and return the layout context."""
        target_collector = TargetCollector()
        page_rules = []
        user_stylesheets = []
        cache = options['cache']
        if cache is None:
            cache = {}
        elif not isinstance(cache, (dict, DiskCache)):
            # Anything else is handed to DiskCache as its folder.
            cache = DiskCache(cache)
        for css in options['stylesheets'] or []:
            # Items without a ``matcher`` attribute are wrapped into CSS
            # objects, the others are used as they are.
            if not hasattr(css, 'matcher'):
                css = CSS(
                    guess=css, media_type=html.media_type,
                    font_config=font_config, counter_style=counter_style)
            user_stylesheets.append(css)
        style_for = get_all_computed_styles(
            html, user_stylesheets, options['presentational_hints'],
            font_config, counter_style, page_rules, target_collector,
            options['pdf_forms'])
        get_image_from_uri = functools.partial(
            original_get_image_from_uri, cache=cache,
            url_fetcher=html.url_fetcher, options=options)
        PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
        context = LayoutContext(
            style_for, get_image_from_uri, font_config, counter_style,
            target_collector)
        return context

    @classmethod
    def _render(cls, html, font_config, counter_style, options):
        """Lay out ``html`` and return the resulting document."""
        if font_config is None:
            font_config = FontConfiguration()

        if counter_style is None:
            counter_style = CounterStyle()

        context = cls._build_layout_context(
            html, font_config, counter_style, options)

        root_box = build_formatting_structure(
            html.etree_element, context.style_for, context.get_image_from_uri,
            html.base_url, context.target_collector, counter_style,
            context.footnotes)

        page_boxes = layout_document(html, root_box, context)
        rendering = cls(
            [Page(page_box) for page_box in page_boxes],
            DocumentMetadata(**get_html_metadata(html)),
            html.url_fetcher, font_config)
        # Read back by build_element_structure().
        rendering._html = html
        return rendering

    def __init__(self, pages, metadata, url_fetcher, font_config):
        #: A list of :class:`Page` objects.
        self.pages = pages
        #: A :class:`DocumentMetadata` object.
        #: Contains information that does not belong to a specific page
        #: but to the whole document.
        self.metadata = metadata
        #: A function or other callable with the same signature as
        #: :func:`weasyprint.default_url_fetcher` called to fetch external
        #: resources such as stylesheets and images. (See :ref:`URL Fetchers`.)
        self.url_fetcher = url_fetcher
        #: A :obj:`dict` of fonts used by the document. Keys are hashes used to
        #: identify fonts, values are ``Font`` objects.
        self.fonts = {}

        # Keep a reference to font_config to avoid its garbage collection until
        # rendering is destroyed. This is needed as font_config.__del__ removes
        # fonts that may be used when rendering
        self.font_config = font_config

    def build_element_structure(self, structure, etree_element=None):
        """Fill ``structure`` with the parent of each element of the tree.

        :param dict structure:
            The dictionary to fill. Each visited element becomes a key whose
            value is a ``{'parent': element}`` dictionary; the parent of the
            root is :obj:`None`.
        :param etree_element:
            The element whose descendants are added. When omitted, the walk
            starts from the root element of the HTML the document was
            rendered from, which is added too.

        """
        if etree_element is None:
            etree_element = self._html.etree_element
            structure[etree_element] = {'parent': None}
        for child in etree_element:
            structure[child] = {'parent': etree_element}
            self.build_element_structure(structure, child)

    def copy(self, pages='all'):
        """Take a subset of the pages.

        :type pages: :term:`iterable`
        :param pages:
            An iterable of :class:`Page` objects from :attr:`pages`, or
            ``'all'`` (the default) to keep every page of this document.
        :return:
            A new :class:`Document` object.

        Examples:

        Write two PDF files for odd-numbered and even-numbered pages::

            # Python lists count from 0 but pages are numbered from 1.
            # [::2] is a slice of even list indexes but odd-numbered pages.
            document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
            document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')

        Combine multiple documents into one PDF file,
        using metadata from the first::

            all_pages = [p for doc in documents for p in doc.pages]
            documents[0].copy(all_pages).write_pdf('combined.pdf')

        """
        if pages == 'all':
            pages = self.pages
        elif not isinstance(pages, list):
            pages = list(pages)
        document = type(self)(
            pages, self.metadata, self.url_fetcher, self.font_config)
        # The source HTML is attached by _render() and not by __init__():
        # carry it over, or build_element_structure() would fail on the copy.
        if hasattr(self, '_html'):
            document._html = self._html
        return document

    def make_bookmark_tree(self, scale=1, transform_pages=False):
        """Make a tree of all bookmarks in the document.

        :param float scale:
            Zoom scale.
        :param bool transform_pages:
            A boolean defining whether the default PDF page transformation
            matrix has to be applied to bookmark coordinates, setting the
            bottom-left corner as the origin.
        :return: A list of bookmark subtrees.
            A subtree is ``(label, target, children, state)``. ``label`` is
            a string, ``target`` is ``(page_number, x, y)`` and ``children``
            is a list of child subtrees.

        """
        root = []
        # At one point in the document, for each "output" depth, how much
        # to add to get the source level (CSS values of bookmark-level).
        # E.g. with <h1> then <h3>, skipped_levels == [0, 1]
        # 1 means that <h3> has depth 3 - 1 = 2 in the output.
        skipped_levels = []
        last_by_depth = [root]
        previous_level = 0
        for page_number, page in enumerate(self.pages):
            if transform_pages:
                # Flip the y axis and move the origin to the bottom of the
                # page.
                matrix = Matrix(a=scale, d=-scale, f=page.height * scale)
            else:
                matrix = Matrix(a=scale, d=scale)
            previous_level = make_page_bookmark_tree(
                page, skipped_levels, last_by_depth, previous_level,
                page_number, matrix)
        return root

    def write_pdf(self, target=None, zoom=1, finisher=None, **options):
        """Paint the pages in a PDF file, with metadata.

        :type target:
            :class:`str`, :class:`pathlib.Path` or :term:`file object`
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :param float zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :type finisher: :term:`callable`
        :param finisher:
            A finisher function or callable that accepts the document and a
            :class:`pydyf.PDF` object as parameters. Can be passed to perform
            post-processing on the PDF right before the trailer is written.
        :param options:
            The ``options`` parameter includes by default the
            :data:`weasyprint.DEFAULT_OPTIONS` values.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        # Given options override the default ones.
        new_options = DEFAULT_OPTIONS.copy()
        new_options.update(options)
        options = new_options
        pdf = generate_pdf(self, target, zoom, **options)

        identifier = options['pdf_identifier']
        compress = not options['uncompressed_pdf']

        if finisher:
            finisher(self, pdf)

        if target is None:
            output = io.BytesIO()
            pdf.write(output, pdf.version, identifier, compress)
            return output.getvalue()

        if hasattr(target, 'write'):
            pdf.write(target, pdf.version, identifier, compress)
        else:
            with open(target, 'wb') as fd:
                pdf.write(fd, pdf.version, identifier, compress)
|