"""Specific handling for some HTML elements, especially replaced elements.
Replaced elements (eg. <img> elements) are rendered externally and behave as an
atomic opaque box in CSS. In general, they may or may not have intrinsic
dimensions. But the only replaced elements currently supported in WeasyPrint
are images with intrinsic dimensions.
"""
try:
    # Available in Python 3.9+
    from importlib.resources import files
except ImportError:  # pragma: no cover
    # Deprecated in Python 3.11+
    from importlib.resources import read_text
else:
    def read_text(package, resource):
        """Return the content of ``resource`` in ``package``, read as UTF-8.

        Stand-in with the same call shape as the legacy
        ``importlib.resources.read_text`` function, built on ``files()``.

        """
        resource_path = files(package) / resource
        return resource_path.read_text('utf-8')
import re
from . import CSS, Attachment, css
from .css import get_child_text
from .css.counters import CounterStyle
from .formatting_structure import boxes
from .images import SVGImage
from .logger import LOGGER
from .urls import get_url_attribute
# Single CounterStyle instance handed to both user-agent stylesheets below.
HTML5_UA_COUNTER_STYLE = CounterStyle()

# Stylesheet sources, read as text from resources of the ``css`` package.
HTML5_UA = read_text(css, 'html5_ua.css')
HTML5_UA_FORM = read_text(css, 'html5_ua_form.css')
# NOTE(review): "PH" presumably stands for "presentational hints" -- confirm.
HTML5_PH = read_text(css, 'html5_ph.css')

# Stylesheet objects built once, at import time, from the sources above.
HTML5_UA_STYLESHEET = CSS(
    string=HTML5_UA, counter_style=HTML5_UA_COUNTER_STYLE)
HTML5_UA_FORM_STYLESHEET = CSS(
    string=HTML5_UA_FORM, counter_style=HTML5_UA_COUNTER_STYLE)
# Unlike the two stylesheets above, this one gets no ``counter_style``.
HTML5_PH_STYLESHEET = CSS(string=HTML5_PH)

# The characters HTML treats as whitespace, see
# https://html.spec.whatwg.org/multipage/#space-character
HTML_WHITESPACE = ' \t\n\f\r'
# Matches one maximal run of characters that are not HTML whitespace, ie. one
# token of a space-separated attribute value.
HTML_SPACE_SEPARATED_TOKENS_RE = re.compile(f'[^{HTML_WHITESPACE}]+')
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    This is used for `ASCII case-insensitive
    <https://infra.spec.whatwg.org/#ascii-case-insensitive>`_ matching.

    This is different from the :meth:`str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = 'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == 'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == 'bac\N{KELVIN SIGN}ground'

    :param string: the text to transform.
    :returns:
        a new string where only A-Z have been replaced. Every other
        character, lone surrogates included, is returned unchanged.

    """
    # ``bytes.lower()`` only changes ASCII letters, and going through bytes
    # turns out to be faster than ``str.translate()``. The 'surrogatepass'
    # error handler lets lone surrogates make the round trip instead of
    # raising ``UnicodeEncodeError`` on malformed input.
    encoded = string.encode('utf-8', 'surrogatepass')
    return encoded.lower().decode('utf-8', 'surrogatepass')
def element_has_link_type(element, link_type):
    """Tell whether ``link_type`` is one of the ``rel`` tokens of ``element``.

    The ``rel`` attribute is split on HTML whitespace and each token is
    compared to ``link_type`` after ASCII lower-casing. ``link_type`` itself
    is compared as given. A missing attribute is treated as an empty one.

    """
    rel_value = element.get('rel', '')
    for rel_token in HTML_SPACE_SEPARATED_TOKENS_RE.findall(rel_value):
        if ascii_lower(rel_token) == link_type:
            return True
    return False
# Maps HTML tag names to functions taking an HTML element (plus its box, the
# image fetcher and the base URL) and returning a list of boxes.
HTML_HANDLERS = {}


def handle_element(element, box, get_image_from_uri, base_url):
    """Handle HTML elements that need special care.

    :param element: the HTML element the box was generated for.
    :param box: the box generated for ``element``.
    :param get_image_from_uri: callable forwarded to the handler.
    :param base_url: base URL forwarded to the handler.
    :returns:
        a (possibly empty) list of boxes: the handler's result when one is
        registered for the tag of ``box``, ``[box]`` otherwise.

    """
    # Look the handler up once, with the very key that decides whether a
    # handler applies. Testing ``box.element_tag`` but indexing with
    # ``element.tag`` could raise ``KeyError`` if the two ever differ.
    handle = HTML_HANDLERS.get(box.element_tag)
    if handle is None:
        return [box]
    return handle(element, box, get_image_from_uri, base_url)
def handler(tag):
    """Build a decorator that registers its target as the ``tag`` handler."""
    def register(function):
        """Store ``function`` in ``HTML_HANDLERS`` and return it unchanged."""
        HTML_HANDLERS[tag] = function
        return function

    return register
def make_replaced_box(element, box, image):
    """Build the replaced box holding ``image`` in place of ``box``.

    The new box is a ``BlockReplacedBox`` when the ``display`` value of
    ``box`` contains ``'block'``, an ``InlineReplacedBox`` otherwise. It
    reuses the style of ``box``.

    """
    if 'block' in box.style['display']:
        box_class = boxes.BlockReplacedBox
    else:
        box_class = boxes.InlineReplacedBox
    replaced_box = box_class(element.tag, box.style, element, image)

    # Carry over the attributes already set on the original box.
    # TODO: check other attributes that need to be copied
    # TODO: find another solution
    replaced_box.string_set = box.string_set
    replaced_box.bookmark_label = box.bookmark_label
    return replaced_box
@handler('img')
def handle_img(element, box, get_image_from_uri, base_url):
    """Handle ``<img>`` elements.

    Return a list holding the replaced box of the image when it can be
    loaded, or ``box`` filled with the alt-text when there is some. The list
    is empty when neither is available.

    See: https://www.w3.org/TR/html5/embedded-content-1.html#the-img-element

    """
    source_url = get_url_attribute(element, 'src', base_url)
    alt_text = element.get('alt')

    if source_url:
        image = get_image_from_uri(
            url=source_url, orientation=box.style['image_orientation'])
        if image is not None:
            return [make_replaced_box(element, box, image)]

        # Invalid image, use the alt-text.
        if alt_text:
            box.children = [boxes.TextBox.anonymous_from(box, alt_text)]
            return [box]
        if alt_text != '':
            # No ``alt`` attribute at all.
            # TODO: find some indicator that an image is missing.
            # For now, just remove the image.
            assert alt_text is None
        # With an empty alt-text, the element represents nothing.
        return []

    # No source: only the alt-text can be rendered, if there is one.
    if not alt_text:
        return []
    box.children = [boxes.TextBox.anonymous_from(box, alt_text)]
    return [box]
@handler('embed')
def handle_embed(element, box, get_image_from_uri, base_url):
"""Handle ``