from functools import cached_property
from warnings import warn

from webencodings import ascii_lower

from .compiler import compile_selector_list, split_whitespace


class ElementWrapper:
    """Wrapper of :class:`xml.etree.ElementTree.Element` for Selector matching.

    This class should not be instantiated directly. :meth:`from_xml_root` or
    :meth:`from_html_root` should be used for the root element of a document,
    and other elements should be accessed (and wrappers generated) using
    methods such as :meth:`iter_children` and :meth:`iter_subtree`.

    :class:`ElementWrapper` objects compare equal if their underlying
    :class:`xml.etree.ElementTree.Element` do.

    """
    @classmethod
    def from_xml_root(cls, root, content_language=None):
        """Wrap for selector matching the root of an XML or XHTML document.

        :param root:
            An ElementTree :class:`xml.etree.ElementTree.Element`
            for the root element of a document.
            If the given element is not the root,
            selector matching will behave as if it were.
            In other words, selectors will not be `scoped`_
            to the subtree rooted at that element.
            An object with a ``getroot()`` method (such as an element tree)
            is also accepted, its root element is used.
        :param content_language:
            Optional language information coming from outside the document
            (such as a ``Content-Language`` value), used by :attr:`lang` as a
            last resort when the document itself does not set a language.
        :returns:
            A new :class:`ElementWrapper`

        .. _scoped: https://drafts.csswg.org/selectors-4/#scoping

        """
        return cls._from_root(root, content_language, in_html_document=False)

    @classmethod
    def from_html_root(cls, root, content_language=None):
        """Same as :meth:`from_xml_root` with case-insensitive attribute names.

        Useful for documents parsed with an HTML parser like html5lib, which
        should be the case of documents with the ``text/html`` MIME type.

        """
        return cls._from_root(root, content_language, in_html_document=True)

    @classmethod
    def _from_root(cls, root, content_language, in_html_document=True):
        """Build the wrapper of a document root.

        ``root`` may be an element or anything exposing ``getroot()``.

        """
        if hasattr(root, 'getroot'):
            root = root.getroot()
        return cls(
            root, parent=None, index=0, previous=None,
            in_html_document=in_html_document,
            content_language=content_language)

    def __init__(self, etree_element, parent, index, previous,
                 in_html_document, content_language=None):
        #: The underlying ElementTree :class:`xml.etree.ElementTree.Element`
        self.etree_element = etree_element
        #: The parent :class:`ElementWrapper`,
        #: or :obj:`None` for the root element.
        self.parent = parent
        #: The previous sibling :class:`ElementWrapper`,
        #: or :obj:`None` for the root element or a first child.
        self.previous = previous
        if parent is not None:
            #: The :attr:`parent`’s children
            #: as a list of
            #: ElementTree :class:`xml.etree.ElementTree.Element`\ s.
            #: For the root (which has no parent),
            #: this is a list containing only the root element itself.
            self.etree_siblings = parent.etree_children
        else:
            self.etree_siblings = [etree_element]
        #: The position within the :attr:`parent`’s children, counting from 0.
        #: ``e.etree_siblings[e.index]`` is always ``e.etree_element``.
        self.index = index
        #: Whether the document is handled as an HTML document
        #: (see :meth:`from_html_root`).
        self.in_html_document = in_html_document
        #: Language given when wrapping the root, only read by :attr:`lang`
        #: on the root element as a last-resort fallback.
        self.transport_content_language = content_language

        # Cache
        self._ancestors = None
        self._previous_siblings = None

    def __eq__(self, other):
        return (
            type(self) is type(other) and
            self.etree_element == other.etree_element)

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash((type(self), self.etree_element))

    def __iter__(self):
        yield from self.iter_children()

    @property
    def ancestors(self):
        """Tuple of existing ancestors.

        Tuple of existing :class:`ElementWrapper` objects for this element’s
        ancestors, in tree order: the root comes first and :attr:`parent`
        comes last. This is an empty tuple for the root element.

        """
        if self._ancestors is None:
            # Built recursively on top of the parent's own (cached) tuple.
            self._ancestors = (
                () if self.parent is None else
                (*self.parent.ancestors, self.parent))
        return self._ancestors

    @property
    def previous_siblings(self):
        """Tuple of previous siblings.

        Tuple of existing :class:`ElementWrapper` objects for this element’s
        previous siblings, in tree order: the first sibling comes first and
        :attr:`previous` comes last. This is an empty tuple when
        :attr:`previous` is :obj:`None`.

        """
        if self._previous_siblings is None:
            # Built recursively on top of the previous sibling's own tuple.
            self._previous_siblings = (
                () if self.previous is None else
                (*self.previous.previous_siblings, self.previous))
        return self._previous_siblings

    def iter_ancestors(self):
        """Iterate over ancestors.

        Return an iterator of existing :class:`ElementWrapper` objects for this
        element’s ancestors, in the same order as :attr:`ancestors` (from the
        root to :attr:`parent`).

        The element itself is not included, this is an empty sequence for the
        root element.

        This method is deprecated and will be removed in version 0.7.0. Use
        :attr:`ancestors` instead.

        """
        # stacklevel=2 attributes the warning to the code consuming the
        # iterator rather than to this module.
        warn(
            'This method is deprecated and will be removed in version 0.7.0. '
            'Use the "ancestors" attribute instead.',
            DeprecationWarning, stacklevel=2)
        yield from self.ancestors

    def iter_previous_siblings(self):
        """Iterate over previous siblings.

        Return an iterator of existing :class:`ElementWrapper` objects for this
        element’s previous siblings, in the same order as
        :attr:`previous_siblings` (from the first sibling to
        :attr:`previous`).

        The element itself is not included, this is an empty sequence for a
        first child or the root element.

        This method is deprecated and will be removed in version 0.7.0. Use
        :attr:`previous_siblings` instead.

        """
        # stacklevel=2 attributes the warning to the code consuming the
        # iterator rather than to this module.
        warn(
            'This method is deprecated and will be removed in version 0.7.0. '
            'Use the "previous_siblings" attribute instead.',
            DeprecationWarning, stacklevel=2)
        yield from self.previous_siblings

    def iter_siblings(self):
        """Iterate over siblings.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        this element’s siblings, in tree order. The element itself is
        included; for the root element, it is the only item.

        """
        if self.parent is None:
            yield self
        else:
            yield from self.parent.iter_children()

    def iter_next_siblings(self):
        """Iterate over next siblings.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        this element’s next siblings, in tree order.

        """
        found = False
        for sibling in self.iter_siblings():
            if found:
                yield sibling
            if sibling == self:
                found = True

    def iter_children(self):
        """Iterate over children.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        this element’s child elements, in tree order.

        """
        child = None
        for i, etree_child in enumerate(self.etree_children):
            # Each new wrapper is linked to the one created just before it.
            child = type(self)(
                etree_child, parent=self, index=i, previous=child,
                in_html_document=self.in_html_document)
            yield child

    def iter_subtree(self):
        """Iterate over subtree.

        Return an iterator of newly-created :class:`ElementWrapper` objects for
        the entire subtree rooted at this element, in tree order.

        Unlike in other methods, the element itself *is* included.

        This loops over an entire document:

        .. code-block:: python

            for element in ElementWrapper.from_xml_root(root).iter_subtree():
                ...

        """
        # Depth-first traversal with an explicit stack of child iterators,
        # so that deep trees do not hit the recursion limit.
        stack = [iter([self])]
        while stack:
            element = next(stack[-1], None)
            if element is None:
                stack.pop()
            else:
                yield element
                stack.append(element.iter_children())

    @staticmethod
    def _compile(selectors):
        """Return the test functions of the given selectors.

        Items that have a ``test`` attribute are used as is, other items are
        compiled with :func:`compile_selector_list`. Selectors with a
        pseudo-element and selectors that never match are dropped.

        """
        return [
            compiled_selector.test
            for selector in selectors
            for compiled_selector in (
                [selector] if hasattr(selector, 'test')
                else compile_selector_list(selector))
            if compiled_selector.pseudo_element is None and
            not compiled_selector.never_matches]

    def matches(self, *selectors):
        """Return whether this element matches any of the given selectors.

        :param selectors:
            Each given selector is either a :class:`compiler.CompiledSelector`,
            or an argument to :func:`compile_selector_list`.

        """
        return any(test(self) for test in self._compile(selectors))

    def query_all(self, *selectors):
        """Return elements, in tree order, that match any of given selectors.

        Selectors are `scoped`_ to the subtree rooted at this element.

        .. _scoped: https://drafts.csswg.org/selectors-4/#scoping

        :param selectors:
            Each given selector is either a :class:`compiler.CompiledSelector`,
            or an argument to :func:`compile_selector_list`.
        :returns:
            An iterator of newly-created :class:`ElementWrapper` objects.

        """
        tests = self._compile(selectors)
        if len(tests) == 1:
            return filter(tests[0], self.iter_subtree())
        elif tests:
            return (
                element for element in self.iter_subtree()
                if any(test(element) for test in tests))
        else:
            # Nothing can match (no selector given, or all of them were
            # dropped by _compile): do not walk the subtree at all.
            return iter(())

    def query(self, *selectors):
        """Return first element that matches any of given selectors.

        :param selectors:
            Each given selector is either a :class:`compiler.CompiledSelector`,
            or an argument to :func:`compile_selector_list`.
        :returns:
            A newly-created :class:`ElementWrapper` object,
            or :obj:`None` if there is no match.

        """
        return next(self.query_all(*selectors), None)

    @cached_property
    def etree_children(self):
        """Children as a list of :class:`xml.etree.ElementTree.Element`.

        Other ElementTree nodes such as comments
        (:func:`xml.etree.ElementTree.Comment`) and processing instructions
        (:func:`xml.etree.ElementTree.ProcessingInstruction`)
        are not included.

        """
        # Only nodes whose tag is a string are kept.
        return [
            element for element in self.etree_element
            if isinstance(element.tag, str)]

    @cached_property
    def local_name(self):
        """The local name of this element, as a string."""
        namespace_url, local_name = _split_etree_tag(self.etree_element.tag)
        # The tag is split once: fill the other cached property too.
        self.__dict__['namespace_url'] = namespace_url
        return local_name

    @cached_property
    def namespace_url(self):
        """The namespace URL of this element, as a string."""
        namespace_url, local_name = _split_etree_tag(self.etree_element.tag)
        # The tag is split once: fill the other cached property too.
        self.__dict__['local_name'] = local_name
        return namespace_url

    @cached_property
    def id(self):
        """The ID of this element, as a string.

        This is :obj:`None` when the element has no ``id`` attribute.

        """
        return self.etree_element.get('id')

    @cached_property
    def classes(self):
        """The classes of this element, as a :class:`set` of strings."""
        return set(split_whitespace(self.etree_element.get('class', '')))

    @cached_property
    def lang(self):
        """The language of this element, as a string.

        The value is ASCII-lowercased. It is looked up, in this order, in the
        ``xml:lang`` attribute, in the ``lang`` attribute (HTML elements
        only), in the parent’s language, and for the root element in a
        ``<meta http-equiv="content-language">`` element (HTML only) and in
        the ``content_language`` given when wrapping the root.

        An empty string means that the language is unknown.

        """
        # http://whatwg.org/C#language
        xml_lang = self.etree_element.get(
            '{http://www.w3.org/XML/1998/namespace}lang')
        if xml_lang is not None:
            return ascii_lower(xml_lang)
        is_html = (
            self.in_html_document or
            self.namespace_url == 'http://www.w3.org/1999/xhtml')
        if is_html:
            lang = self.etree_element.get('lang')
            if lang is not None:
                return ascii_lower(lang)
        if self.parent is not None:
            return self.parent.lang
        # Root element
        if is_html:
            content_language = None
            iterator = self.etree_element.iter(
                '{http://www.w3.org/1999/xhtml}meta')
            for meta in iterator:
                http_equiv = meta.get('http-equiv', '')
                if ascii_lower(http_equiv) == 'content-language':
                    content_language = _parse_content_language(
                        meta.get('content'))
            if content_language is not None:
                return ascii_lower(content_language)
        # Lowercased like every other source, so that the result does not
        # depend on the letter case used by the caller.
        transport_language = _parse_content_language(
            self.transport_content_language)
        # Empty string means unknown
        return ascii_lower(transport_language) if transport_language else ''

    @cached_property
    def in_disabled_fieldset(self):
        """Whether this element is inside a disabled XHTML ``fieldset``.

        This is true when the parent is a ``fieldset`` with a ``disabled``
        attribute, unless this element is the first ``legend`` child of that
        ``fieldset``, or when the parent is itself in a disabled ``fieldset``.

        """
        if self.parent is None:
            return False
        fieldset = '{http://www.w3.org/1999/xhtml}fieldset'
        legend = '{http://www.w3.org/1999/xhtml}legend'
        # Use the cached tuple rather than the deprecated
        # iter_previous_siblings(), which would emit a DeprecationWarning.
        disabled_fieldset = (
            self.parent.etree_element.tag == fieldset and
            self.parent.etree_element.get('disabled') is not None and (
                self.etree_element.tag != legend or any(
                    sibling.etree_element.tag == legend
                    for sibling in self.previous_siblings)))
        return disabled_fieldset or self.parent.in_disabled_fieldset


def _split_etree_tag(tag):
    """Split an ElementTree tag into ``(namespace_url, local_name)``.

    Tags look like ``{namespace_url}local_name``. The namespace URL is an
    empty string when the tag does not have this form.

    """
    position = tag.rfind('}')
    if position == -1 or tag[0] != '{':
        return '', tag
    else:
        return tag[1:position], tag[position+1:]


def _parse_content_language(value):
    """Return the single language of a content-language value.

    Return :obj:`None` if ``value`` is :obj:`None`, contains a comma, or is
    not made of exactly one whitespace-separated part.

    """
    if value is not None and ',' not in value:
        parts = split_whitespace(value)
        if len(parts) == 1:
            return parts[0]
    return None