Source code for wikitextparser._tag

"""Define the Tag class and tag-related regular expressions.

Unlike MediaWiki which has very strict HTML rules, regexes
defined in this module don't follow those restrictions and allow finding
most HTML tags.

For more info see:
* https://www.mediawiki.org/wiki/HTML_restriction
"""

from typing import Any, Dict, List, Optional, Tuple

from regex import DOTALL, VERBOSE

from ._spans import ATTRS_PATTERN, END_TAG_PATTERN, SPACE_CHARS
from ._wikitext import SubWikiText, rc

# HTML elements all have names that only use alphanumeric ASCII characters
# https://www.w3.org/TR/html5/syntax.html#syntax-tag-name
# Todo: can the tags method be implemented using a TAG_FINDITER? Will
#   that be more performant?
# TAG_FINDITER should not find any tag containing other tags.
# TAG_CONTENTS = r'(?<contents>(?>(?!{TAG}).)*?)'.format(
#     TAG=TAG.format(**locals())
# )
# TAG_FINDITER = rc(
#     TAG.format(**locals()), flags=DOTALL | VERBOSE
# ).finditer
# Note that the following regex won't check for nested tags
# TAG_FULLMATCH is the bound ``fullmatch`` method of a compiled bytes pattern
# (flags: DOTALL | VERBOSE). The pattern is assembled by concatenation:
#   * ``<`` followed by the ``name`` group (ASCII letters and digits only);
#   * ATTRS_PATTERN, imported from ._spans;
#   * any number of trailing characters from SPACE_CHARS;
#   * then one of two alternatives inside an atomic group:
#       - ``>``, the ``contents`` group, and END_TAG_PATTERN whose ``{name}``
#         placeholder is replaced by an ``end_name`` group, or
#       - a lone ``>`` (only a start tag was matched).
# ``contents`` is a greedy ``.*`` under DOTALL, so it may span newlines and
# inner tags. Being a fullmatch, the whole input has to be one tag.
TAG_FULLMATCH = rc(
    rb"""
    <(?<name>[A-Za-z0-9]++)"""
    + ATTRS_PATTERN
    + rb"""
    ["""
    + SPACE_CHARS
    + rb"""]*+
    (?>
        >(?<contents>.*)"""
    + END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)')
    + rb"""|>  # only start; no end tag; could be self-closing
    )""",
    DOTALL | VERBOSE,
).fullmatch


class SubWikiTextWithAttrs(SubWikiText):
    """Base class for SubWikiText objects that carry attributes.

    Every subclass is expected to supply an ``_attrs_match`` property.
    The match should be computed on the shadow, and caching the property
    is usually worthwhile.
    """

    __slots__ = '_attrs_match'

    @property
    def attrs(self) -> Dict[str, str]:
        """Return the attributes of self as a name-to-value dictionary.

        Names and values are paired positionally. When a name occurs more
        than once, the value of its last occurrence is the one kept.
        """
        get_spans = self._attrs_match.spans
        text = self.string
        return {
            text[name_start:name_end]: text[value_start:value_end]
            for (name_start, name_end), (value_start, value_end) in zip(
                get_spans('attr_name'), get_spans('attr_value')
            )
        }

    def has_attr(self, attr_name: str) -> bool:
        """Tell whether self has at least one attribute named attr_name."""
        text = self.string
        return any(
            attr_name == text[start:end]
            for start, end in self._attrs_match.spans('attr_name')
        )

    def get_attr(self, attr_name: str) -> Optional[str]:
        """Return the value of the last attribute named attr_name.

        None is returned when no attribute has that name. When the name
        occurs several times, only the last occurrence is considered.
        An empty attribute yields an empty string.
        """
        get_spans = self._attrs_match.spans
        text = self.string
        # ``back`` counts positions from the end of the captures (1 = last).
        for back, (start, end) in enumerate(
            reversed(get_spans('attr_name')), start=1
        ):
            if text[start:end] != attr_name:
                continue
            value_start, value_end = get_spans('attr_value')[-back]
            return text[value_start:value_end]
        return None

    def set_attr(self, attr_name: str, attr_value: str) -> None:
        """Assign attr_value to the attribute named attr_name.

        When the name already occurs (possibly several times), only the
        last occurrence is rewritten, always using a double-quoted value.
        When the name is new, the attribute is inserted at the end of the
        ``attr_insert`` group; a falsy attr_value then produces the
        implicit empty attribute syntax (the bare name).
        """
        attrs_match = self._attrs_match
        text = self.string
        # ``back`` counts positions from the end of the captures (1 = last).
        for back, (start, end) in enumerate(
            reversed(attrs_match.spans('attr_name')), start=1
        ):
            if text[start:end] != attr_name:
                continue
            value_start, value_end = attrs_match.spans('attr_value')[-back]
            # A quote character right after the value means the old value
            # was quoted; widen the slice so both quotes are replaced too.
            if attrs_match.string[value_end] in b'"\'':
                value_start -= 1
                value_end += 1
            self[value_start:value_end] = f'"{attr_value}"'
            return
        # No attribute with this name exists yet: add a new one.
        position = attrs_match.end('attr_insert')
        if attr_value:
            new_attr = f' {attr_name}="{attr_value}"'
        else:
            new_attr = f' {attr_name}'
        self.insert(position, new_attr)

    def del_attr(self, attr_name: str) -> None:
        """Remove every attribute named attr_name.

        Nothing happens when no attribute has that name.
        """
        attrs_match = self._attrs_match
        text = self.string
        # Walk from the last attribute to the first: each deletion shifts
        # everything after it, so earlier spans remain valid this way.
        # NOTE(review): the 'attr' span is picked by its position from the
        # end of the 'attr_name' captures, which assumes both groups were
        # captured the same number of times - confirm against ATTRS_PATTERN.
        for back, (start, end) in enumerate(
            reversed(attrs_match.spans('attr_name')), start=1
        ):
            if text[start:end] == attr_name:
                attr_start, attr_stop = attrs_match.spans('attr')[-back]
                del self[attr_start:attr_stop]


class Tag(SubWikiTextWithAttrs):
    """A SubWikiText with attributes whose shadow is matched by TAG_FULLMATCH."""

    __slots__ = '_match_cache'

    def __init__(self, *args, **kwargs):
        """Run the parent initializer and start with an empty match cache."""
        super().__init__(*args, **kwargs)
        # (match object, string the match was computed for)
        self._match_cache = (None, None)

    @property
    def _match(self) -> Any:
        """Return the match object of this tag, reusing a cached one.

        The cached match is reused for as long as ``self.string`` equals
        the string it was computed for; otherwise the shadow is matched
        again and the cache is refreshed.
        """
        match, matched_string = self._match_cache
        current_string = self.string
        if matched_string != current_string:
            match = TAG_FULLMATCH(self._shadow)
            self._match_cache = match, current_string
        return match

    # The tag match doubles as the attributes match used by the parent class.
    _attrs_match = _match

    @property
    def name(self) -> str:
        """The name of the tag. Can be read and assigned."""
        return self._match['name'].decode()

    @name.setter
    def name(self, name: str) -> None:
        span_of = self._match.span
        # Rewrite the end tag before the start tag: replacing the later
        # span first keeps the offsets of the earlier one usable.
        end_name_start, end_name_stop = span_of('end_name')
        if end_name_start != -1:
            self[end_name_start:end_name_stop] = name
        name_start, name_stop = span_of('name')
        self[name_start:name_stop] = name

    @property
    def contents(self) -> Optional[str]:
        """The contents of the tag. Can be read and assigned.

        Reading calls self with the span of the ``contents`` group.

        Assigning replaces that span when the group took part in the match.
        When it did not (only a start tag was matched, e.g. a self-closing
        tag), the last character of the tag is replaced by ``>``, the new
        contents and an end tag built from the current name, so that the
        tag ends up with both a start and an end tag. Only that last
        character is replaced; whatever precedes it is left untouched.
        """
        return self(*self._match.span('contents'))

    @contents.setter
    def contents(self, contents: str) -> None:
        tag_match = self._match
        start, end = tag_match.span('contents')
        if start == -1:
            # No contents group: this is a self-closing tag.
            tag_name = tag_match['name'].decode()
            self[-1:] = f'>{contents}</{tag_name}>'
            return
        self[start:end] = contents

    @property
    def parsed_contents(self) -> SubWikiText:
        """Return the contents of the tag as a SubWikiText object.

        An already registered 'SubWikiText' span with the same start and
        end is reused; otherwise a new span is registered and the span
        list is sorted again.
        """
        offset, _, _, shadow = self._span_data
        start, end = self._match.span('contents')
        type_to_spans = self._type_to_spans
        registered = type_to_spans.setdefault('SubWikiText', [])
        abs_start = offset + start
        abs_end = offset + end
        for candidate in registered:
            if candidate[0] == abs_start and candidate[1] == abs_end:
                span = candidate
                break
        else:
            span = [abs_start, abs_end, None, shadow[start:end]]
            registered.append(span)
            registered.sort()
        return SubWikiText(self._lststr, type_to_spans, span, 'SubWikiText')

    @property
    def _extension_tags(self):
        """Return the parent's ``_extension_tags`` without its first item.

        NOTE(review): the dropped item is presumably this tag itself -
        confirm against the parent class.
        """
        inherited = super()._extension_tags
        return inherited[1:]

    def get_tags(self, name=None) -> List['Tag']:
        """Return the parent's ``get_tags(name)`` result minus its first item.

        NOTE(review): the dropped item is presumably this tag itself -
        confirm against the parent class.
        """
        found = super().get_tags(name)
        return found[1:]

    @property
    def _content_span(self) -> Tuple[int, int]:
        """Return (index just after the first '>', index of the last '<')."""
        text = self.string
        start = text.find('>') + 1
        end = text.rfind('<')
        return start, end