"""Define the Tag class and tag-related regular expressions.
Unlike MediaWiki, which has very strict HTML rules, the regexes
defined in this module don't follow those restrictions and allow finding
most HTML tags.
For more info see:
* https://www.mediawiki.org/wiki/HTML_restriction
"""
from typing import Any, Dict, List, Optional, Tuple
from regex import DOTALL, VERBOSE
from ._spans import ATTRS_PATTERN, END_TAG_PATTERN, SPACE_CHARS
from ._wikitext import SubWikiText, rc
# HTML elements all have names that only use alphanumeric ASCII characters
# https://www.w3.org/TR/html5/syntax.html#syntax-tag-name
# Todo: can the tags method be implemented using a TAG_FINDITER? Will
# that be more performant?
# TAG_FINDITER should not find any tag containing other tags.
# TAG_CONTENTS = r'(?<contents>(?>(?!{TAG}).)*?)'.format(
# TAG=TAG.format(**locals())
# )
# TAG_FINDITER = rc(
# TAG.format(**locals()), flags=DOTALL | VERBOSE
# ).finditer
# Note that the following regex won't check for nested tags
# TAG_FULLMATCH is the bound ``fullmatch`` method of the pattern object
# returned by ``rc`` (imported from ._wikitext; presumably a regex-compile
# helper -- confirm). The pattern is a bytes pattern and must match a whole
# tag, from the opening "<" to the final ">".
#
# Named groups defined in this module's part of the pattern:
#   name      -- tag name of the start tag (ASCII letters and digits only)
#   contents  -- everything between the start tag and the end tag; it only
#                participates when the end-tag alternative matched
#   end_name  -- tag name of the end tag; only set together with ``contents``
# ATTRS_PATTERN and END_TAG_PATTERN come from ._spans and may contribute
# further groups (not visible from this module).
#
# Flags: DOTALL lets ``.`` in ``contents`` span newlines; VERBOSE allows the
# whitespace and the inline ``#`` comment inside the pattern literals.
TAG_FULLMATCH = rc(
rb"""
<(?<name>[A-Za-z0-9]++)"""
+ ATTRS_PATTERN
+ rb"""
["""
+ SPACE_CHARS
+ rb"""]*+
(?>
>(?<contents>.*)"""
# Reuse the shared end-tag pattern, substituting its ``{name}`` placeholder
# with a capturing group so the end tag's name is available as ``end_name``.
+ END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)')
+ rb"""|> # only start; no end tag; could be self-closing
)""",
DOTALL | VERBOSE,
).fullmatch
class SubWikiTextWithAttrs(SubWikiText):
    """Base class for SubWikiText objects that carry attributes.

    A subclass has to supply an ``_attrs_match`` property returning a match
    object that exposes the ``attr``, ``attr_name``, ``attr_value`` and
    ``attr_insert`` groups used by the methods below. The matching should be
    performed on the shadow, and caching the result is usually worthwhile.
    """

    __slots__ = '_attrs_match'

    @property
    def attrs(self) -> Dict[str, str]:
        """Return the attributes of self as a ``{name: value}`` dictionary.

        If a name is captured more than once, the value paired with its
        last capture is the one kept in the dictionary.
        """
        get_spans = self._attrs_match.spans
        text = self.string
        return {
            text[name_start:name_stop]: text[value_start:value_stop]
            for (name_start, name_stop), (value_start, value_stop) in zip(
                get_spans('attr_name'), get_spans('attr_value')
            )
        }

    def has_attr(self, attr_name: str) -> bool:
        """Return True if an attribute named `attr_name` exists in self."""
        text = self.string
        return any(
            text[start:stop] == attr_name
            for start, stop in self._attrs_match.spans('attr_name')
        )

    def get_attr(self, attr_name: str) -> Optional[str]:
        """Return the value of the last attribute named `attr_name`.

        The attribute names are scanned from last to first, so when the
        name is repeated only the value of the final occurrence is returned.

        Return None when no attribute has that name. An attribute whose
        captured value is empty yields an empty string.
        """
        get_spans = self._attrs_match.spans
        text = self.string
        # `back` counts positions from the end (1 == last capture), so the
        # same offset addresses the matching entry in the values list.
        for back, (start, stop) in enumerate(
            reversed(get_spans('attr_name')), 1
        ):
            if text[start:stop] != attr_name:
                continue
            value_start, value_stop = get_spans('attr_value')[-back]
            return text[value_start:value_stop]
        return None

    def set_attr(self, attr_name: str, attr_value: str) -> None:
        """Set `attr_name` to `attr_value`, adding the attribute if needed.

        When the name already exists (possibly several times), only its last
        occurrence is changed; the new value is always written in
        double-quoted form.

        When the name does not exist, a new attribute is inserted at the end
        of the ``attr_insert`` group. In that case an empty `attr_value`
        produces the implicit empty attribute syntax (name only).
        """
        attrs_match = self._attrs_match
        text = self.string
        for back, (start, stop) in enumerate(
            reversed(attrs_match.spans('attr_name')), 1
        ):
            if text[start:stop] != attr_name:
                continue
            value_start, value_stop = attrs_match.spans('attr_value')[-back]
            # If the byte right after the old value is a quote character,
            # widen the slice by one on both sides so that the old pair of
            # quotes is overwritten together with the old value.
            if attrs_match.string[value_stop] in b'"\'':
                value_start -= 1
                value_stop += 1
            self[value_start:value_stop] = f'"{attr_value}"'
            return
        # No attribute with this name yet: append a new one.
        insert_at = attrs_match.end('attr_insert')
        if attr_value:
            new_attr = f' {attr_name}="{attr_value}"'
        else:
            new_attr = f' {attr_name}'
        self.insert(insert_at, new_attr)

    def del_attr(self, attr_name: str) -> None:
        """Delete every attribute named `attr_name`.

        Do nothing if no attribute has that name.
        """
        attrs_match = self._attrs_match
        text = self.string
        # Go from the last attribute to the first: each deletion only moves
        # text located after it, so the spans still to be visited stay valid.
        for back, (start, stop) in enumerate(
            reversed(attrs_match.spans('attr_name')), 1
        ):
            if text[start:stop] == attr_name:
                attr_start, attr_stop = attrs_match.spans('attr')[-back]
                del self[attr_start:attr_stop]
class Tag(SubWikiTextWithAttrs):
    """A tag matched by TAG_FULLMATCH, with name, contents and attributes."""

    __slots__ = '_match_cache'

    def __init__(self, *args, **kwargs):
        """Initialize the parent class and start with an empty match cache."""
        super().__init__(*args, **kwargs)
        # (match, string-the-match-was-computed-for)
        self._match_cache = (None, None)

    @property
    def _match(self) -> Any:
        """Return the TAG_FULLMATCH result for the current string.

        The result is cached together with the string it was computed for
        and is recomputed only when ``self.string`` differs from that string.
        """
        tag_match, matched_string = self._match_cache
        current = self.string
        if matched_string != current:
            tag_match = TAG_FULLMATCH(self._shadow)
            self._match_cache = tag_match, current
        return tag_match

    # The attribute helpers of the parent class read the same match object.
    _attrs_match = _match

    @property
    def name(self) -> str:
        """Tag's name. Support both get and set operations."""
        return self._match['name'].decode()

    @name.setter
    def name(self, name: str) -> None:
        span_of = self._match.span
        end_start, end_stop = span_of('end_name')
        start, stop = span_of('name')
        # Both spans come from the same (old) match. The end tag is written
        # first because writing the start tag first would shift the end
        # tag's position.
        if end_start != -1:
            self[end_start:end_stop] = name
        self[start:stop] = name

    @property
    def contents(self) -> Optional[str]:
        """Tag contents. Support both get and set operations.

        getter:
            Return the text covered by the ``contents`` group of the match.

        setter:
            Replace the contents with a new value.
            If the match has no ``contents`` group (only a start tag, which
            could be self-closing), the last character of the tag is
            replaced with ``>``, the new contents and an end tag that uses
            the current tag name.

            NOTE(review): the previous docstring showed ``'<t/>'`` turning
            into ``'<t>n</t>'``, but the setter only replaces the final
            character, so a ``/`` before ``>`` appears to be kept --
            confirm against the tests.
        """
        start, stop = self._match.span('contents')
        return self(start, stop)

    @contents.setter
    def contents(self, contents: str) -> None:
        tag_match = self._match
        start, stop = tag_match.span('contents')
        if start == -1:
            # No contents group: there is only a start tag.
            tag_name = tag_match['name'].decode()
            self[-1:] = f'>{contents}</{tag_name}>'
            return
        self[start:stop] = contents

    @property
    def parsed_contents(self) -> SubWikiText:
        """Return the contents as a SubWikiText object.

        The returned object is backed by a span stored under the
        ``'SubWikiText'`` key of ``self._type_to_spans``. A stored span with
        the same start and end is reused; otherwise a new span is appended
        and the list is re-sorted.
        """
        self_start, _, _, byte_array = self._span_data
        rel_start, rel_stop = self._match.span('contents')
        type_to_spans = self._type_to_spans
        registered = type_to_spans.setdefault('SubWikiText', [])
        # Translate the match-relative span to the coordinates used by the
        # stored spans (offset by the start of self).
        bounds = (self_start + rel_start, self_start + rel_stop)
        for candidate in registered:
            if (candidate[0], candidate[1]) == bounds:
                span = candidate
                break
        else:
            span = [
                bounds[0],
                bounds[1],
                None,
                byte_array[rel_start:rel_stop],
            ]
            registered.append(span)
            registered.sort()
        return SubWikiText(self._lststr, type_to_spans, span, 'SubWikiText')

    @property
    def _extension_tags(self):
        """Return the parent's ``_extension_tags`` without the first item.

        NOTE(review): presumably the first item is this tag itself --
        confirm against the parent implementation.
        """
        inherited = super()._extension_tags
        return inherited[1:]

    @property
    def _content_span(self) -> Tuple[int, int]:
        """Return (index just after the first '>', index of the last '<')."""
        text = self.string
        after_first_gt = text.find('>') + 1
        last_lt = text.rfind('<')
        return after_first_gt, last_lt