from bisect import bisect_left, bisect_right, insort_right
from html import unescape
from itertools import compress, islice
from operator import attrgetter
from typing import (
Callable,
Dict,
Generator,
Iterable,
List,
MutableSequence,
Optional,
Tuple,
Union,
)
from warnings import warn
from regex import (
DOTALL,
IGNORECASE,
MULTILINE,
VERBOSE,
finditer,
match,
search,
)
from wcwidth import wcswidth
# noinspection PyProtectedMember
from ._config import (
_HTML_TAG_NAME,
KNOWN_FILE_EXTENSIONS,
_bare_external_link_schemes,
_parsable_tag_extensions,
_tag_extensions,
regex_pattern,
)
from ._spans import (
BARE_EXTERNAL_LINK,
END_TAG_PATTERN,
EXTERNAL_LINK_URL_TAIL,
INVALID_URL_CHARS,
START_TAG_PATTERN,
parse_to_spans,
rc,
)
NAME_CAPTURING_HTML_START_TAG_FINDITER = rc(
START_TAG_PATTERN.replace(
b'{name}', rb'(?<name>' + _HTML_TAG_NAME + rb')', 1
)
).finditer
# External links
BRACKET_EXTERNAL_LINK_SCHEMES = regex_pattern(
_bare_external_link_schemes | {'//'}
)
BRACKET_EXTERNAL_LINK_URL = (
BRACKET_EXTERNAL_LINK_SCHEMES + EXTERNAL_LINK_URL_TAIL
)
BRACKET_EXTERNAL_LINK = rb'\[' + BRACKET_EXTERNAL_LINK_URL + rb'[^\]\n]*+\]'
EXTERNAL_LINK = (
rb'(?>' + BARE_EXTERNAL_LINK + rb'|' + BRACKET_EXTERNAL_LINK + rb')'
)
EXTERNAL_LINK_FINDITER = rc(EXTERNAL_LINK, IGNORECASE).finditer
INVALID_EXT_CHARS_SUB = rc( # the [:-4] slice allows \[ and \]
rb'[' + INVALID_URL_CHARS[:-4] + rb'{}]'
).sub
# Sections
SECTION_HEADING = rb'^(?<equals>={1,6})[^\n]+?(?P=equals)[ \t]*+$'
SUBSECTION_HEADING = rb'^(?P=equals)=[^\n]+?(?P=equals)=[ \t]*+$'
LEAD_SECTION = rb'(?<section>(?<equals>).*?)'
SECTIONS_FULLMATCH = rc(
LEAD_SECTION
+ rb'(?<section>'
+ SECTION_HEADING
+ rb'.*?' # heading # section content
rb')*',
DOTALL | MULTILINE | VERBOSE,
).fullmatch
SECTIONS_TOP_LEVELS_ONLY = rc(
LEAD_SECTION
+ rb'(?<section>'
+ SECTION_HEADING
+ rb'.*?'
+ SUBSECTION_HEADING
+ rb'.*?'
rb')*',
DOTALL | MULTILINE | VERBOSE,
).fullmatch
# Tables
TABLE_FINDITER = rc(
rb"""
# Table-start
# Always starts on a new line with optional leading spaces or indentation.
(?<=^[ :\0]*+)
{\| # Table contents
(?:
# Any character, as long as it is not indicating another table-start
(?!^\ *+\{\|).
)*?
# Table-end
\n\s*+
(?> \|} | \Z )
""",
DOTALL | MULTILINE | VERBOSE,
).finditer
substitute_apostrophes = rc(rb"('\0*+){2,}+(?=[^']|$)", MULTILINE).sub
BOLD_FINDITER = rc(
rb"""
# start token
'\0*+'\0*+'
# content
(\0*+[^'\n]++.*?)
# end token
(?:'\0*+'\0*+'|$)
""",
MULTILINE | VERBOSE,
).finditer
ITALIC_FINDITER = rc(
rb"""
# start token
'\0*+'
# content
(\0*+[^'\n]++.*?)
# end token
(?:'\0*+'|$)
""",
MULTILINE | VERBOSE,
).finditer
# Types which are detected by parse_to_spans
SPAN_PARSER_TYPES = {
'Template',
'ParserFunction',
'WikiLink',
'Comment',
'Parameter',
'ExtensionTag',
}
WS = '\r\n\t '
class DeadIndexError(TypeError):
    """Raised when a dead (overwritten or deleted) span index is used."""
class DeadIndex(int):
    """Do not allow adding to another integer but allow usage in a slice.

    Addition of indices is the main operation during mutation of WikiText
    objects.
    """

    __slots__ = ()

    def __repr__(self):
        return 'DeadIndex()'

    def __add__(self, other):
        # Arithmetic on a dead index means the owner object was overwritten
        # or deleted; fail loudly instead of corrupting spans.
        raise DeadIndexError(
            'this usually means that the object has died '
            '(overwritten or deleted) and cannot be mutated'
        )

    __radd__ = __add__
# Sentinel index stored in spans of dead (overwritten/deleted) objects.
DEAD_INDEX = DeadIndex()  # == int() == 0
# Placeholder assigned (via slice assignment) to spans whose object has died.
DEAD_SPAN = DEAD_INDEX, DEAD_INDEX, None, None
def _table_to_text(t: 'Table') -> str:
    """Convert a table to tab-separated plain text (default for plain_text)."""
    rows = [
        ['' if cell is None else cell for cell in row] for row in t.data()
    ]
    if not rows:
        return ''
    # Display widths per column; the last column is intentionally left at 0
    # so its cells get no trailing padding.
    widths = [0] * len(rows[0])
    for row in rows:
        for i, cell in enumerate(row[:-1]):
            width = wcswidth(cell)
            if width > widths[i]:
                widths[i] = width
    caption = t.caption
    header = f'\n{caption}\n' if caption is not None else ''
    body = '\n'.join(
        '\t'.join(f'{cell:<{w}}' for (w, cell) in zip(widths, row))
        for row in rows
    )
    return header + '\n' + body + '\n'
# [docs]
class WikiText:
    """Base node of parsed wikitext; supports reading and in-place mutation."""

    # In subclasses of WikiText _type is used as the key for _type_to_spans
    # Therefore: self._span can be found in self._type_to_spans[self._type].
    # The following class attribute acts as a default value.
    _type = 'WikiText'

    __slots__ = '_type_to_spans', '_lststr', '_span_data'
# [docs]
    def __init__(
        self,
        string: Union[MutableSequence[str], str],
        _type_to_spans: Optional[Dict[str, List[List[int]]]] = None,
    ) -> None:
        """Initialize the object.

        Set the initial values for self._lststr, self._type_to_spans.

        :param string: The string to be parsed or a list containing the string
            of the parent object.
        :param _type_to_spans: If the lststr is already parsed, pass its
            _type_to_spans property as _type_to_spans to avoid parsing it
            again.
        """
        if _type_to_spans is not None:
            # Already parsed; share the parent's span table and root string.
            self._type_to_spans = _type_to_spans
            self._lststr = string  # type: MutableSequence[str]
            return
        self._lststr = [string]
        byte_array = bytearray(string, 'ascii', 'replace')
        span = self._span_data = [0, len(string), None, byte_array]
        _type = self._type
        if _type not in SPAN_PARSER_TYPES:
            type_to_spans = self._type_to_spans = parse_to_spans(byte_array)
            type_to_spans[_type] = [span]
        else:
            # In SPAN_PARSER_TYPES, we can't pass the original byte_array to
            # parser to generate the shadow because it will replace the whole
            # string with '_'. Also, we can't just modify it before passing
            # because the generated _type_to_spans will lack self._span.
            # As a workaround we can add the missed span after parsing.
            if type(self) is Parameter:
                # Mask the '{{{'/'}}}' tokens (two chars suffice to break
                # them) so the parser does not swallow the whole string.
                head = byte_array[:2]
                tail = byte_array[-2:]
                byte_array[:2] = b'__'
                byte_array[-2:] = b'__'
            else:
                # Mask a single leading/trailing byte for the other types.
                head = byte_array[0]
                tail = byte_array[-1]
                byte_array[0] = 3
                byte_array[-1] = 32
            type_to_spans = parse_to_spans(byte_array)
            # Re-insert self's own span which the masking hid from the parser.
            type_to_spans[_type].insert(0, span)
            self._type_to_spans = type_to_spans
            # Restore the masked bytes.
            if type(self) is Parameter:
                byte_array[:2] = head
                byte_array[-2:] = tail
            else:
                byte_array[0] = head
                byte_array[-1] = tail
# [docs]
def __str__(self) -> str:
return self.string
# [docs]
def __repr__(self) -> str:
return f'{type(self).__name__}({repr(self.string)})'
# [docs]
def __contains__(self, value: Union[str, 'WikiText']) -> bool:
"""Return True if parsed_wikitext is inside self. False otherwise.
Also self and parsed_wikitext should belong to the same parsed
wikitext object for this function to return True.
"""
# Is it useful (and a good practice) to also accepts str inputs
# and check if self.string contains it?
if isinstance(value, str):
return value in self.string
# isinstance(value, WikiText)
if self._lststr is not value._lststr:
return False
ps, pe, _, _ = value._span_data
ss, se, _, _ = self._span_data
if ss <= ps and se >= pe:
return True
return False
def __len__(self):
s, e, _, _ = self._span_data
return e - s
# [docs]
    def __call__(
        self,
        start: Optional[int],
        stop: Union[int, None, bool] = False,
        step: Optional[int] = None,
    ) -> str:
        """Return `self.string[start]` or `self.string[start:stop]`.

        Return self.string[start] if stop is False.
        Otherwise return self.string[start:stop:step].
        """
        if stop is False:
            # Single-character access; negative indices count from the
            # span's end.
            if start >= 0:
                return self._lststr[0][self._span_data[0] + start]
            return self._lststr[0][self._span_data[1] + start]
        s, e, _, _ = self._span_data
        # Slice relative to self's span; negative offsets are taken from the
        # span's end, None means the span boundary itself.
        return self._lststr[0][
            s
            if start is None
            else (s + start if start >= 0 else e + start) : e
            if stop is None
            else (s + stop if stop >= 0 else e + stop) : step
        ]
def _check_index(self, key: Union[slice, int]) -> (int, int):
"""Return adjusted start and stop index as tuple.
Used in __setitem__ and __delitem__.
"""
ss, se, _, _ = self._span_data
if isinstance(key, int):
if key < 0:
key += se - ss
if key < 0:
raise IndexError('index out of range')
elif key >= se - ss:
raise IndexError('index out of range')
start = ss + key
return start, start + 1
# isinstance(key, slice)
if key.step is not None:
raise NotImplementedError(
'step is not implemented for string setter.'
)
start = key.start or 0
stop = key.stop
if start < 0:
start += se - ss
if start < 0:
raise IndexError('start index out of range')
if stop is None:
stop = se - ss
elif stop < 0:
stop += se - ss
if start > stop:
raise IndexError(
'stop index out of range or start is after the stop'
)
return start + ss, stop + ss
# [docs]
    def __setitem__(self, key: Union[slice, int], value: str) -> None:
        """Set a new string for the given slice or character index.

        Use this method instead of calling `insert` and `del` consecutively.
        By doing so only one of the `_insert_update` and
        `_shrink_update` functions will be called and the performance
        will improve.
        """
        abs_start, abs_stop = self._check_index(key)
        # Update lststr
        lststr = self._lststr
        lststr0 = lststr[0]
        lststr[0] = lststr0[:abs_start] + value + lststr0[abs_stop:]
        # Set the length of all subspans to zero because
        # they are all being replaced.
        self._close_subspans(abs_start, abs_stop)
        # Update the other spans according to the new length.
        val_ba = bytearray(value, 'ascii', 'replace')
        len_change = len(value) + abs_start - abs_stop
        if len_change > 0:
            self._insert_update(abs_start, len_change)
        elif len_change < 0:
            self._del_update(
                rmstart=abs_stop + len_change,  # new stop
                rmstop=abs_stop,  # old stop
            )
        # Add the newly added spans contained in the value.
        type_to_spans = self._type_to_spans
        for type_, value_spans in parse_to_spans(val_ba).items():
            tts = type_to_spans[type_]
            for s, e, m, ba in value_spans:
                try:
                    insort_right(tts, [abs_start + s, abs_start + e, m, ba])
                except TypeError:
                    # already exists which has lead to comparing Matches
                    continue
# [docs]
def __delitem__(self, key: Union[slice, int]) -> None:
"""Remove the specified range or character from self.string.
Note: If an operation involves both insertion and deletion, it'll be
safer to use the `insert` function first. Otherwise there is a
possibility of insertion into the wrong spans.
"""
start, stop = self._check_index(key)
lststr = self._lststr
lststr0 = lststr[0]
lststr[0] = lststr0[:start] + lststr0[stop:]
# Update spans
self._del_update(start, stop)
# Todo: def __add__(self, other) and __radd__(self, other)
# [docs]
def insert(self, index: int, string: str) -> None:
"""Insert the given string before the specified index.
This method has the same effect as ``self[index:index] = string``;
it only avoids some condition checks as it rules out the possibility
of the key being an slice, or the need to shrink any of the sub-spans.
"""
ss, se, _, _ = self._span_data
lststr = self._lststr
lststr0 = lststr[0]
if index < 0:
index += se - ss
if index < 0:
index = 0
elif index > se - ss: # Note that it is not >=. Index can be new.
index = se - ss
index += ss
# Update lststr
lststr[0] = lststr0[:index] + string + lststr0[index:]
string_len = len(string)
# Update spans
self._insert_update(index=index, length=string_len)
# Remember newly added spans by the string.
type_to_spans = self._type_to_spans
byte_array = bytearray(string, 'ascii', 'replace')
for type_, spans in parse_to_spans(byte_array).items():
for s, e, _, _ in spans:
insort_right(
type_to_spans[type_],
[index + s, index + e, None, byte_array],
)
@property
def span(self) -> tuple:
"""Return the span of self relative to the start of the root node."""
# In Python 3.7 and earlier, generalized iterable unpacking in yield
# and return statements requires enclosing parentheses:
# https://docs.python.org/3.8/whatsnew/3.8.html#other-language-changes
return (*self._span_data[:2],) # noqa
@property
def string(self) -> str:
"""Return str(self). Support get, set, and delete operations.
getter and deleter: Note that this will overwrite the current string,
emptying any object that points to the old string.
"""
start, end, _, _ = self._span_data
return self._lststr[0][start:end]
@string.setter
def string(self, newstring: str) -> None:
self[:] = newstring
@string.deleter
def string(self) -> None:
del self[:]
def _subspans(self, type_: str) -> List[List[int]]:
"""Return all the sub-span including self._span."""
return self._type_to_spans[type_]
    def _close_subspans(self, start: int, stop: int) -> None:
        """Close all sub-spans of (start, stop)."""
        ss, se, _, _ = self._span_data
        for spans in self._type_to_spans.values():
            # Only spans starting within [start, stop] can be affected.
            b = bisect_left(spans, [start])
            for i, (s, e, _, _) in enumerate(
                spans[b : bisect_right(spans, [stop], b)]
            ):
                if e <= stop:
                    # Never close self's own span.
                    if ss != s or se != e:
                        # Mark the span dead and drop it; decrement `b` to
                        # compensate for the pop while iterating over the
                        # (unchanged) slice copy.
                        spans.pop(i + b)[:] = DEAD_SPAN
                        b -= 1
    def _del_update(self, rmstart: int, rmstop: int) -> None:
        """Update self._type_to_spans according to the removed span."""
        # Note: The following algorithm won't work correctly if spans
        # are not sorted.
        # Note: No span should be removed from _type_to_spans.
        rmlength = rmstop - rmstart
        for spans in self._type_to_spans.values():
            # Iterate from the end so the sort order survives mutation.
            i = len(spans) - 1
            # Phase 1: spans entirely after the removed range shift left.
            while i >= 0:
                # todo update byte_array
                s, e, _, b = span = spans[i]
                if rmstop <= s:
                    # rmstart <= rmstop <= s <= e
                    # todo
                    span[:] = s - rmlength, e - rmlength, None, None
                    i -= 1
                    continue
                break  # pragma: no cover
            else:
                continue  # pragma: no cover
            # Phase 2: spans starting inside the removed range are truncated
            # (when they extend beyond it) or marked dead.
            while True:
                if rmstart <= s:
                    if rmstop < e:
                        # rmstart < s <= rmstop < e
                        # todo: update byte_array instead
                        span[:] = rmstart, e - rmlength, None, None
                        i -= 1
                        if i < 0:
                            break
                        s, e, _, _ = span = spans[i]
                        continue
                    # rmstart <= s <= e < rmstop
                    spans.pop(i)[:] = DEAD_SPAN
                    i -= 1
                    if i < 0:
                        break
                    s, e, _, _ = span = spans[i]
                    continue
                break  # pragma: no cover
            # Phase 3: spans starting before the removed range; spans that
            # overlap it shrink, earlier ones are untouched.
            while i >= 0:
                if e <= rmstart:
                    # s <= e <= rmstart <= rmstop
                    i -= 1
                    if i < 0:
                        break
                    s, e, _, _ = span = spans[i]
                    continue
                # s <= rmstart <= rmstop <= e
                span[1] -= rmlength
                span[2] = None
                # todo: update bytearray instead
                span[3] = None
                i -= 1
                if i < 0:
                    break
                s, e, _, _ = span = spans[i]
                continue
def _insert_update(self, index: int, length: int) -> None:
"""Update self._type_to_spans according to the added length.
Warning: If an operation involves both _shrink_update and
_insert_update, you might wanna consider doing the
_insert_update before the _shrink_update as this function
can cause data loss in self._type_to_spans.
"""
self_span = ss, se, _, _ = self._span_data
for span_type, spans in self._type_to_spans.items():
for span in spans:
s0, s1, _, _ = span
if index < s1 or s1 == index == se:
span[1] += length
span[3] = None # todo: update instead
# index is before s0, or at s0 but span is not a parent
if index < s0 or (
s0 == index
and self_span is not span
and span_type != 'WikiText'
):
span[0] += length
def _nesting_level(self, parent_types) -> int:
ss, se, _, _ = self._span_data
level = 0
type_to_spans = self._type_to_spans
for type_ in parent_types:
spans = type_to_spans[type_]
for s, e, _, _ in spans[: bisect_right(spans, [ss + 1])]:
if se <= e:
level += 1
return level
@property
def _content_span(self) -> Tuple[int, int]:
# return content_start, self_len, self_end
return 0, len(self)
    @property
    def _shadow(self) -> bytearray:
        """Return a copy of self.string with specific sub-spans replaced.

        Comments blocks are replaced by spaces. Other sub-spans are replaced
        by underscores.

        The replaced sub-spans are: (
            'Template', 'WikiLink', 'ParserFunction', 'ExtensionTag',
            'Comment',
        )

        This function is called upon extracting tables or extracting the data
        inside them.
        """
        ss, se, m, cached_shadow = span_data = self._span_data
        if cached_shadow is not None:
            # span_data[3] caches the shadow; the mutation helpers
            # invalidate it by setting it back to None.
            return cached_shadow
        shadow = span_data[3] = bytearray(
            self._lststr[0][ss:se], 'ascii', 'replace'
        )
        if self._type in SPAN_PARSER_TYPES:
            # Protect self's own delimiters from being re-parsed: mask the
            # head/tail, parse (mutates `shadow` in place), then restore.
            cs, ce = self._content_span
            head = shadow[:cs]
            tail = shadow[ce:]
            shadow[:cs] = b'_' * cs
            shadow[ce:] = b'_' * len(tail)
            parse_to_spans(shadow)
            shadow[:cs] = head
            shadow[ce:] = tail
        else:
            parse_to_spans(shadow)
        return shadow
def _inner_type_to_spans_copy(self) -> Dict[str, List[List[int]]]:
"""Create the arguments for the parse function used in pformat method.
Only return sub-spans and change them to fit the new scope, i.e
self.string.
"""
ss, se, _, _ = self._span_data
return {
type_: [
[s - ss, e - ss, m, ba[:] if ba is not None else None]
for s, e, m, ba in spans[
bisect_right(spans, [ss]) : bisect_right(spans, [se])
]
]
for type_, spans in self._type_to_spans.items()
}
# [docs]
    def plain_text(
        self,
        *,
        replace_templates: Union[
            bool, Callable[['Template'], Optional[str]]
        ] = True,
        replace_parser_functions: Union[
            bool, Callable[['ParserFunction'], Optional[str]]
        ] = True,
        replace_parameters=True,
        replace_tags=True,
        replace_external_links=True,
        replace_wikilinks=True,
        unescape_html_entities=True,
        replace_bolds_and_italics=True,
        replace_tables: Union[
            Callable[['Table'], Optional[str]], bool
        ] = _table_to_text,
        _is_root_node=False,
    ) -> str:
        # plain_text_doc will be added to __doc__
        """Return a plain text string representation of self."""
        if _is_root_node is False:
            # Work on a detached copy of self so the removals below do not
            # mutate the original parse tree.
            s, e, m, b = self._span_data
            tts = self._inner_type_to_spans_copy()
            parsed = WikiText([self._lststr[0][s:e]], tts)
            new_end = e - s
            # Find self's own (rebased) span among the copied spans.
            for span_data in tts[self._type]:
                if span_data[1] == new_end:
                    parsed._span_data = span_data
                    break
            else:  # self is a dead span
                parsed._span_data = [0, 0, None, bytearray()]
        else:
            tts = self._type_to_spans
            parsed = self
        # `lst` holds one cell per character; removed regions are set to
        # None (keeping indices stable) and filtered out at the end.
        lst = list(parsed.string)

        def remove(b: int, e: int):
            # Blank out the half-open range [b, e).
            lst[b:e] = [None] * (e - b)

        for b, e, _, _ in tts['Comment']:
            remove(b, e)
        if callable(replace_templates):
            for template in parsed.templates:
                b, e = template._span_data[:2]  # noqa
                if lst[b] is None:  # overwritten
                    continue
                # Store the replacement string in the first character cell.
                lst[b] = replace_templates(template)
                remove(b + 1, e)
        elif replace_templates:
            for b, e, _, _ in tts['Template']:
                remove(b, e)
        if callable(replace_parser_functions):
            for pf in parsed.parser_functions:
                b, e = pf._span_data[:2]
                if lst[b] is None:  # already overwritten
                    continue
                lst[b] = replace_parser_functions(pf)
                remove(b + 1, e)
        elif replace_parser_functions:
            for b, e, _, _ in tts['ParserFunction']:
                remove(b, e)
        if replace_external_links:
            for el in parsed.external_links:
                if el.in_brackets:
                    b, e = el.span
                    text = el.text
                    if text is None:
                        # [url] with no title: drop the whole link
                        remove(b, e)
                    else:
                        # keep only the link text, drop brackets and url
                        remove(b, e - 1 - len(text))
                        remove(e - 1, e)
        # replacing bold and italics should be done before wikilinks and tags
        # because removing tags and wikilinks creates invalid spans, and
        # get_bolds() will try to look into wikilinks for bold parts.
        if replace_bolds_and_italics:
            for i in parsed.get_bolds_and_italics():
                b, e = i.span
                ib, ie = i._match.span(1)  # noqa, text span
                remove(b, b + ib)  # leading quote token
                remove(b + ie, e)  # trailing quote token
        if replace_parameters:
            for p in parsed.parameters:
                b, e = p.span
                default_start = p._shadow.find(124)  # 124 == ord('|')
                if default_start != -1:
                    # {{{name|default}}} -> default
                    remove(b, b + default_start + 1)
                    remove(e - 3, e)  # the closing '}}}'
                else:
                    remove(b, e)
        if replace_tags:
            for t in parsed.get_tags():
                b, e = t.span
                cb, ce = t._match.span('contents')  # noqa
                if cb != -1:  # not a self-closing tag
                    remove(b, b + cb)
                    remove(b + ce, e)
                else:  # remove the whole self-closing tag
                    remove(b, e)
        if replace_wikilinks:
            for w in parsed.wikilinks:
                b, e = w.span
                title = w.title
                if title[:1] != ':' and (
                    title.partition(':')[2].rpartition('.')[2]
                    in KNOWN_FILE_EXTENSIONS
                ):
                    remove(b, e)  # image
                else:
                    tb, te = w._match.span(4)  # noqa, text span
                    if tb != -1:
                        # [[target|text]] -> text
                        remove(b, b + tb)
                        remove(b + te, e)
                    else:
                        # [[target]] -> target
                        tb, te = w._match.span(1)  # noqa, target span
                        remove(b, b + tb)
                        remove(b + te, e)
        if callable(replace_tables):
            for table in parsed.get_tables():
                b, e = table._span_data[:2]  # noqa
                if lst[b] is None:  # overwritten
                    continue
                # Re-parse the partially-cleaned table text and convert it.
                lst[b] = replace_tables(
                    Table(''.join([c for c in lst[b:e] if c is not None]))
                )
                remove(b + 1, e)
        string = ''.join([c for c in lst if c is not None])
        if unescape_html_entities:
            string = unescape(string)
        return string
# [docs]
    # NOTE(review): upstream wikitextparser defaults `indent` to four spaces;
    # confirm the single-space default here is intended.
    def pformat(self, indent: str = ' ', remove_comments=False) -> str:
        """Return a pretty-print formatted version of `self.string`.

        Try to organize templates and parser functions by indenting, aligning
        at the equal signs, and adding space where appropriate.

        Note that this function will not mutate self.
        """
        ws = WS
        # Do not try to do inplace pformat. It will overwrite on some spans.
        lststr0 = self._lststr[0]
        s, e, m, b = self._span_data
        parsed = WikiText([lststr0[s:e]], self._inner_type_to_spans_copy())
        # Since _type_to_spans arg of WikiText has been used, parsed._span
        # is not set yet.
        span = [0, e - s, m, b[:] if b is not None else None]
        parsed._span_data = span
        parsed._type_to_spans['WikiText'] = [span]
        if remove_comments:
            for c in parsed.comments:
                del c[:]
        else:
            # Only remove comments that contain whitespace.
            for c in parsed.comments:
                if not c.contents.strip(ws):
                    del c[:]
        # First remove all current spacings.
        for template in reversed(parsed.templates):
            stripped_tl_name = template.name.strip(ws)
            template.name = (
                ' ' + stripped_tl_name + ' '
                if stripped_tl_name[0] == '{'
                else stripped_tl_name
            )
            args = template.arguments
            if not args:
                continue
            if ':' in stripped_tl_name:
                # Don't use False because we don't know for sure.
                not_a_parser_function = None
            else:
                not_a_parser_function = True
            # Required for alignment
            arg_stripped_names = [a.name.strip(ws) for a in args]
            arg_positionalities = [a.positional for a in args]
            arg_name_lengths = [
                wcswidth(n.replace('لا', '?')) if not p else 0
                for n, p in zip(arg_stripped_names, arg_positionalities)
            ]
            max_name_len = max(arg_name_lengths)
            # Format template.name.
            level = template.nesting_level
            newline_indent = '\n' + indent * level
            template.name += newline_indent
            if level == 1:
                last_comment_indent = '<!--\n-->'
            else:
                last_comment_indent = '<!--\n' + indent * (level - 2) + ' -->'
            # Special formatting for the last argument.
            last_arg = args.pop()
            last_is_positional = arg_positionalities.pop()
            last_value = last_arg.value
            last_stripped_value = last_value.strip(ws)
            if last_is_positional and last_value != last_stripped_value:
                stop_conversion = True
                if not last_value.endswith('\n' + indent * (level - 1)):
                    last_arg.value = last_value + last_comment_indent
            elif not_a_parser_function:
                stop_conversion = False
                last_arg.name = (
                    ' '
                    + arg_stripped_names.pop()
                    + ' '
                    + ' ' * (max_name_len - arg_name_lengths.pop())
                )
                last_arg.value = (
                    ' ' + last_stripped_value + '\n' + indent * (level - 1)
                )
            elif last_is_positional:
                # (last_value == last_stripped_value
                # and not_a_parser_function is not True)
                stop_conversion = True
                # Can't strip or adjust the position of the value
                # because this could be a positional argument in a template.
                last_arg.value = last_value + last_comment_indent
            else:
                stop_conversion = True
                # This is either a parser function or a keyword
                # argument in a template. In both cases the name
                # can be lstripped and the value can be rstripped.
                last_arg.name = ' ' + last_arg.name.lstrip(ws)
                if not last_value.endswith('\n' + indent * (level - 1)):
                    last_arg.value = (
                        last_value.rstrip(ws) + ' ' + last_comment_indent
                    )
            if not args:
                continue
            comment_indent = '<!--\n' + indent * (level - 1) + ' -->'
            for arg, stripped_name, positional, arg_name_len in zip(
                reversed(args),
                reversed(arg_stripped_names),
                reversed(arg_positionalities),
                reversed(arg_name_lengths),
            ):
                value = arg.value
                stripped_value = value.strip(ws)
                # Positional arguments of templates are sensitive to
                # whitespace. See:
                # https://meta.wikimedia.org/wiki/Help:Newlines_and_spaces
                if stop_conversion:
                    if not value.endswith(newline_indent):
                        arg.value += comment_indent
                elif positional and value != stripped_value:
                    stop_conversion = True
                    if not value.endswith(newline_indent):
                        arg.value += comment_indent
                elif not_a_parser_function:
                    arg.name = (
                        ' '
                        + stripped_name
                        + ' '
                        + ' ' * (max_name_len - arg_name_len)
                    )
                    arg.value = ' ' + stripped_value + newline_indent
        for func in reversed(parsed.parser_functions):
            name = func.name
            ls_name = name.lstrip(ws)
            lws = len(name) - len(ls_name)
            if lws:
                # Remove leading whitespace of the function name (offset 2
                # skips the opening '{{').
                del func[2 : lws + 2]
            if ls_name.lower() in ('#tag', '#invoke', ''):
                # The 2nd argument of `tag` parser function is an exception
                # and cannot be stripped.
                # So in `{{#tag:tagname|arg1|...}}`, no whitespace should be
                # added/removed to/from arg1.
                # See: [[mw:Help:Extension:ParserFunctions#Miscellaneous]]
                # All args of #invoke are also whitespace-sensitive.
                continue
            args = func.arguments
            if not args:
                continue
            # Whitespace, including newlines, tabs, and spaces is stripped
            # from the beginning and end of all the parameters of
            # parser functions. See:
            # www.mediawiki.org/wiki/Help:Extension:ParserFunctions#
            # Stripping_whitespace
            level = func.nesting_level
            short_indent = '\n' + indent * (level - 1)
            newline_indent = short_indent + indent
            if len(args) == 1:
                arg = args[0]
                # the first arg is both the first and last argument
                if arg.positional:
                    arg.value = (
                        newline_indent + arg.value.strip(ws) + short_indent
                    )
                else:
                    # Note that we don't add spaces before and after the
                    # '=' in parser functions because it could be part of
                    # an ordinary string.
                    arg.name = newline_indent + arg.name.lstrip(ws)
                    arg.value = arg.value.rstrip(ws) + short_indent
                continue
            # Special formatting for the first argument
            arg = args[0]
            if arg.positional:
                arg.value = (
                    newline_indent + arg.value.strip(ws) + newline_indent
                )
            else:
                arg.name = newline_indent + arg.name.lstrip(ws)
                arg.value = arg.value.rstrip(ws) + newline_indent
            # Formatting the middle arguments
            for arg in args[1:-1]:
                if arg.positional:
                    arg.value = ' ' + arg.value.strip(ws) + newline_indent
                else:
                    arg.name = ' ' + arg.name.lstrip(ws)
                    arg.value = arg.value.rstrip(ws) + newline_indent
            # Special formatting for the last argument
            arg = args[-1]
            if arg.positional:
                arg.value = ' ' + arg.value.strip(ws) + short_indent
            else:
                arg.name = ' ' + arg.name.lstrip(ws)
                arg.value = arg.value.rstrip(ws) + short_indent
        return parsed.string
@property
def parameters(self) -> List['Parameter']:
"""Return a list of parameter objects."""
_lststr = self._lststr
_type_to_spans = self._type_to_spans
return [
Parameter(_lststr, _type_to_spans, span, 'Parameter')
for span in self._subspans('Parameter')
]
@property
def parser_functions(self) -> List['ParserFunction']:
"""Return a list of parser function objects."""
_lststr = self._lststr
_type_to_spans = self._type_to_spans
return [
ParserFunction(_lststr, _type_to_spans, span, 'ParserFunction')
for span in self._subspans('ParserFunction')
]
@property
def templates(self) -> List['Template']:
"""Return a list of templates as template objects."""
_lststr = self._lststr
_type_to_spans = self._type_to_spans
return [
Template(_lststr, _type_to_spans, span, 'Template')
for span in self._subspans('Template')
]
@property
def wikilinks(self) -> List['WikiLink']:
"""Return a list of wikilink objects."""
_lststr = self._lststr
_type_to_spans = self._type_to_spans
return [
WikiLink(_lststr, _type_to_spans, span, 'WikiLink')
for span in self._subspans('WikiLink')
]
@property
def comments(self) -> List['Comment']:
"""Return a list of comment objects."""
_lststr = self._lststr
_type_to_spans = self._type_to_spans
return [
Comment(_lststr, _type_to_spans, span, 'Comment')
for span in self._subspans('Comment')
]
    @property
    def _balanced_quotes_shadow(self) -> bytearray:
        """Return a byte array with non-markup-apostrophes removed.

        The comments at /includes/parser/Parser.php:doQuotes are helpful:
        https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
        https://phabricator.wikimedia.org/T15227#178834
        """
        # Per-line quote-balancing state shared by the two closures below.
        bold_starts: List[int] = []
        odd_italics = False
        odd_bold_italics = False
        append_bold_start = bold_starts.append

        def process_line(line: bytes) -> bytes:
            # Balance the quote tokens of one (already substituted) line,
            # mimicking MediaWiki's doQuotes behavior.
            nonlocal odd_italics, odd_bold_italics
            if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
                # one of the bold marks needs to be interpreted as italic
                first_multi_letter_word = first_space = None
                for s in bold_starts:
                    if line[s - 1] == 32:  # space
                        if first_space is None:
                            first_space = s
                        continue
                    if line[s - 2] == 32:  # space
                        line = line[:s] + b' ' + line[s + 1 :]
                        break  # first_single_letter_word
                    if first_multi_letter_word is None:
                        first_multi_letter_word = s
                        continue
                else:  # there was no first_single_letter_word
                    if first_multi_letter_word is not None:
                        line = (
                            line[:first_multi_letter_word]
                            + b'_'
                            + line[first_multi_letter_word + 1 :]
                        )
                    elif first_space is not None:
                        line = (
                            line[:first_space] + b'_' + line[first_space + 1 :]
                        )
            # reset state for the next line
            bold_starts.clear()
            odd_italics = False
            odd_bold_italics = False
            return line

        def process_apostrophes(m) -> bytes:
            # Substitution callback for runs of 2+ apostrophes; updates the
            # per-line state and hides apostrophes that are not markup.
            nonlocal odd_italics, odd_bold_italics
            starts = m.starts(1)
            n = len(starts)
            if n == 2:  # italic
                odd_italics ^= True
                return m[0]
            if n == 3:  # bold
                append_bold_start(starts[0])
                return m[0]
            if n == 5:  # bold-italic
                odd_bold_italics ^= True
                odd_italics ^= True
                return m[0]
            if n == 4:  # four apostrophes -> hide the first one
                s = starts[1]
                append_bold_start(s)
                return b'_' * (s - starts[0]) + m.string[s : m.end()]
            # more than 5 apostrophes -> hide the prior ones
            odd_bold_italics ^= True
            odd_italics ^= True
            s = starts[-5]
            return b'_' * (s - starts[0]) + m.string[s : m.end()]

        return bytearray(b'\n').join(
            [
                process_line(substitute_apostrophes(process_apostrophes, line))
                for line in self._shadow.splitlines()
            ]
        )
    def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
        # Collect bolds/italics found inside child nodes into `result`.
        for prop in (
            'templates',
            'parser_functions',
            'parameters',
            'wikilinks',
        ):
            for e in getattr(self, prop):
                result += e.get_bolds_and_italics(
                    filter_cls=filter_cls, recursive=False
                )
        extension_tags = self._extension_tags
        if not extension_tags:
            return result
        # Deduplicate: extension tag contents may yield spans already found
        # through the node types above.
        # noinspection PyProtectedMember
        result_spans = {(*i._span_data[:2],) for i in result}
        for e in extension_tags:
            for i in e.get_bolds_and_italics(
                filter_cls=filter_cls, recursive=False
            ):
                # noinspection PyProtectedMember
                if (*i._span_data[:2],) not in result_spans:
                    result.append(i)
# [docs]
def get_bolds_and_italics(
self, *, recursive=True, filter_cls: type = None
) -> List[Union['Bold', 'Italic']]:
"""Return a list of bold and italic objects in self.
This is faster than calling ``get_bolds`` and ``get_italics``
individually.
:keyword recursive: if True also look inside templates, parser
functions, extension tags, etc.
:keyword filter_cls: only return this type. Should be
`wikitextparser.Bold` or `wikitextparser.Italic`.
The default is None and means both bolds and italics.
"""
result = []
append = result.append
_lststr = self._lststr
s = self._span_data[0]
type_to_spans = self._type_to_spans
tts_setdefault = type_to_spans.setdefault
balanced_shadow = self._balanced_quotes_shadow
rs, re = self._content_span
if filter_cls is None or filter_cls is Bold:
bold_spans = tts_setdefault('Bold', [])
get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
for m in bold_matches:
ms, me = m.span()
b, e = s + ms, s + me
old_span = get_old_bold_span((b, e))
if old_span is None:
span = [b, e, None, balanced_shadow[ms:me]]
insort_right(bold_spans, span)
else:
span = old_span
append(Bold(_lststr, type_to_spans, span, 'Bold'))
if recursive:
self._bolds_italics_recurse(result, filter_cls)
if filter_cls is Bold:
result.sort(key=attrgetter('_span_data'))
return result
elif filter_cls is Bold:
return result
else: # filter_cls is Italic
bold_matches = BOLD_FINDITER(balanced_shadow, rs, re)
# filter_cls is None or filter_cls is Italic
# remove bold tokens before searching for italics
for m in bold_matches:
ms, me = m.span()
cs, ce = m.span(1) # content
balanced_shadow[ms:cs] = b'_' * (cs - ms)
balanced_shadow[ce:me] = b'_' * (me - ce)
italic_spans = tts_setdefault('Italic', [])
get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
for m in ITALIC_FINDITER(balanced_shadow, rs, re):
ms, me = m.span()
b, e = span = s + ms, s + me
old_span = get_old_italic_span(span)
if old_span is None:
span = [b, e, None, balanced_shadow[ms:me]]
insort_right(italic_spans, span)
else:
span = old_span
append(
Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
)
if recursive and filter_cls is Italic:
self._bolds_italics_recurse(result, filter_cls)
result.sort(key=attrgetter('_span_data'))
return result
if filter_cls is None: # all Italics are appended after Bolds
result.sort(key=attrgetter('_span_data'))
return result
# [docs]
def get_bolds(self, recursive=True) -> List['Bold']:
"""Return bold parts of self.
:param recursive: if True also look inside templates, parser functions,
extension tags, etc.
"""
return self.get_bolds_and_italics(filter_cls=Bold, recursive=recursive)
# [docs]
def get_italics(self, recursive=True) -> List['Italic']:
"""Return italic parts of self.
:param recursive: if True also look inside templates, parser functions,
extension tags, etc.
"""
return self.get_bolds_and_italics(
filter_cls=Italic, recursive=recursive
)
@property
def _ext_link_shadow(self):
    """Build a bytearray copy of self's text for external-link scanning.

    Comment spans are fully masked with b'_', WikiLink spans with b' ',
    and for Template, ParserFunction and Parameter spans only the
    URL-invalid characters are masked with b' '.
    """
    start, end, _, _ = self._span_data
    shadow = bytearray(self._lststr[0][start:end], 'ascii', 'replace')
    subspans = self._subspans
    for b, e, _, _ in subspans('Comment'):
        shadow[b:e] = b'_' * (e - b)
    for b, e, _, _ in subspans('WikiLink'):
        shadow[b:e] = b' ' * (e - b)
    for span_type in ('Template', 'ParserFunction', 'Parameter'):
        for b, e, _, _ in subspans(span_type):
            shadow[b:e] = INVALID_EXT_CHARS_SUB(b' ', shadow[b:e])
    return shadow
@property
def external_links(self) -> List['ExternalLink']:
    """Return a list of found external link objects.

    Note:
        Templates adjacent to external links are considered part of the
        link. In reality, this depends on the contents of the template:

        >>> WikiText(
        ...     'http://example.com{{dead link}}'
        ... ).external_links[0].url
        'http://example.com{{dead link}}'

        >>> WikiText(
        ...     '[http://example.com{{space template}} text]'
        ... ).external_links[0].url
        'http://example.com{{space template}}'
    """
    links: List['ExternalLink'] = []
    type_to_spans = self._type_to_spans
    lststr = self._lststr
    offset = self._span_data[0]
    el_spans = type_to_spans.setdefault('ExternalLink', [])
    find_registered = {(sp[0], sp[1]): sp for sp in el_spans}.get
    shadow = self._ext_link_shadow

    def collect(pos, endpos):
        # Match links in shadow[pos:endpos]; reuse registered spans.
        for link_match in EXTERNAL_LINK_FINDITER(shadow, pos, endpos):
            mb, me = link_match.span()
            span = [offset + mb, offset + me, None, shadow[mb:me]]
            registered = find_registered((span[0], span[1]))
            if registered is None:
                insort_right(el_spans, span)
            else:
                span = registered
            links.append(
                ExternalLink(lststr, type_to_spans, span, 'ExternalLink')
            )

    # Handle links inside extension tags first, then blank each tag
    # region before running the final pass over the whole shadow.
    for tag_start, tag_end, _, _ in self._subspans('ExtensionTag'):
        collect(tag_start, tag_end)
        shadow[tag_start:tag_end] = b' ' * (tag_end - tag_start)
    collect(None, None)
    return links
def _section_spans_to_sections(
    self, section_spans: List[Tuple[int, int]], shadow: bytearray
) -> List['Section']:
    """Turn relative (start, end) pairs into Section objects.

    Each span is registered in self._type_to_spans['Section'];
    already-registered spans are reused so the resulting objects stay
    in sync with previously created ones.
    """
    type_to_spans = self._type_to_spans
    offset = self._span_data[0]
    registered = type_to_spans.setdefault('Section', [])
    find_registered = {(sp[0], sp[1]): sp for sp in registered}.get
    lststr = self._lststr
    sections: List['Section'] = []
    for rel_start, rel_end in section_spans:
        start, end = offset + rel_start, offset + rel_end
        span = find_registered((start, end))
        if span is None:
            span = [start, end, None, shadow[rel_start:rel_end]]
            insort_right(registered, span)
        sections.append(Section(lststr, type_to_spans, span, 'Section'))
    return sections
@property
def sections(self) -> List['Section']:
    """Shortcut for get_sections() with subsections included."""
    return self.get_sections(include_subsections=True)
# [docs]  (Sphinx viewcode link artifact; not part of the source)
def get_sections(
    self,
    *args,
    include_subsections=True,
    level=None,
    top_levels_only=False,
) -> List['Section']:
    """Return a list of sections in current wikitext.

    The first section will always be the lead section, even if it is an
    empty string.

    :param include_subsections: If true, include the text of subsections
        in each Section object.
    :param level: Only return sections where section.level == level.
        Return all levels if None (default).
    :param top_levels_only: Only return sections that are not subsections
        of other sections. In this mode, level cannot be specified and
        `include_subsections` must be True.
    """
    if args:
        # Deprecated positional call style:
        # get_sections(include_subsections[, level]).
        warn(
            'calling get_sections with positional arguments is deprecated',
            DeprecationWarning,
            2,
        )
        if len(args) == 1:
            include_subsections = args[0]
        else:
            include_subsections, level = args
    shadow = self._shadow
    if top_levels_only:
        assert level is None
        assert include_subsections
        # SECTIONS_TOP_LEVELS_ONLY already captures each top-level
        # section (with its subsections) as one 'section' group.
        full_match = SECTIONS_TOP_LEVELS_ONLY(shadow)
        return self._section_spans_to_sections(
            full_match.spans('section'), shadow
        )
    full_match = SECTIONS_FULLMATCH(shadow)
    section_spans = full_match.spans('section')
    # One heading level per captured section. The lead section's
    # 'equals' capture is the empty string (see LEAD_SECTION), so its
    # level is 0.
    levels = [len(eq) for eq in full_match.captures('equals')]
    if include_subsections:
        # Extend each non-lead section's span over every immediately
        # following section with a deeper heading level.
        z = [*zip(section_spans, levels)]
        for pi, ((ps, pe), pl) in enumerate(islice(z, 1, None), 1):
            for (ss, se), sl in islice(z, pi + 1, None):
                if sl > pl:
                    section_spans[pi] = (ps, se)
                else:
                    break
    if level is not None:
        # Keep only the spans whose heading level matches.
        section_spans = compress(
            section_spans, [lvl == level for lvl in levels]
        )
    return self._section_spans_to_sections(section_spans, shadow)
@property
def tables(self) -> List['Table']:
    """Return every table in self, nested tables included."""
    return self.get_tables(recursive=True)
# [docs]  (Sphinx viewcode link artifact; not part of the source)
def get_tables(self, recursive=False) -> List['Table']:
    """Return tables. Include nested tables if `recursive` is `True`."""
    type_to_spans = self._type_to_spans
    lststr = self._lststr
    # Working copy of the shadow; matched tables are blanked out of it
    # so a later pass can match the tables that enclosed them.
    shadow_copy = self._shadow[:]
    ss, se, _, _ = self._span_data
    spans = type_to_spans.setdefault('Table', [])
    spans_append = spans.append
    # NOTE(review): skip_self_span is passed as the second positional
    # argument of finditer (pos); when self is itself a Table the scan
    # presumably starts at offset 1 to skip self's own match — confirm.
    skip_self_span = self._type == 'Table'
    span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get
    return_spans = []
    return_spans_append = return_spans.append
    # Pristine copy used for the byte content of new spans, because
    # shadow_copy itself gets progressively blanked.
    shadow_copy_copy = shadow_copy[:]
    def extract_tables_from_shadow():
        # Reads shadow_copy, shadow_copy_copy and ss from the enclosing
        # scope; those are rebound below before re-running this for each
        # parsable extension tag.
        m = True
        while m:
            # Repeat until a pass finds nothing: blanking an inner table
            # can expose the table that contains it.
            m = False
            for m in TABLE_FINDITER(shadow_copy, skip_self_span):
                ms, me = m.span()
                # Ignore leading whitespace using len(m[1]).
                # NOTE(review): m[1] is not referenced in this body; the
                # comment above looks stale — confirm against
                # TABLE_FINDITER's groups before relying on it.
                s, e = ss + ms, ss + me
                old_span = span_tuple_to_span_get((s, e))
                if old_span is None:
                    span = [s, e, None, shadow_copy_copy[ms:me]]
                    spans_append(span)
                    return_spans_append(span)
                else:
                    return_spans_append(old_span)
                # Blank the match so enclosing tables can match next.
                shadow_copy[ms:me] = b'_' * (me - ms)
    extract_tables_from_shadow()
    # Also look inside parsable extension tags, using each tag's own
    # shadow and offset.
    for tag in self._extension_tags:
        if tag.name in _parsable_tag_extensions:
            shadow_copy = tag._shadow[:]
            shadow_copy_copy = shadow_copy[:]
            # noinspection PyProtectedMember
            ss = tag._span_data[0]
            extract_tables_from_shadow()
    return_spans.sort()
    spans.sort()
    if not recursive:
        # Keep only tables not contained in another found table.
        return_spans = _outer_spans(return_spans)
    return [
        Table(lststr, type_to_spans, sp, 'Table') for sp in return_spans
    ]
@property
def _lists_shadow_ss(self) -> Tuple[bytearray, int]:
    """Return the shadow and its offset for use by `get_lists`."""
    span_start = self._span_data[0]
    return self._shadow, span_start
# [docs]  (Sphinx viewcode link artifact; not part of the source)
def get_lists(
    self, pattern: Union[str, Tuple[str]] = (r'\#', r'\*', '[:;]')
) -> List['WikiList']:
    r"""Return a list of WikiList objects.

    :param pattern: start pattern(s) for list items. Patterns are fed
        to the regex engine, so escape `*` where needed. Examples:
        - `'\#'`: top-level ordered lists
        - `'\#\*'`: unordered lists nested inside an ordered one
        - `'[:;]'`: crude support for definition lists
    Tips and tricks:
        The following patterns will probably cause malfunction in the
        `sublists` method of the resulting lists (harmless if you do
        not use `sublists` or `List.get_lists`):
        - `'\*+'` treats nested unordered lists as flat.
        - `'\*\s*'` rtstrips the `items` of the list.
    """
    patterns = (pattern,) if isinstance(pattern, str) else pattern
    found: List['WikiList'] = []
    lststr = self._lststr
    type_to_spans = self._type_to_spans
    registered = type_to_spans.setdefault('WikiList', [])
    find_registered = {(sp[0], sp[1]): sp for sp in registered}.get
    shadow, offset = self._lists_shadow_ss
    if any(':' in p for p in patterns):
        # Mask external links so colons inside URLs cannot be taken as
        # definition-list markers.
        for el_match in EXTERNAL_LINK_FINDITER(shadow):
            mb, me = el_match.span()
            shadow[mb:me] = b'_' * (me - mb)
    for p in patterns:
        list_regex = LIST_PATTERN_FORMAT.replace(b'{pattern}', p.encode(), 1)
        for list_match in finditer(list_regex, shadow, MULTILINE):
            mb, me = list_match.span()
            start, end = offset + mb, offset + me
            span = find_registered((start, end))
            if span is None:
                span = [start, end, None, shadow[mb:me]]
                insort_right(registered, span)
            found.append(
                WikiList(
                    lststr, p, list_match, type_to_spans, span, 'WikiList'
                )
            )
    found.sort(key=attrgetter('_span_data'))
    return found
@property
def _extension_tags(self):
    """Return a Tag object for every ExtensionTag span inside self."""
    lststr = self._lststr
    type_to_spans = self._type_to_spans
    tags = []
    for span in self._subspans('ExtensionTag'):
        tags.append(Tag(lststr, type_to_spans, span, 'ExtensionTag'))
    return tags
# [docs]  (Sphinx viewcode link artifact; not part of the source)
def get_tags(self, name=None) -> List['Tag']:
    """Return all tags with the given name.

    :param name: tag name to look for. When None, all extension tags
        are returned plus every HTML tag found by pairing start tags
        with end tags.
    """
    lststr = self._lststr
    type_to_spans = self._type_to_spans
    if name:
        if name in _tag_extensions:
            # Extension tags are already parsed into spans; just keep
            # those whose text begins with `<name`.
            string = lststr[0]
            return [
                Tag(lststr, type_to_spans, span, 'ExtensionTag')
                for span in type_to_spans['ExtensionTag']
                if match(r'<' + name + r'\b', string, pos=span[0])
                is not None
            ]
        tags = []  # type: List['Tag']
    else:
        # There is no name, add all extension tags. Before using shadow.
        tags = self._extension_tags
    tags_append = tags.append
    # Get the left-most start tag, match it to right-most end tag
    # and so on.
    # NOTE(review): the loop below actually iterates start tags in
    # reversed (right-to-left) order; each start tag is paired with the
    # first not-yet-consumed end tag after it (consumed end tags are
    # blanked in shadow_copy).
    ss = self._span_data[0]
    shadow = self._shadow
    if name:
        # There is a name but it is not in TAG_EXTENSIONS.
        reversed_start_matches = reversed(
            [
                m
                for m in rc(
                    START_TAG_PATTERN.replace(
                        rb'{name}', rb'(?P<name>' + name.encode() + rb')'
                    )
                ).finditer(shadow)
            ]
        )
        end_search = rc(
            END_TAG_PATTERN.replace(b'{name}', name.encode())
        ).search
    else:
        # No name: capture each start tag's name and build the matching
        # end pattern per start tag (below).
        reversed_start_matches = reversed(
            [m for m in NAME_CAPTURING_HTML_START_TAG_FINDITER(shadow)]
        )
    shadow_copy = shadow[:]
    spans = type_to_spans.setdefault('Tag', [])
    span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get
    spans_append = spans.append
    for start_match in reversed_start_matches:
        if start_match[0].rstrip(b' \t\n>')[-1] == 47:  # ord('/') == 47
            # Self-closing tag. Don't look for the end tag.
            # todo: some self-closing tags actually should be treated
            # as start tag in HTML5, see:
            # https://stackoverflow.com/questions/3558119/
            ms, me = start_match.span()
            span = [ss + ms, ss + me, None, shadow_copy[ms:me]]
        else:
            # look for the end-tag
            sms, sme = start_match.span()
            if name:
                # the end_search is already available
                # noinspection PyUnboundLocalVariable
                end_match = end_search(shadow_copy, sme)
            else:
                # build end_search according to start tag name
                end_match = search(
                    END_TAG_PATTERN.replace(
                        b'{name}', start_match['name']
                    ),
                    shadow_copy,
                    pos=sme,
                )
            if end_match:
                ems, eme = end_match.span()
                # Blank the consumed end tag so it cannot pair with
                # another start tag.
                shadow_copy[ems:eme] = b'_' * (eme - ems)
                span = [ss + sms, ss + eme, None, shadow[sms:eme]]
            else:
                # Assume start-only tag.
                span = [ss + sms, ss + sme, None, shadow_copy[sms:sme]]
        old_span = span_tuple_to_span_get((span[0], span[1]))
        if old_span is None:
            spans_append(span)
        else:
            span = old_span
        tags_append(Tag(lststr, type_to_spans, span, 'Tag'))
    spans.sort()
    tags.sort(key=attrgetter('_span_data'))
    return tags
# [docs]  (Sphinx viewcode link artifact; not part of the source)
@staticmethod
def parent(type_: Optional[str] = None) -> Optional['WikiText']:
    """Return None; the root node has no parent."""
    return None
# [docs]  (Sphinx viewcode link artifact; not part of the source)
@staticmethod
def ancestors(type_: Optional[str] = None) -> list:
    """Return an empty list; the root node has no ancestors."""
    return []
# [docs]  (Sphinx viewcode link artifact; not part of the source)
class SubWikiText(WikiText):
    """Define a class to be inherited by some subclasses of WikiText.

    Allow focusing on a particular part of WikiText.
    """

    # The only extra slot: this object's span-type key.
    __slots__ = '_type'

    def __init__(
        self,
        string: Union[str, MutableSequence[str]],
        _type_to_spans: Optional[Dict[str, List[List[int]]]] = None,
        _span: Optional[List[int]] = None,
        _type: Optional[Union[str, int]] = None,
    ) -> None:
        """Initialize the object.

        :param string: the wikitext string, or the shared mutable
            sequence holding it when constructed internally.
        :param _type_to_spans: internal; shared span registry.
        :param _span: internal; this object's own span data.
        :param _type: internal; span-type key. None means a user-level
            construction, in which case the class name is used.
        """
        if _type is None:
            # assert _span is None
            # assert _type_to_spans is None
            # https://youtrack.jetbrains.com/issue/PY-29770
            # noinspection PyDunderSlots,PyUnresolvedReferences
            self._type = _type = type(self).__name__
            super().__init__(string)
        else:
            # assert _span is not None
            # assert _type_to_spans is not None
            # https://youtrack.jetbrains.com/issue/PY-29770
            # noinspection PyDunderSlots,PyUnresolvedReferences
            self._type = _type
            super().__init__(string, _type_to_spans)
            self._span_data = _span

    def _subspans(self, _type: str) -> Generator[List[int], None, None]:
        """Yield all the sub-span data lists excluding self._span.

        (Return annotation fixed: whole span lists are yielded, not
        ints as the previous ``Generator[int, None, None]`` claimed.)
        """
        ss, se, _, _ = self._span_data
        spans = self._type_to_spans[_type]
        # Do not yield self._span by bisecting for s < ss.
        # The second bisect is an optimization and should be on [se + 1],
        # but empty spans are not desired thus [se] is used.
        b = bisect_left(spans, [ss])
        for span in spans[b : bisect_right(spans, [se], b)]:
            if span[1] <= se:
                yield span

    # noinspection PyProtectedMember
    def ancestors(self, type_: Optional[str] = None) -> List['WikiText']:
        """Return the ancestors of the current node.

        :param type_: the type of the desired ancestors as a string.
            Currently the following types are supported: {Template,
            ParserFunction, WikiLink, Comment, Parameter, ExtensionTag}.
            The default is None and means all the ancestors of any type
            above.
        """
        if type_ is None:
            types = SPAN_PARSER_TYPES
        else:
            types = (type_,)
        lststr = self._lststr
        type_to_spans = self._type_to_spans
        ss, se, _, _ = self._span_data
        ancestors = []
        ancestors_append = ancestors.append
        for type_ in types:
            cls = globals()[type_]
            spans = type_to_spans[type_]
            # Candidates start before ss; they are ancestors when their
            # end passes se.
            for span in spans[: bisect_right(spans, [ss])]:
                if se < span[1]:
                    ancestors_append(cls(lststr, type_to_spans, span, type_))
        # Innermost ancestor first (start closest to ss).
        return sorted(ancestors, key=lambda i: ss - i._span_data[0])

    def parent(self, type_: Optional[str] = None) -> Optional['WikiText']:
        """Return the parent node of the current object.

        :param type_: the type of the desired parent object.
            Currently the following types are supported: {Template,
            ParserFunction, WikiLink, Comment, Parameter, ExtensionTag}.
            The default is None and means the first parent, of any type
            above.
        :return: parent WikiText object or None if no parent with the
            desired `type_` is found.
        """
        ancestors = self.ancestors(type_)
        if ancestors:
            return ancestors[0]
        return None
def _outer_spans(sorted_spans: List[List[int]]) -> Iterable[List[int]]:
"""Yield the outermost intervals."""
for i, span in enumerate(sorted_spans):
se = span[1]
for ps, pe, _, _ in islice(sorted_spans, None, i):
if se < pe:
break
else: # none of the previous spans included span
yield span
def remove_markup(s: str, **kwargs) -> str:
    # plain_text_doc is appended to this docstring at module load time.
    """Return a string with wiki markup removed/replaced."""
    return WikiText(s).plain_text(_is_root_node=True, **kwargs)
# Shared keyword documentation for plain-text conversion; appended to both
# WikiText.plain_text.__doc__ and remove_markup.__doc__ below.
# Fix: the HTML-entity examples had been unescaped (all three rendered as
# `Σ`, which makes "replace X with Σ" meaningless); restored to the
# literal entity forms `&Sigma;`, `&#931;`, and `&#x3A3;`.
plain_text_doc = """
Comments are always removed.
:keyword replace_templates:
A function mapping `Template` objects to strings.
If True, replace `{{template|argument}}`s with `''`.
If False, ignore templates.
:keyword replace_parser_functions:
A function mapping `ParserFunction` objects to strings.
If True, replace `{{#parser_function:argument}}`s with `''`.
If False, ignore parser functions.
:keyword replace_parameters: Replace `{{{a}}}` with `` and {{{a|b}}}
with `b`.
:keyword replace_tags: Replace `<s>text</s>` with `text`.
:keyword replace_external_links: Replace `[https://wikimedia.org/ wm]`
with `wm`, and `[https://wikimedia.org/]` with ``.
:keyword replace_wikilinks: Replace wikilinks with their text
representation, e.g. `[[a|b]]` with `b` and `[[a]]` with `a`.
:keyword unescape_html_entities: Replace HTML entities like `&Sigma;`,
`&#931;`, and `&#x3A3;` with `Σ`.
:keyword replace_bolds: replace `'''b'''` with `b`.
:keyword replace_italics: replace `''i''` with `i`.
"""
WikiText.plain_text.__doc__ += plain_text_doc
remove_markup.__doc__ += plain_text_doc
if __name__ == '__main__':
    # To make PyCharm happy! http://stackoverflow.com/questions/41524090
    # NOTE(review): guarded by the __name__ check, these imports execute
    # only when this module is run directly.  The module code above
    # references Bold, Section, Table, etc., so when imported normally
    # those names are presumably provided elsewhere in the package —
    # confirm before restructuring.
    from ._comment_bold_italic import Bold, Comment, Italic
    from ._externallink import ExternalLink
    from ._parameter import Parameter
    from ._parser_function import ParserFunction
    from ._section import Section
    from ._table import Table
    from ._tag import Tag
    from ._template import Template
    from ._wikilink import WikiLink
    from ._wikilist import LIST_PATTERN_FORMAT, WikiList