from bisect import insort_right
from collections.abc import Mapping
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
from regex import DOTALL, VERBOSE
from ._cell import (
INLINE_HAEDER_CELL_MATCH,
INLINE_NONHAEDER_CELL_MATCH,
NEWLINE_CELL_MATCH,
Cell,
)
from ._spans import ATTRS_MATCH
from ._tag import SubWikiTextWithAttrs
from ._wikitext import WS, rc
# Matcher for the caption line of a table, anchored at the start of the
# table text (the bound ``.match`` of the pattern is stored).
# Named groups:
#   preattrs - from the opening ``{|`` through the ``|+`` that starts the
#              caption line;
#   attrs    - optional; text between ``|+`` and a single ``|`` (a ``|`` that
#              is not followed by another ``|``); the ``|`` itself is NOT part
#              of the group;
#   caption  - the caption text, ending lazily before a newline followed by
#              ``|`` or ``!``, or before ``||``.
# NOTE(review): ``rc`` comes from ``._wikitext``; presumably a regex-compile
# helper -- confirm there.
CAPTION_MATCH = rc(
rb"""
# Everything until the caption line
(?P<preattrs>
# Start of table
{\|
(?:
(?:
(?!\n\s*+\|)
[\s\S]
)*?
)
# Start of caption line
\n\s*+\|\+
)
# Optional caption attrs
(?:
(?P<attrs>[^\n|]*+)
\|(?!\|)
)?
(?P<caption>.*?)
(?:\n[\|\!]|\|\|)
""",
DOTALL | VERBOSE,
).match
# Generic cell payload type used in the annotations of _apply_attr_spans.
T = TypeVar('T')
# Iterates over every ``|-`` occurrence; group 1 is whatever follows it
# (used by Table.row_attrs to locate the row attributes).
FIND_ROWS = rc(rb'\|-(.*)').finditer
# Matches optional leading whitespace followed by digits at the start of a
# value; used by head_int to read rowspan/colspan numbers.
HEAD_DIGITS = rc(rb'\s*+\d+').match
# Captions are optional and only one should be placed between table-start
# and the first row. Other captions are not part of the table and will
# be ignored.
# The pattern finds a newline followed by optional tab/space/NUL characters
# and then either ``|`` (not followed by ``+``) or ``!``; the ``\Z``
# alternative matches at the end of the input instead, which is why callers
# use ``.start()`` on the result without checking for None.
FIRST_NON_CAPTION_LINE = rc(rb'\n[\t \0]*+(\|(?!\+)|!)|\Z').search
def head_int(value):
    """Return the integer formed by the leading digits of *value*.

    :param value: an attribute value as passed by ``_apply_attr_spans``
        (the result of ``attrs.get(...)``), or None when the attribute is
        absent.
    :return: the parsed number, or 1 when *value* is None or does not start
        with (optionally whitespace-prefixed) digits.
    """
    if value is not None:
        leading = HEAD_DIGITS(value)
        if leading is not None:
            return int(leading[0])
    return 1
[docs]
class Table(SubWikiTextWithAttrs):
    """A wikitext table (``{| ... |}``).

    Gives access to the table's cell values (``data``), cell objects
    (``cells``), caption, caption attributes and row attributes.
    """

    __slots__ = '_attrs_match_cache'

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent class and reset the cache."""
        super().__init__(*args, **kwargs)
        # (attrs match, string it was computed for); see ``_attrs_match``.
        self._attrs_match_cache = None, None

    @property
    def nesting_level(self) -> int:
        """Return the nesting level of self.

        The minimum nesting_level is 0. Being part of any Table increases
        the level by one.
        """
        return self._nesting_level(('Table',)) - 1

    @property
    def _table_shadow(self) -> bytearray:
        """Return a copy of the shadow with sub-table spans overwritten.

        Every 'Table' subspan that does not start where this table starts is
        replaced by ``#`` bytes of the same length, so offsets stay valid.
        """
        shadow = self._shadow[:]
        ss = self._span_data[0]
        for s, e, _, _ in self._subspans('Table'):
            if s == ss:
                # A span starting at our own start offset is not masked.
                continue
            shadow[s - ss : e - ss] = b'#' * (e - s)
        return shadow

    @property
    def _match_table(self) -> List[List[Any]]:
        """Return the cell matches of the table, grouped by row.

        Each inner list holds the match objects produced by the cell
        matchers for one row, in order. ``[[]]` is returned when indexing
        past the end of the shadow while looking for the first row.
        """
        table_shadow = self._table_shadow
        # Remove table-start and table-end marks.
        pos = table_shadow.find(10)  # ord('\n')
        lsp = _lstrip_increase(table_shadow, pos)
        # Remove everything until the first row
        try:
            # The while condition may raise IndexError if the table is empty.
            while table_shadow[lsp] not in b'!|':
                pos = table_shadow.find(10, lsp)  # ord('\n')
                lsp = _lstrip_increase(table_shadow, pos)
        except IndexError:
            return [[]]
        # Start of the first row
        match_table = []
        pos = FIRST_NON_CAPTION_LINE(table_shadow, pos).start()
        rsp = _row_separator_increase(table_shadow, pos)
        pos = -1
        # Stop once skipping row separators no longer advances the position.
        while pos != rsp:
            pos = rsp
            # We have a new row.
            m = NEWLINE_CELL_MATCH(table_shadow, pos)
            # Don't add a row if there are no new cells.
            if m:
                match_row = []  # type: List[Any]
                match_table.append(match_row)
            while m is not None:
                match_row.append(m)
                sep = m['sep']
                pos = m.end()
                # Collect the cells that follow on the same line; the inline
                # matcher depends on whether the line holds data or headers.
                if sep == b'|':
                    m = INLINE_NONHAEDER_CELL_MATCH(table_shadow, pos)
                    while m is not None:
                        match_row.append(m)
                        pos = m.end()
                        m = INLINE_NONHAEDER_CELL_MATCH(table_shadow, pos)
                elif sep == b'!':
                    m = INLINE_HAEDER_CELL_MATCH(table_shadow, pos)
                    while m is not None:
                        match_row.append(m)
                        pos = m.end()
                        m = INLINE_HAEDER_CELL_MATCH(table_shadow, pos)
                pos = FIRST_NON_CAPTION_LINE(table_shadow, pos).start()
                m = NEWLINE_CELL_MATCH(table_shadow, pos)
            rsp = _row_separator_increase(table_shadow, pos)
        return match_table

    def data(
        self,
        span: bool = True,
        strip: bool = True,
        row: Optional[int] = None,
        column: Optional[int] = None,
    ) -> Union[List[List[str]], List[str], str]:
        """Return a list containing lists of row values.

        :param span: If true, calculate rows according to rowspans and
            colspans attributes. Otherwise ignore them.
        :param strip: Strip data values.
        :param row: Return the specified row only. Zero-based index.
        :param column: Return the specified column only. Zero-based index.
        :return: the whole table, one row, one column, or a single value,
            depending on which of *row* and *column* are given.

        Note: Due to the lots of complications that it may cause, this
        function won't look inside templates, parser functions, etc.
        See https://www.mediawiki.org/wiki/Extension:Pipe_Escape for how
        wiki-tables can be inserted within templates.
        """
        match_table = self._match_table
        # Note string is only used for extracting data, matching is done
        # over the shadow.
        string = self.string
        table_data = []  # type: List[List[str]]
        if strip:
            for match_row in match_table:
                row_data = []  # type: List[str]
                table_data.append(row_data)
                for m in match_row:
                    # Spaces after the first newline can be meaningful
                    s, e = m.span('data')
                    row_data.append(string[s:e].lstrip(' ').rstrip(WS))
        else:
            for match_row in match_table:
                row_data = []
                table_data.append(row_data)
                for m in match_row:
                    s, e = m.span('data')
                    row_data.append(string[s:e])
        if table_data and span:
            # Encode once per call: the encoded string is the same for every
            # cell, so there is no reason to rebuild it inside the loops.
            encoded = string.encode('ascii', 'replace')
            table_attrs = []  # type: List[List[Dict[str, str]]]
            for match_row in match_table:
                row_attrs = []  # type: List[Dict[str, str]]
                table_attrs.append(row_attrs)
                for m in match_row:
                    s, e = m.span('attrs')
                    captures = ATTRS_MATCH(encoded, s, e).captures
                    row_attrs.append(
                        dict(
                            zip(
                                captures('attr_name'),
                                captures('attr_value'),
                            )
                        )
                    )
            table_data = _apply_attr_spans(table_attrs, table_data)
        if row is None:
            if column is None:
                return table_data
            return [r[column] for r in table_data]
        if column is None:
            return table_data[row]
        return table_data[row][column]

    def cells(
        self,
        row: Optional[int] = None,
        column: Optional[int] = None,
        span: bool = True,
    ) -> Union[List[List[Cell]], List[Cell], Cell]:
        """Return a list of lists containing Cell objects.

        :param row: Return the specified row only. Zero-based index.
        :param column: Return the specified column only. Zero-based index.
        :param span: If is True, rearrange the result according to colspan
            and rowspan attributes.

        If both row and column are provided, return the relevant cell
        object.

        If only need the values inside cells, then use the ``data`` method
        instead.
        """
        tbl_span = self._span_data
        ss = tbl_span[0]
        match_table = self._match_table
        shadow = self._shadow
        # Cell spans of this table are stored under a key derived from the
        # identity of the table's own span data.
        type_ = id(tbl_span)
        type_to_spans = self._type_to_spans
        spans = type_to_spans.setdefault(type_, [])
        table_cells = []  # type: List[List[Cell]]
        table_attrs = []  # type: List[List[Dict[str, str]]]
        # Stays None for every cell when span is False.
        attrs_match = None
        for match_row in match_table:
            row_cells = []  # type: List[Cell]
            table_cells.append(row_cells)
            if span:
                row_attrs = []  # type: List[Dict[str, str]]
                table_attrs.append(row_attrs)
                row_attrs_append = row_attrs.append
            for m in match_row:
                header = m['sep'] == b'!'
                ms, me = m.span()
                cell_span = [ss + ms, ss + me, None, shadow[ms:me]]
                if span:
                    s, e = m.span('attrs')
                    # Note: ATTRS_MATCH always matches, even to empty
                    # strings. Also ATTRS_MATCH should match against the
                    # cell string so that it can be used easily as cache
                    # later in Cells.
                    attrs_match = ATTRS_MATCH(shadow[ms:me], s - ms, e - ms)
                    captures = attrs_match.captures
                    # noinspection PyUnboundLocalVariable
                    row_attrs_append(
                        dict(
                            zip(captures('attr_name'), captures('attr_value'))
                        )
                    )
                # Reuse an equal span that is already registered so that
                # cells created by separate calls share the same span object.
                old_span = next(
                    (known for known in spans if known == cell_span), None
                )
                if old_span is None:
                    insort_right(spans, cell_span)
                else:
                    cell_span = old_span
                row_cells.append(
                    Cell(
                        self._lststr,
                        header,
                        type_to_spans,
                        cell_span,
                        type_,
                        m,
                        attrs_match,
                    )
                )
        if table_cells and span:
            table_cells = _apply_attr_spans(table_attrs, table_cells)
        if row is None:
            if column is None:
                return table_cells
            return [r[column] for r in table_cells]
        if column is None:
            return table_cells[row]
        return table_cells[row][column]

    @property
    def caption(self) -> Optional[str]:
        """Caption of the table. Support get and set.

        The getter returns None when no caption line is found. The setter
        replaces the text of an existing caption (leaving any caption
        attributes and their ``|`` separator untouched) or, when there is no
        caption, inserts a new ``|+`` line after the first line of the table.
        """
        m = CAPTION_MATCH(self._shadow)
        if m:
            return self(*m.span('caption'))
        return None

    @caption.setter
    def caption(self, newcaption: str) -> None:
        shadow = self._shadow
        m = CAPTION_MATCH(shadow)
        if m:
            # Replace exactly the span that the getter reads. Starting at
            # m.end('attrs') instead would also overwrite the ``|`` that
            # separates the attributes from the caption, because that
            # separator is not part of the ``attrs`` group.
            s, e = m.span('caption')
            self[s:e] = newcaption
            return
        # There is no caption. Create one.
        head, sep, _ = shadow.partition(b'\n')
        # Insert caption after the first line.
        self.insert(len(head + sep), '|+' + newcaption + '\n')

    @property
    def _attrs_match(self) -> Any:
        """Return the ATTRS_MATCH result for the table's first line.

        The match is cached together with the string it was computed for
        and recomputed only when the current string differs.
        """
        cache_match, cache_string = self._attrs_match_cache
        string = self.string
        if cache_string == string:
            return cache_match
        shadow = self._shadow
        # Skip the two-byte table start; stop at the first newline.
        attrs_match = ATTRS_MATCH(shadow, 2, shadow.find(10))  # ord('\n')
        self._attrs_match_cache = attrs_match, string
        return attrs_match

    @property
    def caption_attrs(self) -> Optional[str]:
        """Caption attributes. Support get and set operations.

        The getter returns None when there is no caption line or the caption
        line has no attributes part. The setter replaces existing caption
        attributes, adds an attributes part to a caption that has none, or
        creates a new caption line holding only the attributes.
        """
        m = CAPTION_MATCH(self._shadow)
        if m:
            s, e = m.span('attrs')
            if s != -1:
                return self(s, e)
        return None

    @caption_attrs.setter
    def caption_attrs(self, attrs: str) -> None:
        shadow = self._shadow
        m = CAPTION_MATCH(shadow)
        if not m:  # There is no caption-line
            head, sep, _ = shadow.partition(b'\n')
            self.insert(len(head + sep), '|+' + attrs + '|\n')
            return
        preattrs_end = m.end('preattrs')
        attrs_end = m.end('attrs')
        if attrs_end != -1:
            # Caption and attrs: overwrite the existing attributes.
            self[preattrs_end:attrs_end] = attrs
        else:
            # Caption but no attrs: add the attributes and the ``|`` that
            # separates them from the caption right after ``|+``.
            self.insert(preattrs_end, attrs + '|')

    @property
    def row_attrs(self) -> List[dict]:
        """Row attributes.

        Use the setter of this property to set attributes for all rows.
        Note that it will overwrite all the existing attr values.
        """
        shadow = self._table_shadow
        string = self.string
        attrs = []
        append = attrs.append
        for row_match in FIND_ROWS(shadow):
            s, e = row_match.span(1)
            spans = ATTRS_MATCH(shadow, s, e).spans
            # Positions come from the shadow; the text is read from string.
            append(
                {
                    string[ns:ne]: string[vs:ve]
                    for (ns, ne), (vs, ve) in zip(
                        spans('attr_name'), spans('attr_value')
                    )
                }
            )
        return attrs

    @row_attrs.setter
    def row_attrs(self, attrs: List[Mapping]):
        # Work from the last row backwards so that edits do not shift the
        # offsets of the rows that are still to be processed.
        for row_match, attrs_dict in reversed(
            [*zip(FIND_ROWS(self._table_shadow), attrs)]
        ):
            s, e = row_match.span(1)
            del self[s:e]
            self.insert(
                s,
                ''.join(
                    [
                        f' {name}="{value}"' if value else f' {name}'
                        for name, value in attrs_dict.items()
                    ]
                ),
            )
def _apply_attr_spans(
    table_attrs: List[List[Dict[str, str]]], table_data: List[List[T]]
) -> List[List[T]]:
    """Lay the cells out on a grid according to rowspan and colspan.

    Based on the table forming algorithm described at
    http://www.w3.org/TR/html5/tabular-data.html#processing-model-1
    (the algorithm for assigning header cells is skipped).

    :param table_attrs: one attribute dict per cell, arranged in rows that
        parallel *table_data*; ``b'rowspan'`` and ``b'colspan'`` are read.
    :param table_data: the cells, arranged in rows.
    :return: a new list of rows in which every cell occupies each slot it
        spans; slots no cell covers hold None.
    """
    grid = []  # type: List[List[Optional[T]]]
    # Cells declared with rowspan="0": (cell, first column, column count).
    growing: List[Tuple[Optional[T], int, int]] = []
    row_index = height = width = 0

    def pad_rows() -> None:
        # Bring every existing row up to the current grid width.
        for grid_row in grid:
            shortfall = width - len(grid_row)
            if shortfall > 0:
                grid_row.extend([None] * shortfall)

    def fill_growing(y: int) -> None:
        # Copy every downward-growing cell into row *y*.
        for grown, first, count in growing:
            target = grid[y]
            for x in range(first, first + count):
                target[x] = grown

    for attrs_row, cells in zip(table_attrs, table_data):
        # Open a new grid row unless an earlier rowspan already created it.
        if row_index == height:
            height += 1
            grid.append([None] * width)
        fill_growing(row_index)
        col_index = 0
        for attrs, cell in zip(attrs_row, cells):
            lookup = attrs.get
            current_row = grid[row_index]
            # Skip slots already taken by cells spanning from above.
            while col_index < width and current_row[col_index] is not None:
                col_index += 1
            if col_index == width:
                width += 1
                pad_rows()
            # colspan="0" tells the browser to span the cell to the last
            # column of the column group; it is treated as 1 here.
            colspan = head_int(lookup(b'colspan')) or 1
            # rowspan="0" tells the browser to span the cell to the last
            # row of the table; it starts with height 1 and is grown on
            # every following row.
            rowspan = head_int(lookup(b'rowspan'))
            grows_downward = rowspan == 0
            if grows_downward:
                rowspan = 1
            right = col_index + colspan
            if width < right:
                width = right
                pad_rows()
            bottom = row_index + rowspan
            if height < bottom:
                height = bottom
                while len(grid) < height:
                    grid.append([None] * width)
            # Slots that already hold a cell are overwritten; the reference
            # algorithm calls that overlap a table model error.
            for y in range(row_index, bottom):
                target = grid[y]
                for x in range(col_index, right):
                    target[x] = cell
            if grows_downward:
                growing.append((cell, col_index, colspan))
            col_index = right
        row_index += 1
    # Ending the row group: let the downward-growing cells reach the rows
    # that exist only because of earlier rowspans.
    while row_index < height:
        fill_growing(row_index)
        row_index += 1
    return grid
def _lstrip_increase(shadow: bytearray, pos: int) -> int:
"""Return the new position to lstrip the shadow."""
length = len(shadow)
while pos < length and shadow[pos] in {0, 9, 10, 13, 32}: # \0\t\n\r space
pos += 1
return pos
def _row_separator_increase(shadow: bytearray, pos: int) -> int:
    """Return the position after the starting row separator line.

    Also skips any semi-caption lines before and after the separator.
    *pos* is returned unchanged when the first non-caption line found from
    *pos* is not a row separator (does not start with ``|-``).
    """
    # General format of row separators: r'\|-[^\n]*\n'
    line_start = _lstrip_increase(
        shadow, FIRST_NON_CAPTION_LINE(shadow, pos).start()
    )
    while True:
        if shadow[line_start : line_start + 2] != b'|-':
            return pos
        # We are on a row separator line; continue from its end.
        newline = shadow.find(10, line_start + 2)  # ord('\n')
        pos = FIRST_NON_CAPTION_LINE(shadow, newline).start()
        line_start = _lstrip_increase(shadow, pos)