jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/863288 )
Change subject: [cleanup] Improvements for TimeStripper ......................................................................
[cleanup] Improvements for TimeStripper
- replace self.patterns by a TimeStripperPatterns - deprecate single regex patterns attributes - remove self.groups and introduce TIMEGROUPS constant - reduce nested flow statements in _last_match_and_replace - raise KexError with a message instead printing the message in _valid_date_dict_positions - use f-strings instead of format method - add a usage sample for TimeStripper
Change-Id: I1e01642e4bd94e998d4480c99e249884cd5cfdac --- M pywikibot/textlib.py 1 file changed, 137 insertions(+), 47 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 8b252a8..9543fc8 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -1832,9 +1832,32 @@ # Time parsing functionality (Archivebot) # ---------------------------------------
+TIMEGROUPS = ('time', 'tzinfo', 'year', 'month', 'day', 'hour', 'minute') + +#: Hold precompiled timestamp patterns for :class:`TimeStripper`. +#: Order of TimeStripperPatterns is important to avoid mismatch when searching. +#: +#: .. versionadded:: 8.0 +TimeStripperPatterns = namedtuple('TimeStripperPatterns', TIMEGROUPS[:-2]) + + class TimeStripper:
- """Find timestamp in page and return it as pywikibot.Timestamp object.""" + """Find timestamp in page and return it as pywikibot.Timestamp object. + + .. versionchanged:: 8.0 + *group* attribute is a set instead of a list. + *patterns* is a :class:`TimeStripperPatterns` namedtuple instead + of a list. + + **Example**: + + >>> site = pywikibot.Site('wikipedia:fr') + >>> sign = 'Merci bien Xqt (d) 15 mai 2013 à 20:34 (CEST)' + >>> ts = TimeStripper(site) + >>> ts.timestripper(sign) + Timestamp(2013, 5, 15, 20, 34, tzinfo=TZoneFixedOffset(3600, Europe/Paris)) + """
def __init__(self, site=None) -> None: """Initializer.""" @@ -1854,24 +1877,21 @@ if short.endswith('.'): self.origNames2monthNum[func(short[:-1])] = n
- self.groups = ['year', 'month', 'hour', 'time', 'day', 'minute', - 'tzinfo'] - timeR = (r'(?P<time>(?P<hour>([0-1]\d|2[0-3]))[:.h]' r'(?P<minute>[0-5]\d))') timeznR = r'((?P<tzinfo>[A-Z]+))' yearR = r'(?P<year>(19|20)\d\d)(?:{})?'.format('\ub144') # if months have 'digits' as names, they need to be # removed; will be handled as digits in regex, adding d+{1,2}.? - escaped_months = [_ for _ in self.origNames2monthNum if - not _.strip('.').isdigit()] + escaped_months = [month for month in self.origNames2monthNum if + not month.strip('.').isdigit()] # match longest names first. - escaped_months = [re.escape(_) for - _ in sorted(escaped_months, reverse=True)] + escaped_months = [re.escape(month) for + month in sorted(escaped_months, reverse=True)] # work around for cs wiki: if month are in digits, we assume # that format is dd. mm. (with dot and spaces optional) # the last one is workaround for Korean - if any(_.isdigit() for _ in self.origNames2monthNum): + if any(month.isdigit() for month in self.origNames2monthNum): self.is_digit_month = True monthR = r'(?P<month>({})|(?:1[012]|0?[1-9]).)' \ .format('|'.join(escaped_months)) @@ -1882,20 +1902,13 @@ monthR = r'(?P<month>({}))'.format('|'.join(escaped_months)) dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9])).?'
- self.ptimeR = re.compile(timeR) - self.ptimeznR = re.compile(timeznR) - self.pyearR = re.compile(yearR) - self.pmonthR = re.compile(monthR) - self.pdayR = re.compile(dayR) - - # order is important to avoid mismatch when searching - self.patterns = [ - self.ptimeR, - self.ptimeznR, - self.pyearR, - self.pmonthR, - self.pdayR, - ] + self.patterns = TimeStripperPatterns( + re.compile(timeR), + re.compile(timeznR), + re.compile(yearR), + re.compile(monthR), + re.compile(dayR), + )
self._hyperlink_pat = re.compile(r'[\s*?http[s]?://[^]]*?]') self._comment_pat = re.compile(r'<!--(.*?)-->') @@ -1905,6 +1918,66 @@ self.tzinfo = TZoneFixedOffset(self.site.siteinfo['timeoffset'], self.site.siteinfo['timezone'])
+ @property + @deprecated('patterns.time', since='8.0.0') + def ptimeR(self): + """Deprecated time pattern attribute. + + .. deprecated:: 8.0 + use pattern.time instead + """ + return self.patterns.time + + @property + @deprecated('patterns.tzinfo', since='8.0.0') + def ptimeznR(self): + """Deprecated tzinfo pattern attribute. + + .. deprecated:: 8.0 + use patterns.tzinfo instead + """ + return self.patterns.tzinfo + + @property + @deprecated('patterns.year', since='8.0.0') + def pyearR(self): + """Deprecated year pattern attribute. + + .. deprecated:: 8.0 + use patterns.year instead + """ + return self.patterns.year + + @property + @deprecated('patterns.month', since='8.0.0') + def pmonthR(self): + """Deprecated month pattern attribute. + + .. deprecated:: 8.0 + use patterns.month instead + """ + return self.patterns.month + + @property + @deprecated('patterns.day', since='8.0.0') + def pdayR(self): + """Deprecated day pattern attribute. + + .. deprecated:: 8.0 + use patterns.day instead + """ + return self.patterns.day + + @property + @deprecated('textlib.TIMEGROUPS', since='8.0.0') + def groups(self): + """Deprecated groups attribute. + + .. deprecated:: 8.0 + use textlib.TIMEGROUPS instead + """ + return TIMEGROUPS + @staticmethod @deprecated('to_latin_digits() function', since='7.0.0') def fix_digits(line): @@ -1916,8 +1989,7 @@ return to_latin_digits(line)
def _last_match_and_replace(self, txt: str, pat): - """ - Take the rightmost match and replace with marker. + """Take the rightmost match and replace with marker.
It does so to prevent spurious earlier matches. """ @@ -1936,21 +2008,21 @@ """ return '@' * (m.end() - m.start())
- if m: - # month and day format might be identical (e.g. see bug T71315), - # avoid to wipe out day, after month is matched. - # replace all matches but the last two - # (i.e. allow to search for dd. mm.) - if pat == self.pmonthR: - if self.is_digit_month: - if cnt > 2: - txt = pat.sub(marker, txt, cnt - 2) - else: - txt = pat.sub(marker, txt) - else: - txt = pat.sub(marker, txt) - return (txt, m) - return (txt, None) + if not m: + return (txt, None) + + # month and day format might be identical (e.g. see bug T71315), + # avoid to wipe out day, after month is matched. Replace all matches + # but the last two (i.e. allow to search for dd. mm.) + if pat != self.patterns.month: + txt = pat.sub(marker, txt) + elif self.is_digit_month: + if cnt > 2: + txt = pat.sub(marker, txt, cnt - 2) + else: + txt = pat.sub(marker, txt) + + return (txt, m)
@staticmethod def _valid_date_dict_positions(dateDict) -> bool: @@ -2042,7 +2114,7 @@
# all fields matched -> date valid # groups are in a reasonable order. - if (all(g in dateDict for g in self.groups) + if (all(g in dateDict for g in TIMEGROUPS) and self._valid_date_dict_positions(dateDict)): # remove 'time' key, now split in hour/minute and not needed # by datetime. @@ -2052,9 +2124,10 @@ try: value = self.origNames2monthNum[dateDict['month']['value']] except KeyError: - pywikibot.info('incorrect month name "{}" in page in site {}' - .format(dateDict['month']['value'], self.site)) - raise KeyError + raise KeyError( + f"incorrect month name {dateDict['month']['value']!r} " + f'in page in site {self.site}' + ) else: dateDict['month']['value'] = value
@@ -2065,9 +2138,8 @@ try: dateDict[k] = int(v['value']) except ValueError: - raise ValueError( - 'Value: {} could not be converted for key: {}.' - .format(v['value'], k)) + raise ValueError(f"Value: {v['value']} could not be " + f'converted for key: {k}.')
# find timezone dateDict['tzinfo'] = self.tzinfo