jenkins-bot has submitted this change and it was merged.
Change subject: Don't ignore "DoNotArchiveUntil" timestamps ......................................................................
Don't ignore "DoNotArchiveUntil" timestamps
Don't ignore timestamps written in HTML comments, e.g. as used by "DoNotArchiveUntil".
See: - https://commons.wikimedia.org/wiki/Template:DNAU - https://en.wikipedia.org/wiki/Template:Do_not_archive_until
Analyze comments separately from the rest of each line to avoid skipping dates in comments, as the date matched by timestripper is the rightmost one.
Bug: T102423 Change-Id: I079d9f6b636ac0a145dd04a3190a65c61b9d1b31 --- M pywikibot/textlib.py M tests/timestripper_tests.py 2 files changed, 72 insertions(+), 3 deletions(-)
Approvals: Merlijn van Deen: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 26b291f..a1d2c55 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -1394,6 +1394,10 @@ ]
self.linkP = compileLinkR() + self.comment_pattern = re.compile(r'<!--(.*?)-->') + + self.tzinfo = tzoneFixedOffset(self.site.siteinfo['timeoffset'], + self.site.siteinfo['timezone'])
def findmarker(self, text, base=u'@@', delta='@'): """Find a string which is not part of text.""" @@ -1446,6 +1450,17 @@ """ # match date fields dateDict = dict() + # Analyze comments separately from rest of each line to avoid to skip + # dates in comments, as the date matched by timestripper is the + # rightmost one. + most_recent = [] + for comment in self.comment_pattern.finditer(line): + # Recursion levels can be maximum two. If a comment is found, it will + # not for sure be found in the next level. + # Nested cmments are excluded by design. + timestamp = self.timestripper(comment.group(1)) + most_recent.append(timestamp) + # Remove parts that are not supposed to contain the timestamp, in order # to reduce false positives. line = removeDisabledParts(line) @@ -1481,12 +1496,17 @@ % (v, k))
# find timezone - dateDict['tzinfo'] = tzoneFixedOffset(self.site.siteinfo['timeoffset'], - self.site.siteinfo['timezone']) + dateDict['tzinfo'] = self.tzinfo
timestamp = datetime.datetime(**dateDict) - else: timestamp = None
+ most_recent.append(timestamp) + + try: + timestamp = max(ts for ts in most_recent if ts is not None) + except ValueError: + timestamp = None + return timestamp diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py index 80d70e0..a3f7e77 100644 --- a/tests/timestripper_tests.py +++ b/tests/timestripper_tests.py @@ -231,6 +231,55 @@ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+class TestTimeStripperDoNotArchiveUntil(TestCase): + + """Test cases for Do Not Archive Until templates. + + See https://commons.wikimedia.org/wiki/Template:DNAU and + https://en.wikipedia.org/wiki/Template:Do_not_archive_until. + """ + + family = 'wikisource' + code = 'en' + + cached = True + + username = '[[User:DoNotArchiveUntil]]' + date = '06:57 06 June 2015 (UTC)' + user_and_date = username + ' ' + date + tzone = tzoneFixedOffset(0, 'UTC') + + def test_timestripper_match(self): + """Test that dates in comments are correctly recognised.""" + ts = TimeStripper(self.get_site()) + + txt_match = '<!-- [[User:Do___ArchiveUntil]] ' + self.date + ' -->' + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + txt_match = '<!-- --> <!-- ' + self.user_and_date + ' <!-- -->' + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + txt_match = '<!-- ' + self.user_and_date + ' -->' + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + def test_timestripper_match_only(self): + """Test that latest date is used instead of other dates.""" + ts = TimeStripper(self.get_site()) + + later_date = '10:57 06 June 2015 (UTC)' + txt_match = '<!-- --> ' + self.user_and_date + ' <!-- -->' + later_date + res = datetime.datetime(2015, 6, 6, 10, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + earlier_date = '02:57 06 June 2015 (UTC)' + txt_match = '<!-- ' + self.user_and_date + ' --> ' + earlier_date + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + if __name__ == '__main__': try: unittest.main()