jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/337293 )
Change subject: timestripper: search wikilinks to reduce false matches ......................................................................
timestripper: search wikilinks to reduce false matches
Search for timestamps in wikilinks as well.
After searching, wikilinks are censored (replaced by placeholder text of the same length), so digits inside wikilinks cannot cause false matches, while the gaps between date parts (used in _valid_date_dict_positions()) are preserved.
Hyperlinks are censored as well, to avoid false matches.
Deprecated: - Timestripper.linkP, in favour of an internal variable. - Timestripper.comment_pattern, in favour of an internal variable.
Added tests for comments, wikilinks and hyperlinks.
Change-Id: I6fb15b2eed6845e57b31042f9312890025b1b7c7 --- M pywikibot/textlib.py M tests/timestripper_tests.py 2 files changed, 154 insertions(+), 5 deletions(-)
Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 57c338a..75a68c6 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -7,7 +7,7 @@
""" # -# (C) Pywikibot team, 2008-2016 +# (C) Pywikibot team, 2008-2017 # # Distributed under the terms of the MIT license. # @@ -1927,11 +1927,25 @@ self.pdayR, ]
- self.linkP = compileLinkR() - self.comment_pattern = re.compile(r'<!--(.*?)-->') + self._hyperlink_pat = re.compile(r'\[\s*?http[s]?://[^\]]*?\]') + self._comment_pat = re.compile(r'<!--(.*?)-->') + self._wikilink_pat = re.compile( + r'\[\[(?P<link>[^\]\|]*?)(?P<anchor>\|[^\]]*)?\]\]')
self.tzinfo = tzoneFixedOffset(self.site.siteinfo['timeoffset'], self.site.siteinfo['timezone']) + + @property + @deprecated('_hyperlink_pat') + def linkP(self): + """Deprecated linkP instance variable.""" + return self._hyperlink_pat + + @property + @deprecated('_comment_pat') + def comment_pattern(self): + """Deprecated comment_pattern instance variable.""" + return self._comment_pat
@deprecated('module function') def findmarker(self, text, base=u'@@', delta='@'): @@ -2014,23 +2028,47 @@ @return: A timestamp found on the given line @rtype: pywikibot.Timestamp """ + # Try to maintain gaps that are used in _valid_date_dict_positions() + def censor_match(match): + return '_' * (match.end() - match.start()) + # match date fields dateDict = dict() + # Analyze comments separately from rest of each line to avoid to skip # dates in comments, as the date matched by timestripper is the # rightmost one. most_recent = [] - for comment in self.comment_pattern.finditer(line): + for comment in self._comment_pat.finditer(line): # Recursion levels can be maximum two. If a comment is found, it will # not for sure be found in the next level. # Nested comments are excluded by design. timestamp = self.timestripper(comment.group(1)) most_recent.append(timestamp)
+ # Censor comments. + line = self._comment_pat.sub(censor_match, line) + + # Censor external links. + line = self._hyperlink_pat.sub(censor_match, line) + + for wikilink in self._wikilink_pat.finditer(line): + # Recursion levels can be maximum two. If a link is found, it will + # not for sure be found in the next level. + # Nested links are excluded by design. + link, anchor = wikilink.group('link'), wikilink.group('anchor') + timestamp = self.timestripper(link) + most_recent.append(timestamp) + if anchor: + timestamp = self.timestripper(anchor) + most_recent.append(timestamp) + + # Censor wikilinks. + line = self._wikilink_pat.sub(censor_match, line) + # Remove parts that are not supposed to contain the timestamp, in order # to reduce false positives. line = removeDisabledParts(line) - line = self.linkP.sub('', line) # remove external links
line = self.fix_digits(line) for pat in self.patterns: diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py index 125f7c1..ec2d0b5 100644 --- a/tests/timestripper_tests.py +++ b/tests/timestripper_tests.py @@ -272,6 +272,117 @@ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+class TestTimeStripperTreatSpecialText(TestTimeStripperCase): + + """Test special text behaviour (comments, hyperlinks, wikilinks).""" + + family = 'wikisource' + code = 'en' + + date = '06:57 06 June 2015 (UTC)' + fake_date = '05:57 06 June 2015 (UTC)' + tzone = tzoneFixedOffset(0, 'UTC') + expected_date = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=tzone) + + def test_timestripper_match_comment(self): + """Test that comments are correctly matched.""" + ts = self.ts + + txt_match = self.date + '<!--a test comment-->' + exp_match = 'a test comment' + self.assertEqual(ts._comment_pat.search(txt_match).group(1), + exp_match) + + def test_timestripper_match_hyperlink(self): + """Test that hyperlinks are correctly matched.""" + ts = self.ts + + txt_match = '[http://test.org | a link]' + exp_match = '[http://test.org | a link]' + self.assertEqual(ts._hyperlink_pat.search(txt_match).group(), + exp_match) + + def test_timestripper_match_wikilink(self): + """Test that wikilinks are correctly matched.""" + ts = self.ts + + txt_match = '[[wikilink|a wikilink with no date]]' + exp_match_link = 'wikilink' + exp_match_anchor = '|a wikilink with no date' + self.assertEqual(ts._wikilink_pat.search(txt_match).group('link'), + exp_match_link) + self.assertEqual(ts._wikilink_pat.search(txt_match).group('anchor'), + exp_match_anchor) + + def test_timestripper_match_comment_with_date(self): + """Test that dates in comments are correctly matched.""" + ts = self.ts.timestripper + + txt_match = self.date + '<!--' + self.fake_date + '-->' + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '<!--' + self.fake_date + '-->' + self.date + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '<!--' + self.date + '-->' + self.fake_date + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '<!--comment|' + self.date + '-->' + self.fake_date + self.assertEqual(ts(txt_match), self.expected_date) + + def test_timestripper_skip_hyperlink(self): + 
"""Test that dates in hyperlinks are correctly skipped.""" + ts = self.ts.timestripper + + txt_match = self.date + '[http://' + self.fake_date + ']' + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '[http://' + self.fake_date + ']' + self.date + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = ('%s [http://www.org | link with date %s]' + % (self.date, self.fake_date)) + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '[http://' + self.fake_date + ']' + self.date + self.assertEqual(ts(txt_match), self.expected_date) + + def test_timestripper_skip_hyperlink_and_do_not_connect(self): + """Test that skipping hyperlinks will not make gaps shorter.""" + ts = self.ts.timestripper + + txt_match = ('%s[http://example.com Here is long enough text]%s' + % (self.date[:9], self.date[9:])) + self.assertEqual(ts(txt_match), None) + + def test_timestripper_match_wikilink_with_date(self): + """Test that dates in wikilinks are correctly matched.""" + ts = self.ts.timestripper + + txt_match = self.date + '[[' + self.fake_date + ']]' + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '[[' + self.fake_date + ']]' + self.date + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '[[' + self.date + ']]' + self.fake_date + self.assertEqual(ts(txt_match), self.expected_date) + + txt_match = '[[wikilink|' + self.date + ']]' + self.fake_date + self.assertEqual(ts(txt_match), self.expected_date) + + def test_timestripper_skip_wikilink_and_do_not_connect(self): + """Test that skipping wikilinks will not make gaps shorter.""" + ts = self.ts.timestripper + + txt_match = ('%s[[Here is long enough text]]%s' + % (self.date[:9], self.date[9:])) + self.assertEqual(ts(txt_match), None) + + txt_match = self.date[:9] + '[[foo]]' + self.date[9:] + self.assertEqual(ts(txt_match), self.expected_date) + + class TestTimeStripperDoNotArchiveUntil(TestTimeStripperCase):
"""Test cases for Do Not Archive Until templates.