jenkins-bot has submitted this change and it was merged.
Change subject: Bug 69315 - cs timestamp not supported ......................................................................
Bug 69315 - cs timestamp not supported
cswiki uses timestamps of the form "21. 5. 2014, 17:07 (UTC)".
Because the month and day formats can be identical on such wikis, once the month is matched, days from 1 to 12 are wiped out by last_match_and_replace(). Changed last_match_and_replace() to replace only the first N-2 matches and the last match, leaving match N-1 as the day candidate.
Also modified: - fixed a bug in the monthR regex: if a month name contains a dot, it needs to be escaped. This happened e.g. on wikipedia:no. The KeyError exception is now re-raised instead of being swallowed, which was masking this bug.
- extended the day regex to include the dot (not mandatory, but it improves the robustness of the search).
- renamed self.timeznR to self.ptimeznR and self.yearR to self.pyearR for consistency with the naming convention.
- fixed a pep8 error at line 859
Change-Id: Ifee1041cf9762f419c48fc6cb0faa56b84d0bee4 --- M pywikibot/textlib.py M tests/archivebot_tests.py M tests/timestripper_tests.py 3 files changed, 129 insertions(+), 15 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index d0ac855..0b56f68 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -856,7 +856,7 @@ else: sep = config.line_separator # Some people don't like the categories sorted - #catLinks.sort() + # catLinks.sort() return sep.join(catLinks) + config.line_separator
@@ -1181,6 +1181,8 @@ for n, (_long, _short) in enumerate(self.site.months_names, start=1): self.origNames2monthNum[_long] = n self.origNames2monthNum[_short] = n + # in some cases month in ~~~~ might end without dot even if + # site.months_names do not. if _short.endswith('.'): self.origNames2monthNum[_short[:-1]] = n
@@ -1189,20 +1191,22 @@ timeR = r'(?P<time>(?P<hour>[0-2]\d)[:.h](?P<minute>[0-5]\d))' timeznR = r'((?P<tzinfo>[A-Z]+))' yearR = r'(?P<year>(19|20)\d\d)' - monthR = r'(?P<month>(%s))' % (u'|'.join(self.origNames2monthNum)) - dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))' + # if months name contain a dot, it needs to be escaped. + escaped_months = [re.escape(_) for _ in self.origNames2monthNum] + monthR = r'(?P<month>(%s))' % u'|'.join(escaped_months) + dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9])).?'
self.ptimeR = re.compile(timeR) - self.timeznR = re.compile(timeznR) - self.yearR = re.compile(yearR) + self.ptimeznR = re.compile(timeznR) + self.pyearR = re.compile(yearR) self.pmonthR = re.compile(monthR, re.U) self.pdayR = re.compile(dayR)
# order is important to avoid mismatch when searching self.patterns = [ self.ptimeR, - self.timeznR, - self.yearR, + self.ptimeznR, + self.pyearR, self.pmonthR, self.pdayR, ] @@ -1218,12 +1222,21 @@ Take the rightmost match, to prevent spurious earlier matches, and replace with marker """ m = None + cnt = 0 for m in pat.finditer(txt): - pass + cnt += 1
if m: marker = self.findmarker(txt) - txt = pat.sub(marker, txt) + # month and day format might be identical (e.g. see bug 69315), + # avoid to wipe out day, after month is matched. + # replace all matches but the one before last, which is the day candidate. + if pat == self.pmonthR: + txt = pat.sub(marker, txt, cnt - 2) + # matched month needs to be wiped out (last match of txt) + txt = re.sub(r'(.*)%s' % m.group(), r'\1%s' % marker, txt) + else: + txt = pat.sub(marker, txt) return (txt, m.groupdict()) else: return (txt, None) @@ -1241,7 +1254,6 @@ line, matchDict = self.last_match_and_replace(line, pat) if matchDict: dateDict.update(matchDict) - # all fields matched -> date valid if all(g in dateDict for g in self.groups): # remove 'time' key, now splitted in hour/minute and not needed by datetime @@ -1251,14 +1263,19 @@ try: dateDict['month'] = self.origNames2monthNum[dateDict['month']] except KeyError: - pywikibot.output(u'incorrect month name in page') + pywikibot.output(u'incorrect month name "%s" in page in site %s' + % (dateDict['month'], self.site)) + raise KeyError
# convert to integers for k, v in dateDict.items(): + if k == 'tzinfo': + continue try: dateDict[k] = int(v) except ValueError: - pass + raise ValueError('Value: %s could not be converted for key: %s.' + % (v, k))
# find timezone dateDict['tzinfo'] = tzoneFixedOffset(self.site.siteinfo['timeoffset'], diff --git a/tests/archivebot_tests.py b/tests/archivebot_tests.py index fa1d915..8539ef3 100644 --- a/tests/archivebot_tests.py +++ b/tests/archivebot_tests.py @@ -73,9 +73,14 @@ for code in THREADS: test_name = "test_wikipedia_" + code
- if code in ['ar', 'ckb', 'en', 'fa', 'frr', 'no', 'pdc', 'pt', 'th', - 'ug']: + if code in ['ar', 'ckb', 'fa', 'pdc', 'th']: # expected failures - should be fixed + # 'ar', 'ckb', 'fa': no digits in date, regex does not match + # 'pdc': changed month name setting in wiki over time (?) + # in old posts in talk page, February is "Feb.", site message gives + # <message name="feb" xml:space="preserve">Han.</message>. + # for new entries it should work + # 'th': year is 2552 while regex assumes 19..|20.., might be fixed dct[test_name] = unittest.expectedFailure(test_method(code)) else: dct[test_name] = test_method(code) diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py index 79a1878..1b3f171 100644 --- a/tests/timestripper_tests.py +++ b/tests/timestripper_tests.py @@ -36,7 +36,7 @@
txtWithMatch = u'this string has one 1998, 1999 and 3000 in it' txtWithNoMatch = u'this string has no match' - pat = self.ts.yearR + pat = self.ts.pyearR
self.assertEqual(self.ts.last_match_and_replace(txtWithMatch, pat), (u'this string has one @@, @@ and 3000 in it', @@ -62,6 +62,98 @@ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+class TestEnglishTimeStripper(PywikibotTestCase): + """Test cases for Link objects""" + + def setUp(self): + site = pywikibot.Site('en', 'wikipedia') + self.ts = TimeStripper(site) + super(TestEnglishTimeStripper, self).setUp() + + def test_timestripper(self): + """Test that correct date is matched""" + + txtMatch = u'3 February 2010 19:48 (UTC) 7 February 2010 19:48 (UTC)' + txtNoMatch = u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)' + + tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'], + self.ts.site.siteinfo['timezone']) + + res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone) + + self.assertEqual(self.ts.timestripper(txtMatch), res) + self.assertEqual(self.ts.timestripper(txtNoMatch), None) + + +class TestCzechTimeStripper(PywikibotTestCase): + """Test cases for Link objects""" + + def setUp(self): + site = pywikibot.Site('cs', 'wikipedia') + self.ts = TimeStripper(site) + super(TestCzechTimeStripper, self).setUp() + + def test_timestripper(self): + """Test that correct date is matched""" + + txtMatch = u'3. 2. 2010, 19:48 (UTC) 7. 2. 
2010 19:48 (UTC)' + txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)' + + tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'], + self.ts.site.siteinfo['timezone']) + + res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone) + + self.assertEqual(self.ts.timestripper(txtMatch), res) + self.assertEqual(self.ts.timestripper(txtNoMatch), None) + + +class TestPortugueseTimeStripper(PywikibotTestCase): + """Test cases for Link objects""" + + def setUp(self): + site = pywikibot.Site('pt', 'wikipedia') + self.ts = TimeStripper(site) + super(TestPortugueseTimeStripper, self).setUp() + + def test_timestripper(self): + """Test that correct date is matched""" + + txtMatch = u'19h48min de 3 de fevereiro de 2010 (UTC) 19h48min de 7 de fevereiro de 2010 (UTC)' + txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)' + + tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'], + self.ts.site.siteinfo['timezone']) + + res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone) + + self.assertEqual(self.ts.timestripper(txtMatch), res) + self.assertEqual(self.ts.timestripper(txtNoMatch), None) + + +class TestNorwegianTimeStripper(PywikibotTestCase): + """Test cases for Link objects""" + + def setUp(self): + site = pywikibot.Site('no', 'wikipedia') + self.ts = TimeStripper(site) + super(TestNorwegianTimeStripper, self).setUp() + + def test_timestripper(self): + """Test that correct date is matched""" + + txtMatch = u'3. feb 2010 kl. 19:48 (CET) 7. feb 2010 kl. 19:48 (UTC)' + txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)' + + tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'], + self.ts.site.siteinfo['timezone']) + + res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone) + + self.assertEqual(self.ts.timestripper(txtMatch), res) + self.assertEqual(self.ts.timestripper(txtNoMatch), None) + + if __name__ == '__main__': try: unittest.main()