jenkins-bot has submitted this change and it was merged.
Change subject: timestripper: prevent recognizing components too far from each other ......................................................................
timestripper: prevent recognizing components too far from each other
timestripper should not be too flexible about the locations of the components of a timestamp. The added test demonstrates a false positive: a line that contains no timestamp but was incorrectly recognized as having one.
This patch places a limit on the distance between neighboring components of a timestamp. Tentatively, the limit is set to 10 characters.
Change-Id: I8ef86e21f08248d6abb7d1b78252029d2ce0c017 --- M pywikibot/textlib.py M tests/timestripper_tests.py 2 files changed, 44 insertions(+), 14 deletions(-)
Approvals: Dalba: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index fc7c4b1..5a0e80e 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -128,6 +128,11 @@ 'or': u'୦୧୨୩୪୫୬୭୮୯', }
+# Used in TimeStripper. When a timestamp-like line has longer gaps +# than this between year, month, etc. in it, then the line will not be +# considered to contain a timestamp. +TIMESTAMP_GAP_LIMIT = 10 +
def to_local_digits(phrase, lang): """ @@ -1972,17 +1977,19 @@ return (txt, None)
@staticmethod - def _valid_date_dict_order(dateDict): + def _valid_date_dict_positions(dateDict): """Check consistency of reasonable positions for groups.""" - day_pos = dateDict['day']['pos'] - month_pos = dateDict['month']['pos'] - year_pos = dateDict['year']['pos'] - time_pos = dateDict['time']['pos'] - tzinfo_pos = dateDict['tzinfo']['pos'] + time_pos = dateDict['time']['start'] + tzinfo_pos = dateDict['tzinfo']['start'] + date_pos = sorted( + (dateDict['day'], dateDict['month'], dateDict['year']), + key=lambda x: x['start']) + min_pos, max_pos = date_pos[0]['start'], date_pos[-1]['start'] + max_gap = max(x[1]['start'] - x[0]['end'] + for x in zip(date_pos, date_pos[1:]))
- date_pos = sorted((day_pos, month_pos, year_pos)) - min_pos, max_pos = date_pos[0], date_pos[-1] - + if max_gap > TIMESTAMP_GAP_LIMIT: + return False if tzinfo_pos < min_pos or tzinfo_pos < time_pos: return False if min_pos < tzinfo_pos < max_pos: @@ -2023,15 +2030,16 @@ line, match_obj = self._last_match_and_replace(line, pat) if match_obj: for group, value in match_obj.groupdict().items(): - pos = match_obj.start(group) - # Store also match pos in line, for later order check. - matchDict = {group: {'value': value, 'pos': pos}} - dateDict.update(matchDict) + start, end = (match_obj.start(group), match_obj.end(group)) + # The positions are stored for later validation + dateDict[group] = { + 'value': value, 'start': start, 'end': end + }
# all fields matched -> date valid # groups are in a reasonable order. if (all(g in dateDict for g in self.groups) and - self._valid_date_dict_order(dateDict)): + self._valid_date_dict_positions(dateDict)): # remove 'time' key, now split in hour/minute and not needed # by datetime. del dateDict['time'] diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py index 90eca23..1fdd938 100644 --- a/tests/timestripper_tests.py +++ b/tests/timestripper_tests.py @@ -142,6 +142,28 @@ )
+class TestTimeStripperNumberAndDate(TestTimeStripperCase): + + """Test cases for lines with (non-year) numbers and timestamps.""" + + family = 'wikipedia' + code = 'en' + + def test_four_digit_is_not_year_with_no_timestamp(self): + """A 4-digit number should not be mistaken as year (w/o timestamp).""" + self.assertIsNone( + self.ts.timestripper( + '2000 people will meet on 16 December at 22:00 (UTC).')) + + def test_four_digit_is_not_year_with_timestamp(self): + """A 4-digit number should not be mistaken as year (w/ timestamp).""" + self.assertEqual( + self.ts.timestripper( + '2000 people will attend. --12:12, 14 December 2015 (UTC)'), + datetime.datetime( + 2015, 12, 14, 12, 12, tzinfo=tzoneFixedOffset(0, 'UTC'))) + + class TestTimeStripperLanguage(TestCase):
"""Test cases for English language."""