jenkins-bot has submitted this change and it was merged.
Change subject: Improve timestripper to support more languages ......................................................................
Improve timestripper to support more languages
-Fixed regex of month digit from \d{1,2} to (?:1[012]|0?[1-9]) -Added the Korean suffix characters for day, month and year to the date regexes -Added a fixer that converts non-Latin digits to Latin digits before parsing (for languages like fa and ckb)
I tested it on three languages that are working now: ko, fa and ckb.
Change-Id: Iddbccc7cbcf16e77ca334d9a0d434f9e084884eb --- M pywikibot/textlib.py M tests/archivebot_tests.py M tests/timestripper_tests.py 3 files changed, 32 insertions(+), 9 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 9a2bd23..49bcf35 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -33,6 +33,15 @@
TEMP_REGEX = re.compile( '{{(?:msg:)?(?P<name>[^{|]+?)(?:|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}') +NON_LATIN_DIGITS = [ + u'٠١٢٣٤٥٦٧٨٩', # ckb + u'۰۱۲۳۴۵۶۷۸۹', # fa + u'೦೧೨೩೪೫೬೭೮೯', # kn + u'०१२३४५६७८९', # hi and some other + u'০১২৩৪৫৬৭৮৯', # bn + u'૦૧૨૩૪૫૬૭૮૯', # gu + u'୦୧୨୩୪୫୬୭୮୯', # or +]
def unescape(s): @@ -1198,8 +1207,7 @@
timeR = r'(?P<time>(?P<hour>([0-1]\d|2[0-3]))[:.h](?P<minute>[0-5]\d))' timeznR = r'((?P<tzinfo>[A-Z]+))' - yearR = r'(?P<year>(19|20)\d\d)' - + yearR = r'(?P<year>(19|20)\d\d)(?:%s)?' % u'\ub144' # if months have 'digits' as names, they need to be # removed; will be handled as digits in regex, adding d+{1,2}.? escaped_months = [_ for _ in self.origNames2monthNum if @@ -1207,13 +1215,14 @@ # match longest names first. escaped_months = [re.escape(_) for _ in sorted(escaped_months, reverse=True)] - # work around for cs wiki: if month are in digits, we assume # that format is dd. mm. (with dot and spaces optional) + # the last one is workaround for Korean if any(_.isdigit() for _ in self.origNames2monthNum): self.is_digit_month = True - monthR = r'(?P<month>(%s)|\d{1,2}.?)' % u'|'.join(escaped_months) - dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9])).?\s*[01]?\d.?' + monthR = r'(?P<month>(%s)(?:\u0654)?|(?:1[012]|0?[1-9]).?(?:\uc6d4)?)' \ + % u'|'.join(escaped_months) + dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))(?:%s)?.?\s*[01]?\d.?' % u'\uc77c' else: self.is_digit_month = False monthR = r'(?P<month>(%s))' % u'|'.join(escaped_months) @@ -1222,7 +1231,7 @@ self.ptimeR = re.compile(timeR) self.ptimeznR = re.compile(timeznR) self.pyearR = re.compile(yearR) - self.pmonthR = re.compile(monthR, re.U) + self.pmonthR = re.compile(monthR) self.pdayR = re.compile(dayR)
# order is important to avoid mismatch when searching @@ -1235,10 +1244,17 @@ ]
def findmarker(self, text, base=u'@@', delta='@'): - # find a string which is not part of text + """Find a string which is not part of text.""" while base in text: base += delta return base + + def fix_digits(self, line): + """Make non-latin digits like Persian to latin to parse.""" + for system in NON_LATIN_DIGITS: + for i in range(0, 10): + line = line.replace(system[i], str(i)) + return line
def last_match_and_replace(self, txt, pat): """ @@ -1278,6 +1294,7 @@ """ # match date fields dateDict = dict() + line = self.fix_digits(line) for pat in self.patterns: line, matchDict = self.last_match_and_replace(line, pat) if matchDict: diff --git a/tests/archivebot_tests.py b/tests/archivebot_tests.py index 954c1b7..5c57d54 100644 --- a/tests/archivebot_tests.py +++ b/tests/archivebot_tests.py @@ -70,9 +70,9 @@ self.assertIsInstance(thread.content, basestring) self.assertIsInstance(thread.timestamp, datetime)
- expected_failures = ['ar', 'ckb', 'fa', 'pdc', 'th'] + expected_failures = ['ar', 'pdc', 'th'] # expected failures - should be fixed - # 'ar', 'ckb', 'fa': no digits in date, regex does not match + # 'ar': Uses Arabic acronym for TZ # 'pdc': changed month name setting in wiki over time (?) # in old posts in talk page, February is "Feb.", site message gives # <message name="feb" xml:space="preserve">Han.</message>. diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py index d625c4b..0f265e3 100644 --- a/tests/timestripper_tests.py +++ b/tests/timestripper_tests.py @@ -139,6 +139,12 @@ 'match': u'3 February 2010 19:48 (UTC) 7 February 2010 19:48 (UTC)', 'nomatch': u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)', }, + 'fawiki': { + 'family': 'wikipedia', + 'code': 'fa', + 'match': u'۳ فوریهٔ ۲۰۱۰، ساعت ۱۹:۴۸ (UTC) ۷ فوریهٔ ۲۰۱۰، ساعت ۱۹:۴۸ (UTC)', + 'nomatch': u'۳ ۲ ۲۰۱۴ ۱۹:۴۸ (UTC) ۷ ۲ ۲۰۱۰ ۱۹:۴۸ (UTC)', + }, 'frwiki': { 'family': 'wikipedia', 'code': 'fr',
pywikibot-commits@lists.wikimedia.org