jenkins-bot has submitted this change and it was merged.
Change subject: Improve timestripper to support more languages
......................................................................
Improve timestripper to support more languages
-Fixed regex of month digit from \d{1,2} to (?:1[012]|0?[1-9])
-Added the Korean suffix characters for day (일), month (월) and year (년)
-Added a fixer that converts non-Latin digits to Latin digits (for languages like fa and ckb)
I tested it on three languages that are working now: ko, fa and ckb.
Change-Id: Iddbccc7cbcf16e77ca334d9a0d434f9e084884eb
---
M pywikibot/textlib.py
M tests/archivebot_tests.py
M tests/timestripper_tests.py
3 files changed, 32 insertions(+), 9 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 9a2bd23..49bcf35 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -33,6 +33,15 @@
TEMP_REGEX = re.compile(
'{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
+NON_LATIN_DIGITS = [
+ u'٠١٢٣٤٥٦٧٨٩', # ckb
+ u'۰۱۲۳۴۵۶۷۸۹', # fa
+ u'೦೧೨೩೪೫೬೭೮೯', # kn
+ u'०१२३४५६७८९', # hi and some other
+ u'০১২৩৪৫৬৭৮৯', # bn
+ u'૦૧૨૩૪૫૬૭૮૯', # gu
+ u'୦୧୨୩୪୫୬୭୮୯', # or
+]
def unescape(s):
@@ -1198,8 +1207,7 @@
timeR =
r'(?P<time>(?P<hour>([0-1]\d|2[0-3]))[:\.h](?P<minute>[0-5]\d))'
timeznR = r'\((?P<tzinfo>[A-Z]+)\)'
- yearR = r'(?P<year>(19|20)\d\d)'
-
+ yearR = r'(?P<year>(19|20)\d\d)(?:%s)?' % u'\ub144'
# if months have 'digits' as names, they need to be
# removed; will be handled as digits in regex, adding d+{1,2}\.?
escaped_months = [_ for _ in self.origNames2monthNum if
@@ -1207,13 +1215,14 @@
# match longest names first.
escaped_months = [re.escape(_) for
_ in sorted(escaped_months, reverse=True)]
-
# work around for cs wiki: if month are in digits, we assume
# that format is dd. mm. (with dot and spaces optional)
+ # the last one is workaround for Korean
if any(_.isdigit() for _ in self.origNames2monthNum):
self.is_digit_month = True
- monthR = r'(?P<month>(%s)|\d{1,2}\.?)' %
u'|'.join(escaped_months)
- dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))\.?\s*[01]?\d\.?'
+ monthR =
r'(?P<month>(%s)(?:\u0654)?|(?:1[012]|0?[1-9])\.?(?:\uc6d4)?)' \
+ % u'|'.join(escaped_months)
+ dayR =
r'(?P<day>(3[01]|[12]\d|0?[1-9]))(?:%s)?\.?\s*[01]?\d\.?' %
u'\uc77c'
else:
self.is_digit_month = False
monthR = r'(?P<month>(%s))' %
u'|'.join(escaped_months)
@@ -1222,7 +1231,7 @@
self.ptimeR = re.compile(timeR)
self.ptimeznR = re.compile(timeznR)
self.pyearR = re.compile(yearR)
- self.pmonthR = re.compile(monthR, re.U)
+ self.pmonthR = re.compile(monthR)
self.pdayR = re.compile(dayR)
# order is important to avoid mismatch when searching
@@ -1235,10 +1244,17 @@
]
def findmarker(self, text, base=u'@@', delta='@'):
- # find a string which is not part of text
+ """Find a string which is not part of text."""
while base in text:
base += delta
return base
+
+ def fix_digits(self, line):
+ """Make non-latin digits like Persian to latin to
parse."""
+ for system in NON_LATIN_DIGITS:
+ for i in range(0, 10):
+ line = line.replace(system[i], str(i))
+ return line
def last_match_and_replace(self, txt, pat):
"""
@@ -1278,6 +1294,7 @@
"""
# match date fields
dateDict = dict()
+ line = self.fix_digits(line)
for pat in self.patterns:
line, matchDict = self.last_match_and_replace(line, pat)
if matchDict:
diff --git a/tests/archivebot_tests.py b/tests/archivebot_tests.py
index 954c1b7..5c57d54 100644
--- a/tests/archivebot_tests.py
+++ b/tests/archivebot_tests.py
@@ -70,9 +70,9 @@
self.assertIsInstance(thread.content, basestring)
self.assertIsInstance(thread.timestamp, datetime)
- expected_failures = ['ar', 'ckb', 'fa', 'pdc',
'th']
+ expected_failures = ['ar', 'pdc', 'th']
# expected failures - should be fixed
- # 'ar', 'ckb', 'fa': no digits in date, regex does not match
+ # 'ar': Uses Arabic acronym for TZ
# 'pdc': changed month name setting in wiki over time (?)
# in old posts in talk page, February is "Feb.", site message gives
# <message name="feb"
xml:space="preserve">Han.</message>.
diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py
index d625c4b..0f265e3 100644
--- a/tests/timestripper_tests.py
+++ b/tests/timestripper_tests.py
@@ -139,6 +139,12 @@
'match': u'3 February 2010 19:48 (UTC) 7 February 2010 19:48
(UTC)',
'nomatch': u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48
(UTC)',
},
+ 'fawiki': {
+ 'family': 'wikipedia',
+ 'code': 'fa',
+ 'match': u'۳ فوریهٔ ۲۰۱۰، ساعت ۱۹:۴۸ (UTC) ۷ فوریهٔ ۲۰۱۰، ساعت
۱۹:۴۸ (UTC)',
+ 'nomatch': u'۳ ۲ ۲۰۱۴ ۱۹:۴۸ (UTC) ۷ ۲ ۲۰۱۰ ۱۹:۴۸ (UTC)',
+ },
'frwiki': {
'family': 'wikipedia',
'code': 'fr',
--
To view, visit
https://gerrit.wikimedia.org/r/161256
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Iddbccc7cbcf16e77ca334d9a0d434f9e084884eb
Gerrit-PatchSet: 9
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Revi <gerrit(a)revi.pe.kr>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>