jenkins-bot has submitted this change and it was merged.
Change subject: timestripper: make algorithm more robust
......................................................................
timestripper: make algorithm more robust
Make algorithm more robust by checking that matched time and date fields
are also positioned in a reasonable order.
Bug: T131357
Change-Id: Id237912710cfd767e6d0638eabb7819a3dd8519b
---
M pywikibot/textlib.py
M tests/archivebot_tests.py
M tests/timestripper_tests.py
3 files changed, 165 insertions(+), 67 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index d379203..4492d8c 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -1932,7 +1932,7 @@
line = line.replace(system[i], str(i))
return line
- def last_match_and_replace(self, txt, pat):
+ def _last_match_and_replace(self, txt, pat):
"""
Take the rightmost match and replace with marker.
@@ -1943,8 +1943,17 @@
for m in pat.finditer(txt):
cnt += 1
+ def marker(m):
+ """
+ Replace exactly the same number of matched characters.
+
+ Same number of chars shall be replaced, in order to be able to
+ compare pos for matches reliably (absolute pos of a match
+ is not altered by replacement).
+ """
+ return '@' * (m.end() - m.start())
+
if m:
- marker = findmarker(txt)
# month and day format might be identical (e.g. see bug T71315),
# avoid to wipe out day, after month is matched.
# replace all matches but the last two
@@ -1957,9 +1966,29 @@
txt = pat.sub(marker, txt)
else:
txt = pat.sub(marker, txt)
- return (txt, m.groupdict())
+ return (txt, m)
else:
return (txt, None)
+
+ @staticmethod
+ def _valid_date_dict_order(dateDict):
+ """Check consistency of reasonable positions for groups."""
+ day_pos = dateDict['day']['pos']
+ month_pos = dateDict['month']['pos']
+ year_pos = dateDict['year']['pos']
+ time_pos = dateDict['time']['pos']
+ tzinfo_pos = dateDict['tzinfo']['pos']
+
+ date_pos = sorted((day_pos, month_pos, year_pos))
+ min_pos, max_pos = date_pos[0], date_pos[-1]
+
+ if tzinfo_pos < min_pos or tzinfo_pos < time_pos:
+ return False
+ if min_pos < tzinfo_pos < max_pos:
+ return False
+ if min_pos < time_pos < max_pos:
+ return False
+ return True
def timestripper(self, line):
"""
@@ -1979,7 +2008,7 @@
for comment in self.comment_pattern.finditer(line):
# Recursion levels can be maximum two. If a comment is found, it will
# not for sure be found in the next level.
- # Nested cmments are excluded by design.
+ # Nested comments are excluded by design.
timestamp = self.timestripper(comment.group(1))
most_recent.append(timestamp)
@@ -1990,32 +2019,41 @@
line = self.fix_digits(line)
for pat in self.patterns:
- line, matchDict = self.last_match_and_replace(line, pat)
- if matchDict:
- dateDict.update(matchDict)
+ line, match_obj = self._last_match_and_replace(line, pat)
+ if match_obj:
+ for group, value in match_obj.groupdict().items():
+ pos = match_obj.start(group)
+ # Store also match pos in line, for later order check.
+ matchDict = {group: {'value': value, 'pos': pos}}
+ dateDict.update(matchDict)
# all fields matched -> date valid
- if all(g in dateDict for g in self.groups):
- # remove 'time' key, now split in hour/minute and not needed by datetime
+ # groups are in a reasonable order.
+ if (all(g in dateDict for g in self.groups) and
+ self._valid_date_dict_order(dateDict)):
+ # remove 'time' key, now split in hour/minute and not needed
+ # by datetime.
del dateDict['time']
# replace month name in original language with month number
try:
- dateDict['month'] = self.origNames2monthNum[dateDict['month']]
+ value = self.origNames2monthNum[dateDict['month']['value']]
except KeyError:
pywikibot.output(u'incorrect month name "%s" in page in site %s'
- % (dateDict['month'], self.site))
+ % (dateDict['month']['value'], self.site))
raise KeyError
+ else:
+ dateDict['month']['value'] = value
- # convert to integers
+ # convert to integers and remove the inner dict
for k, v in dateDict.items():
if k == 'tzinfo':
continue
try:
- dateDict[k] = int(v)
+ dateDict[k] = int(v['value'])
except ValueError:
raise ValueError('Value: %s could not be converted for key: %s.'
- % (v, k))
+ % (v['value'], k))
# find timezone
dateDict['tzinfo'] = self.tzinfo
diff --git a/tests/archivebot_tests.py b/tests/archivebot_tests.py
index 9ea2ab8..4cf2e43 100644
--- a/tests/archivebot_tests.py
+++ b/tests/archivebot_tests.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests for archivebot scripts."""
#
-# (C) Pywikibot team, 2014
+# (C) Pywikibot team, 2016
#
# Distributed under the terms of the MIT license.
#
@@ -28,6 +28,10 @@
'nl': 9, 'nn': 0, 'no': 0, 'pdc': 25, 'pfl': 3, 'pl': 8, 'pt': 0, 'ro': 1,
'ru': 20, 'scn': 2, 'simple': 1, 'sr': 0, 'sv': 5, 'th': 1, 'tr': 7,
'ug': 0, 'uk': 1, 'uz': 1, 'vi': 1, 'zh': 4, 'zh-yue': 2,
+}
+
+THREADS_WITH_UPDATED_FORMAT = {
+ 'eo': 1, 'pdc': 1,
}
@@ -79,7 +83,7 @@
raise
expected_failures = ['ar', 'eo', 'pdc', 'th']
- # FIXME:
+ # FIXME: see TestArchiveBotAfterDateUpdate()
# 'ar': Uses Arabic acronym for TZ
# 'eo': changed month name setting in wiki from Sep to sep
# Localisation updates from https://translatewiki.net.
@@ -93,6 +97,58 @@
# 'th': year is 2552 while regex assumes 19..|20.., might be fixed
+class TestArchiveBotAfterDateUpdate(TestCase):
+
+ """
+ Test archivebot script on failures on Wikipedia sites.
+
+ If failure is due to updated date format on wiki, test pages with
+ new format only.
+ """
+
+ family = 'wikipedia'
+ sites = dict([(code, {'family': 'wikipedia', 'code': code})
+ for code in THREADS_WITH_UPDATED_FORMAT])
+
+ cached = True
+
+ def test_archivebot(self, code=None):
+ """Test archivebot for one site."""
+ site = self.get_site(code)
+ page = pywikibot.Page(site, 'user talk:mpaa')
+ talk = archivebot.DiscussionPage(page, None)
+ self.assertIsInstance(talk.archives, dict)
+ self.assertIsInstance(talk.archived_threads, int)
+ self.assertTrue(talk.archiver is None)
+ self.assertIsInstance(talk.header, basestring)
+ self.assertIsInstance(talk.timestripper, TimeStripper)
+
+ self.assertIsInstance(talk.threads, list)
+ self.assertGreaterEqual(
+ len(talk.threads), THREADS_WITH_UPDATED_FORMAT[code],
+ u'%d Threads found on %s,\n%d or more expected'
+ % (len(talk.threads), talk, THREADS_WITH_UPDATED_FORMAT[code]))
+
+ for thread in talk.threads:
+ self.assertIsInstance(thread, archivebot.DiscussionThread)
+ self.assertIsInstance(thread.title, basestring)
+ self.assertIsInstance(thread.now, datetime)
+ self.assertEqual(thread.now, talk.now)
+ self.assertIsInstance(thread.ts, TimeStripper)
+ self.assertEqual(thread.ts, talk.timestripper)
+ self.assertIsInstance(thread.code, basestring)
+ self.assertEqual(thread.code, talk.timestripper.site.code)
+ self.assertIsInstance(thread.content, basestring)
+ try:
+ self.assertIsInstance(thread.timestamp, datetime)
+ except AssertionError:
+ if thread.code not in self.expected_failures:
+ pywikibot.output('code %s: %s' % (thread.code, thread.content))
+ raise
+
+ expected_failures = []
+
+
if __name__ == '__main__': # pragma: no cover
try:
unittest.main()
diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py
index 8c7ab0a..90eca23 100644
--- a/tests/timestripper_tests.py
+++ b/tests/timestripper_tests.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests for archivebot.py/Timestripper."""
#
-# (C) Pywikibot team, 2014
+# (C) Pywikibot team, 2016
#
# Distributed under the terms of the MIT license.
#
@@ -10,15 +10,16 @@
__version__ = '$Id$'
import datetime
+import re
from pywikibot.textlib import TimeStripper, tzoneFixedOffset
from tests.aspects import (
unittest,
TestCase,
- DefaultSiteTestCase,
- DeprecationTestCase,
)
+
+MatchObject = type(re.search('', ''))
class TestTimeStripperCase(TestCase):
@@ -31,19 +32,6 @@
"""Set up test cases."""
super(TestTimeStripperCase, self).setUp()
self.ts = TimeStripper(self.get_site())
-
-
-class DeprecatedTestTimeStripperCase(TestTimeStripperCase, DeprecationTestCase,
- DefaultSiteTestCase):
-
- """Test deprecated parts of the TimeStripper class."""
-
- def test_findmarker(self):
- """Test that string which is not part of text is found."""
- txt = u'this is a string with a maker is @@@@already present'
- self.assertEqual(self.ts.findmarker(txt, base=u'@@', delta='@@'),
- '@@@@@@')
- self.assertOneDeprecation()
class TestTimeStripperWithNoDigitsAsMonths(TestTimeStripperCase):
@@ -60,15 +48,19 @@
txtWithNoMatch = u'this string has no match'
pat = self.ts.pyearR
- self.assertEqual(self.ts.last_match_and_replace(txtWithOneMatch, pat),
- (u'this string has 3000, @@ and 3000 in it',
- {'year': u'1999'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithTwoMatch, pat),
- (u'this string has @@, @@ and 3000 in it',
- {'year': u'1999'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithNoMatch, pat),
+ txt, m = self.ts._last_match_and_replace(txtWithOneMatch, pat)
+ self.assertEqual('this string has 3000, @@@@ and 3000 in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'year': '1999'})
+ self.assertEqual(m.start(), 22)
+
+ txt, m = self.ts._last_match_and_replace(txtWithTwoMatch, pat)
+ self.assertEqual('this string has @@@@, @@@@ and 3000 in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'year': '1999'})
+ self.assertEqual(m.start(), 22)
+
+ self.assertEqual(self.ts._last_match_and_replace(txtWithNoMatch, pat),
(txtWithNoMatch,
None)
)
@@ -79,19 +71,25 @@
txtWithNoMatch = u'this string has no match'
pat = self.ts.pmonthR
- self.assertEqual(self.ts.last_match_and_replace(txtWithOneMatch, pat),
- (u'this string has XXX, YYY and @@ in it',
- {'month': u'février'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithTwoMatch, pat),
- (u'this string has XXX, @@ and @@ in it',
- {'month': u'février'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithThreeMatch, pat),
- (u'this string has @@, @@ and @@ in it',
- {'month': u'février'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithNoMatch, pat),
+ txt, m = self.ts._last_match_and_replace(txtWithOneMatch, pat)
+ self.assertEqual('this string has XXX, YYY and @@@@@@@ in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'month': 'février'})
+ self.assertEqual(m.start(), 29)
+
+ txt, m = self.ts._last_match_and_replace(txtWithTwoMatch, pat)
+ self.assertEqual('this string has XXX, @@@@ and @@@@@@@ in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'month': 'février'})
+ self.assertEqual(m.start(), 30)
+
+ txt, m = self.ts._last_match_and_replace(txtWithThreeMatch, pat)
+ self.assertEqual('this string has @@@, @@@@ and @@@@@@@ in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'month': 'février'})
+ self.assertEqual(m.start(), 30)
+
+ self.assertEqual(self.ts._last_match_and_replace(txtWithNoMatch, pat),
(txtWithNoMatch,
None)
)
@@ -120,19 +118,25 @@
txtWithNoMatch = u'this string has no match'
pat = self.ts.pmonthR
- self.assertEqual(self.ts.last_match_and_replace(txtWithOneMatch, pat),
- (u'this string has XX. YY. 12. in it',
- {'month': u'12.'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithTwoMatch, pat),
- (u'this string has XX. 1. 12. in it',
- {'month': u'12.'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithThreeMatch, pat),
- (u'this string has @@ 1. 12. in it',
- {'month': u'12.'})
- )
- self.assertEqual(self.ts.last_match_and_replace(txtWithNoMatch, pat),
+ txt, m = self.ts._last_match_and_replace(txtWithOneMatch, pat)
+ self.assertEqual('this string has XX. YY. 12. in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'month': '12.'})
+ self.assertEqual(m.start(), 24)
+
+ txt, m = self.ts._last_match_and_replace(txtWithTwoMatch, pat)
+ self.assertEqual('this string has XX. 1. 12. in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'month': '12.'})
+ self.assertEqual(m.start(), 23)
+
+ txt, m = self.ts._last_match_and_replace(txtWithThreeMatch, pat)
+ self.assertEqual('this string has @@ 1. 12. in it', txt)
+ self.assertIsInstance(m, MatchObject)
+ self.assertEqual(m.groupdict(), {'month': '12.'})
+ self.assertEqual(m.start(), 22)
+
+ self.assertEqual(self.ts._last_match_and_replace(txtWithNoMatch, pat),
(txtWithNoMatch,
None)
)
--
To view, visit https://gerrit.wikimedia.org/r/282241
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Id237912710cfd767e6d0638eabb7819a3dd8519b
Gerrit-PatchSet: 9
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Dachary <loic(a)dachary.org>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>
jenkins-bot has submitted this change and it was merged.
Change subject: pywikibot.DataSite.mergeItems ignore_conflicts is a list of str
......................................................................
pywikibot.DataSite.mergeItems ignore_conflicts is a list of str
As documented in
https://phabricator.wikimedia.org/diffusion/EWBA/browse/master/repo/include…,
the ignore_conflicts parameter is a list of strings, not a bool.
Bug: T138289
Change-Id: Ia759640891a564699553ad334a9b3b0005814b82
---
M pywikibot/site.py
1 file changed, 5 insertions(+), 3 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/site.py b/pywikibot/site.py
index 4b7969b..a147460 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -7371,7 +7371,7 @@
@must_be(group='user')
@deprecated_args(ignoreconflicts='ignore_conflicts')
- def mergeItems(self, fromItem, toItem, ignore_conflicts=False,
+ def mergeItems(self, fromItem, toItem, ignore_conflicts=None,
summary=None, bot=True):
"""
Merge two items together.
@@ -7380,8 +7380,10 @@
@type fromItem: pywikibot.ItemPage
@param toItem: Item to merge into
@type toItem: pywikibot.ItemPage
- @param ignore_conflicts: Whether to ignore conflicts
- @type ignore_conflicts: bool
+ @param ignore_conflicts: Which type of conflicts
+ ('description', 'sitelink', and 'statement')
+ should be ignored
+ @type ignore_conflicts: list of str
@param summary: Edit summary
@type summary: str
@param bot: Whether to mark the edit as a bot edit
--
To view, visit https://gerrit.wikimedia.org/r/301664
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia759640891a564699553ad334a9b3b0005814b82
Gerrit-PatchSet: 7
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dachary <loic(a)dachary.org>
Gerrit-Reviewer: Dachary <loic(a)dachary.org>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: MtDu <justin.d128(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>