jenkins-bot has submitted this change and it was merged.
Change subject: [FIX] patrol: Replace mwlib with mwparserfromhell
......................................................................
[FIX] patrol: Replace mwlib with mwparserfromhell
This removes the dependency on `mwlib` (which doesn't work on Python 3)
and uses `mwparserfromhell` instead. Unfortunately, the `mwparserfromhell`
implementation does not support lists directly, so the parsed result has to be
filtered manually.
It now allows links outside of lists (which are ignored), normalizes the
titles, and checks against the aliases for `Special:PrefixIndex`.
Bug: T95142
Change-Id: I4f7b5c7a67e0c90530319fce1f3ab0ca0c1a1138
---
M pywikibot/version.py
M requirements.txt
M scripts/patrol.py
M setup.py
M tests/patrolbot_tests.py
M tests/script_tests.py
M tests/utils.py
7 files changed, 67 insertions(+), 54 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/version.py b/pywikibot/version.py
index 7f5c64d..5ac6e1d 100644
--- a/pywikibot/version.py
+++ b/pywikibot/version.py
@@ -533,14 +533,6 @@
info['ver'] = package.__version__
elif name.startswith('unicodedata'):
info['ver'] = package.unidata_version
- elif name == 'mwlib': # mwlib 0.14.3 does not include a __init__.py
- module = __import__(name + '._version',
- fromlist=['_version'], level=0)
- if '__version__' in module.__dict__:
- info['ver'] = module.__version__
- path = module.__file__
- path = path[0:path.index('_version.')]
- info['path'] = path
# If builtins or standard_lib is None,
# only include package if a version was found.
diff --git a/requirements.txt b/requirements.txt
index cad79bf..66f32bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -73,7 +73,7 @@
# incomplete core component botirc
irc ; python_version > '2.6'
-# textlib.py
+# textlib.py and patrol.py
mwparserfromhell>=0.3.3 ; python_version <= '3.4'
# The mysql generator in pagegenerators depends on either oursql or MySQLdb
diff --git a/scripts/patrol.py b/scripts/patrol.py
index e37907d..0adacee 100755
--- a/scripts/patrol.py
+++ b/scripts/patrol.py
@@ -5,9 +5,30 @@
This bot obtains a list of recent changes and newpages and marks the
edits as patrolled based on a whitelist.
-See
http://en.wikisource.org/wiki/User:JVbot/patrol_whitelist
-Commandline parameters that are supported:
+Whitelist format
+================
+
+The whitelist is formatted as a number of list entries. Any links outside of
+lists are ignored and can be used for documentation. In a list the first link
+must be to the username which should be white listed and any other link
+following is adding that page to the white list of that username. If the user
+edited a page on their white list it gets patrolled. It will also patrol pages
+which start with the mentioned link (e.g. [[foo]] will also patrol [[foobar]]).
+
+To avoid redlinks it's possible to use Special:PrefixIndex as a prefix so that
+it will list all pages which will be patrolled. The page after the slash will be
+used then.
+
+On Wikisource, it'll also check if the page is on the author namespace in which
+case it'll also patrol pages which are linked from that page.
+
+An example can be found at:
+
+https://en.wikisource.org/wiki/User:Wikisource-bot/patrol_whitelist
+
+Commandline parameters
+======================
-namespace Filter the page generator to only yield pages in
specified namespaces
@@ -25,9 +46,9 @@
from __future__ import absolute_import, unicode_literals
__version__ = '$Id$'
-import mwlib.uparser # used to parse the whitelist
-import mwlib.parser # used to parse the whitelist
import time
+
+import mwparserfromhell
import pywikibot
@@ -92,6 +113,13 @@
self.rc_item_counter = 0 # counts how many items have been reviewed
self.patrol_counter = 0 # and how many times an action was taken
+ for entry in self.site.siteinfo['specialpagealiases']:
+ if entry['realname'] == 'Prefixindex':
+ self._prefixindex_aliases = set(alias.lower()
+ for alias in entry['aliases'])
+ break
+ else:
+ raise RuntimeError('No alias for "prefixindex"')
def load_whitelist(self):
"""Load most recent watchlist_page for further
processing."""
@@ -183,34 +211,35 @@
"""Parse page details apart from 'user:' for
use."""
tuples = {}
- # for any structure, the only first 'user:' page
- # is registered as the user the rest of the structure
- # refers to.
- def process_children(obj, current_user):
- pywikibot.debug(u'Parsing node: %s' % obj, _logger)
- for c in obj.children:
- temp = process_node(c, current_user)
- if temp and not current_user:
- current_user = temp
+ current_user = False
+ parsed = mwparserfromhell.parse(wikitext)
+ for node in parsed.nodes:
+ if isinstance(node, mwparserfromhell.nodes.tag.Tag):
+ if node.tag == 'li':
+ current_user = None
+ elif isinstance(node, mwparserfromhell.nodes.text.Text):
+ if node.endswith('\n'):
+ current_user = False
+ elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink):
+ if current_user is False:
+ pywikibot.debug('Link to "{0}" ignored as outside
'
+ 'list'.format(node.title), _logger)
+ continue
- def process_node(obj, current_user):
- # links are analysed; interwiki links are included because mwlib
- # incorrectly calls 'Wikisource:' namespace links an interwiki
- if isinstance(obj, mwlib.parser.NamespaceLink) or \
- isinstance(obj, mwlib.parser.InterwikiLink) or \
- isinstance(obj, mwlib.parser.ArticleLink):
+ obj = pywikibot.Link(node.title, self.site)
if obj.namespace == -1:
# the parser accepts 'special:prefixindex/' as a wildcard
# this allows a prefix that doesnt match an existing page
# to be a blue link, and can be clicked to see what pages
# will be included in the whitelist
- if obj.target[:20].lower() == 'special:prefixindex/':
- if len(obj.target) == 20:
+ name, sep, prefix = obj.title.partition('/')
+ if name.lower() in self._prefixindex_aliases:
+ if not prefix:
if pywikibot.config.verbose_output:
pywikibot.output(u'Whitelist everything')
page = ''
else:
- page = obj.target[20:]
+ page = prefix
if pywikibot.config.verbose_output:
pywikibot.output(u'Whitelist prefixindex hack '
u'for: %s' % page)
@@ -222,13 +251,12 @@
# if a target user hasn't been found yet, and the link is
# 'user:'
# the user will be the target of subsequent rules
- page_prefix_len = len(self.site.namespace(2))
- current_user = obj.target[(page_prefix_len + 1):]
+ current_user = obj.title
if pywikibot.config.verbose_output:
pywikibot.output(u'Whitelist user: %s' % current_user)
- return current_user
+ continue
else:
- page = obj.target
+ page = obj.canonical_title()
if current_user:
if not user or current_user == user:
@@ -246,11 +274,6 @@
u'another user: %s' % page)
else:
raise Exception(u'No user set for page %s' % page)
- else:
- process_children(obj, current_user)
-
- root = mwlib.uparser.parseString(title='Not used', raw=wikitext)
- process_children(root, None)
return tuples
diff --git a/setup.py b/setup.py
index 6a120b1..ad37305 100644
--- a/setup.py
+++ b/setup.py
@@ -69,6 +69,7 @@
'flickrripper.py': ['Pillow'],
'states_redirect.py': ['pycountry'],
'weblinkchecker.py': ['memento_client>=0.5.1'],
+ 'patrol.py': ['mwparserfromhell>=0.3.3'],
}
# flickrapi 1.4.4 installs a root logger in verbose mode; 1.4.5 fixes this.
# The problem doesnt exist in flickrapi 2.x.
@@ -133,9 +134,6 @@
script_deps['data_ingestion.py'] = extra_deps['csv']
- # mwlib is not available for py3
- script_deps['patrol'] = ['mwlib']
-
# Some of the ui_tests depend on accessing the console window's menu
# to set the console font and copy and paste, achieved using pywinauto
# which depends on pywin32.
@@ -157,11 +155,6 @@
# so all scripts can be compiled for script_tests, etc.
if 'PYSETUP_TEST_EXTRAS' in os.environ:
test_deps += list(itertools.chain(*(extra_deps.values())))
- # mwlib requires 'pyparsing>=1.4.11,<1.6', which conflicts with
- # pydot's requirement for pyparsing>=2.0.1.
- if 'mwlib' in test_deps:
- test_deps.remove('mwlib')
-
if 'oursql' in test_deps and os.name == 'nt':
test_deps.remove('oursql') # depends on Cython
diff --git a/tests/patrolbot_tests.py b/tests/patrolbot_tests.py
index 961d637..efaa29d 100644
--- a/tests/patrolbot_tests.py
+++ b/tests/patrolbot_tests.py
@@ -12,7 +12,7 @@
try:
from scripts import patrol
except ImportError:
- patrol = None # if mwlib is not installed
+ patrol = None # if mwparserfromhell is not installed
from tests.aspects import require_modules, unittest, DefaultDrySiteTestCase
@@ -21,17 +21,17 @@
== Header ==
* [[User:Test 1]]: [[Page 1]], [[Page 2]]
-* [[User:Test 2]]: [[Page 2]], [[Page 4]], [[Page 6]]
+* [[User:Test_2]]: [[Page 2]], [[Page 4]], [[Page 6]]
== Others ==
* [[User:Prefixed]]: [[Special:PrefixIndex/Page 1]], [[Special:PREFIXINDEX/Page 2]]
== More test 1 ==
-* [[User:Test 1]]: [[Page 3]]
+* [[User:Test_1]]: [[Page 3]]
"""
-@require_modules('mwlib')
+@require_modules('mwparserfromhell')
class TestPatrolBot(DefaultDrySiteTestCase):
"""Test the PatrolBot class."""
@@ -51,7 +51,7 @@
for i in range(1, 4)])
self.assertIn('Prefixed', tuples)
self.assertEqual(tuples['Prefixed'], ['Page 1', 'Page
2'])
- self.assertRaises(Exception, self.bot.parse_page_tuples, '[[link]]')
+ self.assertEqual(self.bot.parse_page_tuples('[[link]]'), {})
def test_in_list(self):
"""Test the method which returns whether a page is in the
list."""
diff --git a/tests/script_tests.py b/tests/script_tests.py
index 10cd8d1..66bd657 100644
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -44,7 +44,7 @@
'match_images': ['PIL.ImageTk'],
'panoramiopicker': ['BeautifulSoup'],
'states_redirect': ['pycountry'],
- 'patrol': ['mwlib'],
+ 'patrol': ['mwparserfromhell'],
}
if PYTHON_VERSION < (2, 7):
diff --git a/tests/utils.py b/tests/utils.py
index 69f2fae..b4335be 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -374,6 +374,11 @@
if self.family.name == 'wikisource':
extensions.append({'name': 'ProofreadPage'})
self._siteinfo._cache['extensions'] = (extensions, True)
+ aliases = []
+ for alias in ('PrefixIndex', ):
+ # TODO: Not all follow that scheme (e.g. "BrokenRedirects")
+ aliases.append({'realname': alias.capitalize(), 'aliases':
[alias]})
+ self._siteinfo._cache['specialpagealiases'] = (aliases, True)
self._msgcache = {'*': 'dummy entry', 'hello':
'world'}
def _build_namespaces(self):
--
To view, visit
https://gerrit.wikimedia.org/r/202011
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I4f7b5c7a67e0c90530319fce1f3ab0ca0c1a1138
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>