jenkins-bot has submitted this change and it was merged.
Change subject: Improve HTML parser to detect all IWM MW sites
......................................................................
Improve HTML parser to detect all IWM MW sites
The HTML parser used only EditURI to determine the api.php endpoint.
Add HTML parser support for
* OpenSearch's opensearch_desc.php, introduced in MW 1.8, and
* Resource Loader's load.php, introduced in MW 1.17.
Also raise an exception for unsupported versions, lower than MW 1.14.
And fix flake8 issues in site_detect module.
Bug: T111007
Change-Id: Iebadd7f782ab1b471d3de71fa1c5a52c0d6f1018
---
M pywikibot/site_detect.py
M tests/site_detect_tests.py
M tox.ini
3 files changed, 244 insertions(+), 56 deletions(-)
Approvals:
John Vandenberg: Looks good to me, but someone else must approve
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/site_detect.py b/pywikibot/site_detect.py
index 13bfe27..7dd2587 100644
--- a/pywikibot/site_detect.py
+++ b/pywikibot/site_detect.py
@@ -13,17 +13,21 @@
import json
import re
-from distutils.version import LooseVersion as V
+import pywikibot
from pywikibot.comms.http import fetch
-from pywikibot.tools import PY2, PYTHON_VERSION
+from pywikibot.exceptions import ServerError
+from pywikibot.tools import MediaWikiVersion, PY2, PYTHON_VERSION
if not PY2:
from html.parser import HTMLParser
- from urllib.parse import urljoin
+ from urllib.parse import urljoin, urlparse
else:
- from HTMLParser import HTMLParser
- from urlparse import urljoin
+ try:
+ from future.backports.html.parser import HTMLParser
+ except ImportError:
+ from HTMLParser import HTMLParser
+ from urlparse import urljoin, urlparse
class MWSite(object):
@@ -38,25 +42,59 @@
REwgVersion = re.compile(r'wgVersion ?= ?"([^"]*)"')
def __init__(self, fromurl):
- self.fromurl = fromurl
+ """
+ Constructor.
+
+ @raises ServerError: a server error occurred while loading the site
+ @raises Timeout: a timeout occurred while loading the site
+ @raises RuntimeError: Version not found or version less than 1.14
+ """
if fromurl.endswith("$1"):
fromurl = fromurl[:-2]
- data = fetch(fromurl).content
+ r = fetch(fromurl)
+ if r.status == 503:
+ raise ServerError('Service Unavailable')
- wp = WikiHTMLPageParser()
+ if fromurl != r.data.url:
+ pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url))
+ fromurl = r.data.url
+
+ self.fromurl = fromurl
+
+ data = r.content
+
+ wp = WikiHTMLPageParser(fromurl)
wp.feed(data)
- try:
- self.version = wp.generator.replace("MediaWiki ", "")
- except Exception:
- self.version = "0.0"
- if V(self.version) < V("1.17.0"):
+ self.version = wp.version
+ self.server = wp.server
+ self.scriptpath = wp.scriptpath
+ self.articlepath = None
+
+ try:
self._parse_pre_117(data)
- else:
- self._parse_post_117(wp, fromurl)
+ except Exception as e:
+ pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e))
+
+ if self.api:
+ try:
+ self._parse_post_117()
+ except Exception as e:
+ pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e))
+
+ if not self.version:
+ self._fetch_old_version()
+
+ if not self.api:
+ raise RuntimeError('Unsupported url: {0}'.format(self.fromurl))
+
+ if (not self.version or
+ self.version < MediaWikiVersion('1.14')):
+ raise RuntimeError('Unsupported version: {0}'.format(self.version))
@property
def langs(self):
+ """Build interwikimap."""
response = fetch(
self.api +
"?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")
@@ -69,52 +107,78 @@
return self.langs
def _parse_pre_117(self, data):
+ """Parse HTML."""
if not self.REwgEnableApi.search(data):
- print("*** WARNING: Api does not seem to be enabled on %s"
- % self.fromurl)
+ pywikibot.log(
+ 'wgEnableApi is not enabled in HTML of %s'
+ % self.fromurl)
try:
- self.version = self.REwgVersion.search(data).groups()[0]
+ self.version = MediaWikiVersion(
+ self.REwgVersion.search(data).groups(0))
except AttributeError:
- self.version = None
+ pass
self.server = self.REwgServer.search(data).groups()[0]
self.scriptpath = self.REwgScriptPath.search(data).groups()[0]
self.articlepath = self.REwgArticlePath.search(data).groups()[0]
self.lang = self.REwgContentLanguage.search(data).groups()[0]
+ def _fetch_old_version(self):
+ """Extract the version from API help with ?version
enabled."""
if self.version is None:
- # try to get version using api
try:
- d = json.loads(fetch(self.api +
'?version&format=json').content)
+ d = fetch(self.api + '?version&format=json').content
+ try:
+ d = json.loads(d)
+ except ValueError:
+                # Fallback for old versions which didn't wrap help in json
+ d = {'error': {'*': d}}
+
self.version = list(filter(
lambda x: x.startswith("MediaWiki"),
[l.strip()
for l in
d['error']['*'].split("\n")]))[0].split()[1]
except Exception:
pass
+ else:
+ self.version = MediaWikiVersion(self.version)
- def _parse_post_117(self, wp, fromurl):
- apipath = wp.edituri.split("?")[0]
- fullurl = urljoin(fromurl, apipath)
- response = fetch(fullurl +
'?action=query&meta=siteinfo&format=json')
+ def _parse_post_117(self):
+ """Parse 1.17+ siteinfo data."""
+ response = fetch(self.api +
'?action=query&meta=siteinfo&format=json')
info = json.loads(response.content)['query']['general']
- self.server = urljoin(fromurl, info['server'])
+ self.version = MediaWikiVersion.from_generator(info['generator'])
+ if self.version < MediaWikiVersion('1.17'):
+ return
+
+ self.server = urljoin(self.fromurl, info['server'])
for item in ['scriptpath', 'articlepath', 'lang']:
setattr(self, item, info[item])
- def __cmp__(self, other):
+ def __eq__(self, other):
+ """Return True if equal to other."""
return (self.server + self.scriptpath ==
other.server + other.scriptpath)
def __hash__(self):
+ """Get hashable representation."""
return hash(self.server + self.scriptpath)
@property
def api(self):
+ """
+ Get api URL.
+
+ @rtype: str or None
+ """
+ if self.server is None or self.scriptpath is None:
+ return
+
return self.server + self.scriptpath + "/api.php"
@property
def iwpath(self):
+ """Get article path URL."""
return self.server + self.articlepath
@@ -122,18 +186,85 @@
"""Wiki HTML page parser."""
- def __init__(self):
+ def __init__(self, url):
+ """Constructor."""
if PYTHON_VERSION < (3, 4):
HTMLParser.__init__(self)
else:
super().__init__(convert_charrefs=True)
+ self.url = urlparse(url)
self.generator = None
+ self.version = None
+ self._parsed_url = None
+ self.server = None
+ self.scriptpath = None
+
+ def set_version(self, value):
+ """Set highest version."""
+ if self.version and value < self.version:
+ return
+
+ self.version = value
+
+ def set_api_url(self, url):
+ """Set api_url."""
+ url = url.split('.php', 1)[0]
+ (value, script_name) = url.rsplit('/', 1)
+ if script_name not in ('api', 'load',
'opensearch_desc'):
+ return
+
+ if script_name == 'load':
+ self.set_version(MediaWikiVersion('1.17.0'))
+ if self._parsed_url:
+ # A Resource Loader link is less reliable than other links.
+ # Resource Loader can load resources from a different site.
+ # e.g.
http://kino.skripov.com/index.php/$1
+ # loads resources from
http://megawiki.net/
+ return
+
+ new_parsed_url = urlparse(value)
+ if self._parsed_url:
+ assert new_parsed_url.path == self._parsed_url.path
+
+ if not new_parsed_url.scheme or not new_parsed_url.netloc:
+ new_parsed_url = urlparse(
+ '{0}://{1}{2}'.format(
+ new_parsed_url.scheme or self.url.scheme,
+ new_parsed_url.netloc or self.url.netloc,
+ new_parsed_url.path))
+ else:
+ if self._parsed_url:
+ # allow upgrades to https, but not downgrades
+ if self._parsed_url.scheme == 'https':
+ if new_parsed_url.scheme != self._parsed_url.scheme:
+ return
+
+ # allow
http://www.brickwiki.info/ vs
http://brickwiki.info/
+ if (new_parsed_url.netloc in self._parsed_url.netloc or
+ self._parsed_url.netloc in new_parsed_url.netloc):
+ return
+
+ assert new_parsed_url == self._parsed_url, '{0} != {1}'.format(
+ self._parsed_url, new_parsed_url)
+
+ self._parsed_url = new_parsed_url
+ self.server = '{0}://{1}'.format(
+ self._parsed_url.scheme, self._parsed_url.netloc)
+ self.scriptpath = self._parsed_url.path
def handle_starttag(self, tag, attrs):
+ """Handle an opening tag."""
attrs = dict(attrs)
if tag == "meta":
if attrs.get('name') == 'generator':
self.generator = attrs["content"]
- if tag == "link":
- if attrs.get('rel') == 'EditURI':
- self.edituri = attrs["href"]
+ try:
+ self.version = MediaWikiVersion.from_generator(
+ self.generator)
+ except ValueError:
+ pass
+ elif tag == 'link' and 'rel' in attrs and 'href' in
attrs:
+ if attrs['rel'] in ('EditURI', 'stylesheet',
'search'):
+ self.set_api_url(attrs['href'])
+ elif tag == 'script' and 'src' in attrs:
+ self.set_api_url(attrs['src'])
diff --git a/tests/site_detect_tests.py b/tests/site_detect_tests.py
index f13105f..bcad8cb 100644
--- a/tests/site_detect_tests.py
+++ b/tests/site_detect_tests.py
@@ -11,8 +11,9 @@
from requests.exceptions import Timeout
+from pywikibot.exceptions import ServerError
from pywikibot.site_detect import MWSite
-from pywikibot.tools import PY2
+from pywikibot.tools import MediaWikiVersion, PY2
from tests.aspects import unittest, TestCase
@@ -23,10 +24,6 @@
class TestWikiSiteDetection(TestCase):
"""Test Case for MediaWiki detection and site object
creation."""
-
- family = 'meta'
- code = 'meta'
- net = True
def setUp(self):
"""Set up test."""
@@ -75,7 +72,7 @@
self.all += [url]
try:
site = MWSite(url)
- except Timeout as e:
+ except (ServerError, Timeout) as e:
self.skips[url] = e
return
except Exception as e:
@@ -88,7 +85,7 @@
self.assertIsNone(site)
else:
self.assertIsInstance(site, result)
- self.passes[url] = result
+ self.passes[url] = site
except AssertionError as error:
self.failures[url] = error
@@ -102,15 +99,24 @@
def assertAllPass(self):
"""Assert that all urls were detected as a MediaWiki
site."""
- self.assertEqual(len(self.passes), len(self.all) - len(self.skips))
- self.assertEqual(len(self.failures), 0)
- self.assertEqual(len(self.errors), 0)
+ self.assertEqual(set(self.passes), set(self.all) - set(self.skips))
+ self.assertEqual(self.failures, {})
+ self.assertEqual(self.errors, {})
def assertAllError(self):
"""Assert that all urls were not detected as a MediaWiki
site."""
- self.assertEqual(len(self.passes), 0)
- self.assertEqual(len(self.failures), 0)
- self.assertEqual(len(self.errors), len(self.all) - len(self.skips))
+ self.assertEqual(self.passes, {})
+ self.assertEqual(self.failures, {})
+ self.assertEqual(set(self.errors), set(self.all) - set(self.skips))
+
+
+class InterWikiMapDetection(TestWikiSiteDetection):
+
+ """Test all urls on the interwiki map."""
+
+ family = 'meta'
+ code = 'meta'
+ net = True
def test_IWM(self):
"""Test the load_site method for MW sites on the IWM
list."""
@@ -133,39 +139,85 @@
self.errors[url] = error
else:
try:
- self.assertIsInstance(version, basestring)
- self.assertRegex(version, r'^\d\.\d+.*')
+ self.assertIsInstance(version, MediaWikiVersion)
self.passes[url] = site
except AssertionError as error:
print('failed to parse version of ' + url)
self.failures[url] = error
+
+class SiteDetectionTestCase(TestWikiSiteDetection):
+
+ """Test all urls on the interwiki map."""
+
+ net = True
+
def test_detect_site(self):
"""Test detection of MediaWiki sites."""
self.assertSite('http://botwiki.sno.cc/wiki/$1')
-
self.assertSite('http://glossary.reuters.com/index.php?title=$1')
- self.assertSite('http://www.livepedia.gr/index.php?title=$1')
self.assertSite('http://guildwars.wikia.com/wiki/$1')
-
self.assertSite('http://www.hrwiki.org/index.php/$1')
+
self.assertSite('http://www.hrwiki.org/index.php/$1') # v 1.15
self.assertSite('http://www.proofwiki.org/wiki/$1')
self.assertSite(
'http://www.ck-wissen.de/ckwiki/index.php?title=$1')
self.assertSite('http://en.citizendium.org/wiki/$1')
self.assertSite(
'http://www.lojban.org/tiki/tiki-index.php?page=$1')
-
self.assertSite('http://www.EcoReality.org/wiki/$1')
self.assertSite('http://www.wikichristian.org/index.php?title=$1')
-
self.assertSite('http://wikitree.org/index.php?title=$1')
+
self.assertSite('https://en.wikifur.com/wiki/$1')
+
self.assertSite('http://bluwiki.com/go/$1')
+
self.assertSite('http://kino.skripov.com/index.php/$1')
+ self.assertAllPass()
+
+ def test_wikisophia(self):
+ """Test
wikisophia.org which has redirect
problems."""
+ # /index.php?title=$1 reports 404, however a wiki exists there,
+ # but the API is also hidden.
+
self.assertNoSite('http://wikisophia.org/index.php?title=$1')
+ self.assertAllError()
+
+ def test_pre_114_sites(self):
+ """Test pre 1.14 sites which should be detected as
unsupported."""
+ # v1.12
+ self.assertNoSite('http://www.livepedia.gr/index.php?title=$1')
+ # v1.11
+
self.assertNoSite('http://www.wikifon.org/$1')
+
self.assertNoSite('http://glossary.reuters.com/index.php?title=$1')
+ # v1.11, with no query module
+
self.assertNoSite('http://wikitree.org/index.php?title=$1')
+ # v1.9
+
self.assertNoSite('http://www.wikinvest.com/$1')
+ self.assertAllError()
+
+ def test_non_standard_version_sites(self):
+ """Test non-standard version string sites."""
+
self.assertSite('https://wiki.gentoo.org/wiki/$1')
+
self.assertSite('http://wiki.arabeyes.org/$1')
+
self.assertSite('http://tfwiki.net/wiki/$1')
self.assertAllPass()
def test_detect_failure(self):
"""Test detection failure for MediaWiki sites with an
API."""
-
self.assertNoSite('https://en.wikifur.com/wiki/$1')
+ # SSL certificate verification fails
+
self.assertNoSite('http://hackerspaces.org/wiki/$1')
+ self.assertAllError()
+
+ @unittest.expectedFailure
+ def test_api_hidden(self):
+ """Test MediaWiki sites with a hidden enabled
API."""
# api.php is not available
self.assertNoSite('http://wiki.animutationportal.com/index.php/$1')
- # API is disabled
+ # HTML looks like it has an API, but redirect rules prevent access
+
self.assertNoSite('http://www.EcoReality.org/wiki/$1')
+ self.assertAllError()
+
+ def test_api_disabled(self):
+ """Test MediaWiki sites without an enabled API."""
self.assertNoSite('http://wiki.linuxquestions.org/wiki/$1')
- # offline
+ self.assertAllError()
+
+ def test_offline_sites(self):
+ """Test offline sites."""
self.assertNoSite('http://seattlewiki.org/wiki/$1')
self.assertAllError()
@@ -181,7 +233,6 @@
def test_detect_nosite(self):
"""Test detection of non-wiki sites."""
-
self.assertNoSite('http://bluwiki.com/go/$1')
self.assertNoSite('http://www.imdb.com/name/nm$1/')
self.assertNoSite('http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=$1')
self.assertNoSite('http://operawiki.info/$1')
@@ -192,8 +243,13 @@
self.assertNoSite(
'http://www.merriam-webster.com/cgi-bin/dictionary?book=Dictionary&va=$1')
self.assertNoSite('http://arxiv.org/abs/$1')
+ self.assertAllError()
+
+ def test_musicbrainz_doc(self):
+ """Test
http://musicbrainz.org/doc/ which has a page
'api.php'."""
+        # Possible false positive caused by the existence of a page
+ # called
http://musicbrainz.org/doc/api.php
self.assertNoSite('http://musicbrainz.org/doc/$1')
-
self.assertNoSite('http://wiki.animutationportal.com/index.php/$1')
self.assertAllError()
diff --git a/tox.ini b/tox.ini
index 728dc58..15ecea9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -64,6 +64,7 @@
pywikibot/pagegenerators.py \
pywikibot/plural.py \
pywikibot/proofreadpage.py \
+ pywikibot/site_detect.py \
pywikibot/textlib.py \
pywikibot/throttle.py \
pywikibot/titletranslate.py \
--
To view, visit
https://gerrit.wikimedia.org/r/230512
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Iebadd7f782ab1b471d3de71fa1c5a52c0d6f1018
Gerrit-PatchSet: 16
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>