jenkins-bot has submitted this change and it was merged.
Change subject: Improve fake user agent usage control
......................................................................
Improve fake user agent usage control
comms.http.get_fake_user_agent() is renamed to fake_user_agent() to match the style of user_agent(). Logic checking config variable fake_user_agent is removed, as it should not be responsible for deciding whether a fake UA should be used. Test cases testing the config-checking logic are removed.
The use_fake_user_agent argument is added to comms.http.fetch(), which will specify if fake UAs should be used when the method is called to make HTTP requests. Test cases testing this logic are added.
The fake_user_agent config variable is deprecated. fake_user_agent_default is introduced to set per-script behaviour. fake_user_agent_exceptions is introduced to set per-domain behaviours (will be checked by fetch()).
Bug: T152075
Change-Id: I28594fd1b5ccb6ed3e885db5600bb0464dccfa0e
---
M pywikibot/comms/http.py
M pywikibot/config2.py
M scripts/reflinks.py
M scripts/weblinkchecker.py
M tests/http_tests.py
5 files changed, 186 insertions(+), 61 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 908f3f1..7bf6235 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -38,10 +38,11 @@
if sys.version_info[0] > 2:
from http import cookiejar as cookielib
- from urllib.parse import quote
+ from urllib.parse import quote, urlparse
else:
import cookielib
from urllib2 import quote
+ from urlparse import urlparse
from pywikibot import config
@@ -53,6 +54,7 @@
)
from pywikibot.logging import critical, debug, error, log, warning
from pywikibot.tools import (
+ deprecated,
deprecate_arg,
file_mode_checker,
issue_deprecation_warning,
@@ -234,31 +236,43 @@
return formatted
+@deprecated('pywikibot.comms.http.fake_user_agent')
def get_fake_user_agent():
"""
- Return a user agent to be used when faking a web browser.
+ Return a fake user agent depending on `fake_user_agent` option in config.
+
+ Deprecated, use fake_user_agent() instead.
@rtype: str
"""
- # Check fake_user_agent configuration variable
if isinstance(config.fake_user_agent, StringTypes):
- return pywikibot.config2.fake_user_agent
+ return config.fake_user_agent
+ elif config.fake_user_agent or config.fake_user_agent is None:
+ return fake_user_agent()
+ else:
+ return user_agent()
- if config.fake_user_agent is None or config.fake_user_agent is True:
- try:
- import browseragents
- return browseragents.core.random()
- except ImportError:
- pass
- try:
- import fake_useragent
- return fake_useragent.fake.UserAgent().random
- except ImportError:
- pass
+def fake_user_agent():
+ """
+ Return a fake user agent.
- # Use the default real user agent
- return user_agent()
+ @rtype: str
+ """
+ try:
+ import browseragents
+ return browseragents.core.random()
+ except ImportError:
+ pass
+
+ try:
+ import fake_useragent
+ return fake_useragent.fake.UserAgent().random
+ except ImportError:
+ pass
+
+ raise ImportError( # Actually complain when neither is installed.
+ 'Either browseragents or fake_useragent must be installed to get fake UAs.')
@deprecate_arg('ssl', None)
@@ -443,7 +457,7 @@
def fetch(uri, method="GET", body=None, headers=None,
- default_error_handling=True, **kwargs):
+ default_error_handling=True, use_fake_user_agent=False, **kwargs):
"""
Blocking HTTP request.
@@ -454,8 +468,27 @@
@param default_error_handling: Use default error handling
@type default_error_handling: bool
+ @type use_fake_user_agent: bool, str
+ @param use_fake_user_agent: Set to True to use fake UA, False to use
+ pywikibot's UA, str to specify own UA. This behaviour might be
+ overridden by domain in config.
@rtype: L{threadedhttp.HttpRequest}
"""
+ # Change user agent depending on fake UA settings.
+ # Set header to new UA if needed.
+ headers = headers or {}
+ if not headers.get('user-agent', None): # Skip if already specified in request.
+ # Get fake UA exceptions from `fake_user_agent_exceptions` config.
+ uri_domain = urlparse(uri).netloc
+ use_fake_user_agent = config.fake_user_agent_exceptions.get(
+ uri_domain, use_fake_user_agent)
+
+ if use_fake_user_agent and isinstance(
+ use_fake_user_agent, StringTypes): # Custom UA.
+ headers['user-agent'] = use_fake_user_agent
+ elif use_fake_user_agent is True:
+ headers['user-agent'] = fake_user_agent()
+
request = _enqueue(uri, method, body, headers, **kwargs)
assert(request._data is not None) # if there's no data in the answer we're in trouble
# Run the error handling callback in the callers thread so exceptions
diff --git a/pywikibot/config2.py b/pywikibot/config2.py
index 9451eb5..a98aeb7 100644
--- a/pywikibot/config2.py
+++ b/pywikibot/config2.py
@@ -93,7 +93,7 @@
_private_values = ['authenticate', 'proxy', 'db_password']
_deprecated_variables = ['use_SSL_onlogin', 'use_SSL_always',
- 'available_ssl_project']
+ 'available_ssl_project', 'fake_user_agent']
# ############# ACCOUNT SETTINGS ##############
@@ -137,16 +137,22 @@
user_agent_format = ('{script_product} ({script_comments}) {pwb} ({revision}) '
'{http_backend} {python}')
-# Fake user agent
-# Used to retrieve pages in reflinks.py,
-# to work around user-agent sniffing webpages
-# When None or True,
-# Use random user agent if either browseragents or fake_useragent
-# packages are installed
-# Otherwise use pywikibot.comms.http.user_agent()
-# When set to False,
-# disables use of automatic user agents
-fake_user_agent = None
+# Fake user agent.
+# Some external websites reject bot-like user agents. It is possible to use
+# fake user agents in requests to these websites.
+# It is recommended to default this to False and use on an as-needed basis.
+#
+# Default behaviours in modules that can utilize fake UAs.
+# True for enabling fake UA, False for disabling / using pywikibot's own UA, str
+# to specify custom UA.
+fake_user_agent_default = {'reflinks': False, 'weblinkchecker': False}
+# Website domains excepted to the default behaviour.
+# True for enabling, False for disabling, str to hardcode a UA.
+# Example: {'problematic.site.example': True,
+# 'prefers.specific.ua.example': 'snakeoil/4.2'}
+fake_user_agent_exceptions = {}
+# This following option is deprecated in favour of finer control options above.
+fake_user_agent = False
# The default interface for communicating with the site
# currently the only defined interface is 'APISite', so don't change this!
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index c5daf1d..1e930a4 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -59,6 +59,7 @@
import pywikibot
from pywikibot import comms, i18n, pagegenerators, textlib, Bot
+from pywikibot import config2 as config
from pywikibot.pagegenerators import (
XMLDumpPageGenerator as _XMLDumpPageGenerator,
)
@@ -395,8 +396,7 @@
super(ReferencesRobot, self).__init__(**kwargs)
self.generator = generator
self.site = pywikibot.Site()
- self._user_agent = comms.http.get_fake_user_agent()
- pywikibot.log('Using fake user agent: {0}'.format(self._user_agent))
+ self._use_fake_user_agent = config.fake_user_agent_default.get('reflinks', False)
# Check
manual = 'mw:Manual:Pywikibot/refLinks'
code = None
@@ -494,7 +494,6 @@
raise
editedpages = 0
- headers = {'user-agent': self._user_agent}
for page in self.generator:
try:
# Load the page's text from the wiki
@@ -526,10 +525,11 @@
f = None
try:
- f = requests.get(ref.url, headers=headers, timeout=60)
+ f = comms.http.fetch(
+ ref.url, use_fake_user_agent=self._use_fake_user_agent)
# Try to get Content-Type from server
- contentType = f.headers.get('content-type')
+ contentType = f.response_headers.get('content-type')
if contentType and not self.MIME.search(contentType):
if ref.link.lower().endswith('.pdf') and \
not self.getOption('ignorepdf'):
@@ -556,7 +556,7 @@
continue
# Get the real url where we end (http redirects !)
- redir = f.url
+ redir = f.data.url
if redir != ref.link and \
domain.findall(redir) == domain.findall(link):
if soft404.search(redir) and \
@@ -572,15 +572,15 @@
u'Redirect to root : {0} ', ref.link))
continue
- if f.status_code != requests.codes.ok:
+ if f.status != requests.codes.ok:
pywikibot.output(u'HTTP error (%s) for %s on %s'
- % (f.status_code, ref.url,
+ % (f.status, ref.url,
page.title(asLink=True)),
toStdout=True)
# 410 Gone, indicates that the resource has been purposely
# removed
- if f.status_code == 410 or \
- (f.status_code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
+ if f.status == 410 or \
+ (f.status == 404 and (u'\t%s\t' % ref.url in deadLinks)):
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
continue
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index f81d8c3..b8cb323 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -279,6 +279,8 @@
Returns a (boolean, string) tuple saying if the page is online and including
a status reason.
+ Per-domain user-agent faking is not supported in this deprecated class.
+
Warning: Also returns false if your Internet connection isn't working
correctly! (This will give a Socket Error)
@@ -292,11 +294,19 @@
redirectChain is a list of redirects which were resolved by
resolveRedirect(). This is needed to detect redirect loops.
"""
- self._user_agent = comms.http.get_fake_user_agent()
self.url = url
self.serverEncoding = serverEncoding
+
+ fake_ua_config = config.fake_user_agent_default.get(
+ 'weblinkchecker', False)
+ if fake_ua_config and isinstance(fake_ua_config, str):
+ user_agent = fake_ua_config
+ elif fake_ua_config:
+ user_agent = comms.http.fake_user_agent()
+ else:
+ user_agent = comms.http.user_agent()
self.header = {
- 'User-agent': self._user_agent,
+ 'user-agent': user_agent,
'Accept': 'text/xml,application/xml,application/xhtml+xml,'
'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
@@ -542,10 +552,8 @@
threading.Thread.__init__(self)
self.page = page
self.url = url
- self._user_agent = comms.http.get_fake_user_agent()
self.history = history
self.header = {
- 'User-agent': self._user_agent,
'Accept': 'text/xml,application/xml,application/xhtml+xml,'
'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
@@ -557,6 +565,8 @@
self.setName((u'%s - %s' % (page.title(), url)).encode('utf-8',
'replace'))
self.HTTPignore = HTTPignore
+ self._use_fake_user_agent = config.fake_user_agent_default.get(
+ 'weblinkchecker', False)
self.day = day
def run(self):
@@ -564,8 +574,8 @@
ok = False
try:
header = self.header
- timeout = pywikibot.config.socket_timeout
- r = requests.get(self.url, headers=header, timeout=timeout)
+ r = comms.http.fetch(
+ self.url, headers=header, use_fake_user_agent=self._use_fake_user_agent)
except requests.exceptions.InvalidURL:
message = i18n.twtranslate(self.page.site,
'weblinkchecker-badurl_msg',
@@ -574,11 +584,11 @@
pywikibot.output('Exception while processing URL %s in page %s'
% (self.url, self.page.title()))
raise
- if (r.status_code == requests.codes.ok and
- str(r.status_code) not in self.HTTPignore):
+ if (r.status == requests.codes.ok and
+ str(r.status) not in self.HTTPignore):
ok = True
else:
- message = '{0} {1}'.format(r.status_code, r.reason)
+ message = '{0}'.format(r.status)
if ok:
if self.history.setLinkAlive(self.url):
pywikibot.output('*Link to %s in [[%s]] is back alive.'
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 05df31b..06f7117 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -285,7 +285,7 @@
self.assertIn('Python/' + str(PYTHON_VERSION[0]), http.user_agent())
-class FakeUserAgentTestCase(TestCase):
+class DryFakeUserAgentTestCase(TestCase):
"""Test the generation of fake user agents.
@@ -296,15 +296,96 @@
net = False
+ def _test_fake_user_agent_randomness(self):
+ """Test if user agent returns are randomized."""
+ self.assertNotEqual(http.fake_user_agent(), http.fake_user_agent())
+
+ @require_modules('browseragents')
+ def test_with_browseragents(self):
+ """Test fake user agent generation with browseragents module."""
+ self._test_fake_user_agent_randomness()
+
+ @require_modules('fake_useragent')
+ def test_with_fake_useragent(self):
+ """Test fake user agent generation with fake_useragent module."""
+ self._test_fake_user_agent_randomness()
+
+
+class LiveFakeUserAgentTestCase(TestCase):
+
+ """Test the usage of fake user agent."""
+
+ sites = {
+ 'httpbin': {
+ 'hostname': 'httpbin.org',
+ },
+ }
+
+ def setUp(self):
+ """Set up the unit test."""
+ self.orig_fake_user_agent_exceptions = config.fake_user_agent_exceptions
+ super(LiveFakeUserAgentTestCase, self).setUp()
+
+ def tearDown(self):
+ """Tear down unit test."""
+ config.fake_user_agent_exceptions = self.orig_fake_user_agent_exceptions
+ super(LiveFakeUserAgentTestCase, self).tearDown()
+
+ def _test_fetch_use_fake_user_agent(self):
+ """Test `use_fake_user_agent` argument of http.fetch."""
+ # Existing headers
+ r = http.fetch(
+ 'http://httpbin.org/status/200', headers={'user-agent': 'EXISTING'})
+ self.assertEqual(r.headers['user-agent'], 'EXISTING')
+
+ # Argument value changes
+ r = http.fetch('http://httpbin.org/status/200', use_fake_user_agent=True)
+ self.assertNotEqual(r.headers['user-agent'], http.user_agent())
+ r = http.fetch('http://httpbin.org/status/200', use_fake_user_agent=False)
+ self.assertEqual(r.headers['user-agent'], http.user_agent())
+ r = http.fetch(
+ 'http://httpbin.org/status/200', use_fake_user_agent='ARBITRARY')
+ self.assertEqual(r.headers['user-agent'], 'ARBITRARY')
+
+ # Manually overridden domains
+ config.fake_user_agent_exceptions = {'httpbin.org': 'OVERRIDDEN'}
+ r = http.fetch(
+ 'http://httpbin.org/status/200', use_fake_user_agent=False)
+ self.assertEqual(r.headers['user-agent'], 'OVERRIDDEN')
+
+ @require_modules('browseragents')
+ def test_fetch_with_browseragents(self):
+ """Test method with browseragents module."""
+ self._test_fetch_use_fake_user_agent()
+
+ @require_modules('fake_useragent')
+ def test_fetch_with_fake_useragent(self):
+ """Test method with fake_useragent module."""
+ self._test_fetch_use_fake_user_agent()
+
+
+class GetFakeUserAgentTestCase(TestCase):
+
+ """Test the deprecated get_fake_user_agent()."""
+
+ net = False
+
def setUp(self):
"""Set up unit test."""
self.orig_fake_user_agent = config.fake_user_agent
+ super(GetFakeUserAgentTestCase, self).setUp()
def tearDown(self):
"""Tear down unit test."""
config.fake_user_agent = self.orig_fake_user_agent
+ super(GetFakeUserAgentTestCase, self).tearDown()
- def _test_fake_user_agent_config(self):
+ def _test_fake_user_agent_randomness(self):
+ """Test if user agent returns are randomized."""
+ config.fake_user_agent = True
+ self.assertNotEqual(http.get_fake_user_agent(), http.get_fake_user_agent())
+
+ def _test_config_settings(self):
"""Test if method honours configuration toggle."""
# ON: True and None in config are considered turned on.
config.fake_user_agent = True
@@ -315,25 +396,20 @@
# OFF: All other values won't make it return random UA.
config.fake_user_agent = False
self.assertEqual(http.get_fake_user_agent(), http.user_agent())
- config.fake_user_agent = 'ArbitraryValue'
- self.assertEqual(http.get_fake_user_agent(), 'ArbitraryValue')
-
- def _test_fake_user_agent_randomness(self):
- """Test if user agent returns are randomized."""
- config.fake_user_agent = True
- self.assertNotEqual(http.get_fake_user_agent(), http.get_fake_user_agent())
+ config.fake_user_agent = 'ARBITRARY'
+ self.assertEqual(http.get_fake_user_agent(), 'ARBITRARY')
@require_modules('browseragents')
def test_with_browseragents(self):
- """Test fake user agent generation with browseragents module."""
- self._test_fake_user_agent_config()
+ """Test method with browseragents module."""
self._test_fake_user_agent_randomness()
+ self._test_config_settings()
@require_modules('fake_useragent')
def test_with_fake_useragent(self):
- """Test fake user agent generation with fake_useragent module."""
- self._test_fake_user_agent_config()
+ """Test method with fake_useragent module."""
self._test_fake_user_agent_randomness()
+ self._test_config_settings()
class CharsetTestCase(TestCase):
--
To view, visit https://gerrit.wikimedia.org/r/325241
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I28594fd1b5ccb6ed3e885db5600bb0464dccfa0e
Gerrit-PatchSet: 17
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dargasia <thx(a)riseup.net>
Gerrit-Reviewer: Dargasia <thx(a)riseup.net>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>
jenkins-bot has submitted this change and it was merged.
Change subject: template.py: fix failed substitution in <poem> tag
......................................................................
template.py: fix failed substitution in <poem> tag
Bug: T151931
Change-Id: If8daaecb3ce343a0369c6f0ed60432126ed9abe0
---
M scripts/template.py
1 file changed, 5 insertions(+), 3 deletions(-)
Approvals:
Dalba: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/template.py b/scripts/template.py
index c4bbc6c..66f3e95 100755
--- a/scripts/template.py
+++ b/scripts/template.py
@@ -17,7 +17,9 @@
-subst Resolves the template by putting its text directly into the
article. This is done by changing {{...}} or {{msg:...}} into
- {{subst:...}}
+ {{subst:...}}.
+ Substitution is not available inside <ref>...</ref>,
+ <gallery>...</gallery> and <poem>...</poem> tags.
-assubst Replaces the first argument as old template with the second
argument as new template but substitutes it like -subst does.
@@ -221,11 +223,11 @@
if self.getOption('subst') and self.getOption('remove'):
replacements.append((templateRegex,
r'{{subst:%s\g<parameters>}}' % new))
- exceptions['inside-tags'] = ['ref', 'gallery']
+ exceptions['inside-tags'] = ['ref', 'gallery', 'poem']
elif self.getOption('subst'):
replacements.append((templateRegex,
r'{{subst:%s\g<parameters>}}' % old))
- exceptions['inside-tags'] = ['ref', 'gallery']
+ exceptions['inside-tags'] = ['ref', 'gallery', 'poem']
elif self.getOption('remove'):
replacements.append((templateRegex, ''))
else:
--
To view, visit https://gerrit.wikimedia.org/r/324267
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If8daaecb3ce343a0369c6f0ed60432126ed9abe0
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>