jenkins-bot has submitted this change and it was merged.
Change subject: Set user-agent and convert reflinks.py to use requests
......................................................................
Set user-agent and convert reflinks.py to use requests
* Create get_fake_user_agent function
* Create default fake_user_agent config variable
* Removal of the 'b' prefix in self.CHARSET and the addition of str() in line 625 are Python 3 fixes
* The switch to requests means ftp:// references are no longer supported
Bug: T113596
Bug: T111300
Bug: T118674
Change-Id: I09e0954f37f8a0fa9dc7554693a8de7b27dfd500
---
M pywikibot/comms/http.py
M pywikibot/config2.py
M scripts/reflinks.py
3 files changed, 77 insertions(+), 53 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 42f48d8..0d20e5b 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -52,7 +52,12 @@
FatalServerError, Server504Error, Server414Error
)
from pywikibot.logging import critical, debug, error, log, warning
-from pywikibot.tools import deprecate_arg, issue_deprecation_warning, PY2
+from pywikibot.tools import (
+ deprecate_arg,
+ issue_deprecation_warning,
+ PY2,
+ StringTypes,
+)
import pywikibot.version
@@ -199,6 +204,33 @@
return formatted
+def get_fake_user_agent():
+ """
+ Return a user agent to be used when faking a web browser.
+
+ @rtype: str
+ """
+ # Check fake_user_agent configuration variable
+ if isinstance(config.fake_user_agent, StringTypes):
+ return config.fake_user_agent
+
+ if config.fake_user_agent is None or config.fake_user_agent is True:
+ try:
+ import browseragents
+ return browseragents.core.random()
+ except ImportError:
+ pass
+
+ try:
+ import fake_useragent
+ return fake_useragent.fake.UserAgent().random
+ except ImportError:
+ pass
+
+ # Use the default real user agent
+ return user_agent()
+
+
@deprecate_arg('ssl', None)
def request(site=None, uri=None, method='GET', body=None, headers=None,
**kwargs):
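
For reviewers, a minimal sketch of how the new helper resolves, assuming
neither the browseragents nor the fake_useragent package is installed
(the agent string below is a placeholder):

    from pywikibot import config
    from pywikibot.comms import http

    # A configured string is returned as-is.
    config.fake_user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'  # placeholder
    assert http.get_fake_user_agent() == config.fake_user_agent

    # None/True try the optional packages first, then fall back to the
    # real agent; False skips the packages entirely.
    config.fake_user_agent = None
    agent = http.get_fake_user_agent()  # == http.user_agent() here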
diff --git a/pywikibot/config2.py b/pywikibot/config2.py
index 5547271..9b73444 100644
--- a/pywikibot/config2.py
+++ b/pywikibot/config2.py
@@ -137,6 +137,17 @@
user_agent_format = ('{script_product} ({script_comments}) {pwb} ({revision}) '
'{http_backend} {python}')
+# Fake user agent.
+# Used when fetching pages in reflinks.py, to work around webpages
+# that sniff the user agent.
+# A string value is used as-is.
+# When None or True, a random user agent is used if either the
+# browseragents or the fake_useragent package is installed;
+# otherwise pywikibot.comms.http.user_agent() is used.
+# When set to False, random browser agents are disabled and the
+# default pywikibot user agent is used.
+fake_user_agent = None
+
# The default interface for communicating with the site
# currently the only defined interface is 'APISite', so don't change this!
site_interface = 'APISite'
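
An example user-config.py override (the agent string is a placeholder):

    # Pin a fixed browser-like agent for reflinks.py:
    fake_user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:42.0) Gecko/20100101 Firefox/42.0'
    # or keep the default pywikibot agent:
    # fake_user_agent = False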
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 27a3702..0474deb 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -47,8 +47,6 @@
#
import codecs
-import gzip
-import io
import os
import re
import socket
@@ -60,22 +58,22 @@
import pywikibot
-from pywikibot import i18n, pagegenerators, textlib, Bot
+from pywikibot import comms, i18n, pagegenerators, textlib, Bot
from pywikibot.pagegenerators import (
XMLDumpPageGenerator as _XMLDumpPageGenerator,
)
from pywikibot.tools.formatter import color_format
+import requests
+
from scripts import noreferences
if sys.version_info[0] > 2:
- from urllib.parse import quote
- from urllib.request import urlopen
- from urllib.error import HTTPError, URLError
import http.client as httplib
+ from urllib.error import URLError
else:
- from urllib2 import quote, urlopen, HTTPError, URLError
import httplib
+ from urllib2 import URLError
docuReplacements = {
'¶ms;': pagegenerators.parameterHelp
@@ -178,7 +176,7 @@
# Regex that match bare references
linksInRef = re.compile(
# bracketed URLs
- r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
+ r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https)://(?:' +
# unbracketed with()
r'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|' +
# unbracketed without ()
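
A simplified stand-in for the pattern above, showing the effect of
dropping the ftp scheme (this is not the full linksInRef regex):

    import re

    pat = re.compile(r'(?i)<ref[^>]*>\s*\[?(?P<url>https?://\S+?)\s*\]?\s*</ref>')
    m = pat.search('<ref>http://example.org/paper.pdf</ref>')
    print(m.group('url'))                                # http://example.org/paper.pdf
    print(pat.search('<ref>ftp://example.org/x</ref>'))  # None: ftp refs are now skipped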
@@ -394,6 +392,8 @@
super(ReferencesRobot, self).__init__(**kwargs)
self.generator = generator
self.site = pywikibot.Site()
+ self._user_agent = comms.http.get_fake_user_agent()
+ pywikibot.log('Using fake user agent: {0}'.format(self._user_agent))
# Check
manual = 'mw:Manual:Pywikibot/refLinks'
code = None
@@ -428,7 +428,7 @@
# Regex to grasp content-type meta HTML tag in HTML source
self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
# Extract the encoding from a charset property (from content-type !)
- self.CHARSET = re.compile(br'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
+ self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
# Extract html title from page
self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
# Matches content inside <script>/<style>/HTML comments
@@ -454,7 +454,8 @@
pywikibot.output(u'PDF file.')
fd, infile = tempfile.mkstemp()
urlobj = os.fdopen(fd, 'r+w')
- urlobj.write(f.read())
+ urlobj.write(f.content)
+
try:
pdfinfo_out = subprocess.Popen([r"pdfinfo", "/dev/stdin"],
stdin=urlobj, stdout=subprocess.PIPE,
@@ -488,8 +489,9 @@
'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
'and to ungzip it in the same directory')
raise
- socket.setdefaulttimeout(30)
+
editedpages = 0
+ headers = {'user-agent': self._user_agent}
for page in self.generator:
try:
# Load the page's text from the wiki
@@ -519,19 +521,12 @@
ref = RefLink(link, match.group('name'))
f = None
+
try:
- socket.setdefaulttimeout(20)
- try:
- f = urlopen(ref.url.decode("utf8"))
- except UnicodeError:
- ref.url = quote(ref.url.encode("utf8"), "://")
- f = urlopen(ref.url)
+ f = requests.get(ref.url, headers=headers, timeout=60)
+
# Try to get Content-Type from server
- headers = f.info()
- if sys.version_info[0] > 2:
- contentType = headers.get_content_type()
- else:
- contentType = headers.getheader('Content-Type')
+ contentType = f.headers.get('content-type')
if contentType and not self.MIME.search(contentType):
if ref.link.lower().endswith('.pdf') and \
not self.getOption('ignorepdf'):
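
For context, the replacement fetch in miniature (URL and agent string
are placeholders):

    import requests

    headers = {'user-agent': 'Mozilla/5.0 (placeholder)'}
    f = requests.get('http://example.org', headers=headers, timeout=60)
    print(f.status_code)                  # e.g. 200
    print(f.headers.get('content-type'))  # server-reported MIME type
    print(f.url)                          # final URL after any redirects
    # f.content holds the body as bytes; requests decodes gzip/deflate
    # Content-Encoding itself, which is why the manual gzip block below
    # could be removed.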
@@ -556,8 +551,9 @@
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
+
# Get the real url where we end (http redirects !)
- redir = f.geturl()
+ redir = f.url
if redir != ref.link and \
domain.findall(redir) == domain.findall(link):
if soft404.search(redir) and \
@@ -573,37 +569,26 @@
u'Redirect to root : {0} ', ref.link))
continue
- # uncompress if necessary
- if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
- # XXX: small issue here: the whole page is downloaded
- # through f.read(). It might fetch big files/pages.
- # However, truncating an encoded gzipped stream is not
- # an option, or unzipping will fail.
- compressed = io.BytesIO(f.read())
- f = gzip.GzipFile(fileobj=compressed)
+ if f.status_code != requests.codes.ok:
+ pywikibot.output(u'HTTP error (%s) for %s on %s'
+ % (f.status_code, ref.url,
+ page.title(asLink=True)),
+ toStdout=True)
+ # 410 Gone, indicates that the resource has been purposely
+ # removed
+ if f.status_code == 410 or \
+ (f.status_code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
+ repl = ref.refDead()
+ new_text = new_text.replace(match.group(), repl)
+ continue
- # Read the first 1,000,000 bytes (0.95 MB)
- linkedpagetext = f.read(1000000)
- socket.setdefaulttimeout(None)
-
+ linkedpagetext = f.content
except UnicodeError:
# example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
# in [[fr:Cyanure]]
pywikibot.output(color_format(
'{lightred}Bad link{default} : %s in %s',
ref.url, page.title(asLink=True)))
- continue
- except HTTPError as e:
- pywikibot.output(u'HTTP error (%s) for %s on %s'
- % (e.code, ref.url,
- page.title(asLink=True)),
- toStdout=True)
- # 410 Gone, indicates that the resource has been purposely
- # removed
- if e.code == 410 or \
- (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
- repl = ref.refDead()
- new_text = new_text.replace(match.group(), repl)
continue
except (URLError,
socket.error,
@@ -616,10 +601,6 @@
# Known bug of httplib, google for :
# "httplib raises ValueError reading chunked content"
continue
- finally:
- if f:
- f.close()
-
# remove <script>/<style>/comments/CDATA tags
linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
@@ -636,7 +617,7 @@
contentType = tag
if not s:
# use charset from html
- s = self.CHARSET.search(tag)
+ s = self.CHARSET.search(str(tag))
if s:
tmp = s.group('enc').strip("\"' ").lower()
naked = re.sub(r'[ _\-]', '', tmp)
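
The Python 3 motivation for the CHARSET change, sketched with a made-up
meta tag:

    import re

    CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
    tag = b'<meta content="text/html; charset=utf-8">'  # hypothetical tag
    # On Python 3, a str pattern cannot search bytes, hence str(tag):
    s = CHARSET.search(str(tag))
    print(s.group('enc').strip('"\' ').lower())  # utf-8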
--
To view, visit https://gerrit.wikimedia.org/r/264251
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I09e0954f37f8a0fa9dc7554693a8de7b27dfd500
Gerrit-PatchSet: 43
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: MtDu <justin.d128(a)gmail.com>
Gerrit-Reviewer: 8ohit.dua <8ohit.dua(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: MtDu <justin.d128(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>