jenkins-bot has submitted this change and it was merged.
Change subject: Set user-agent and convert reflinks.py to use requests ......................................................................
Set user-agent and convert reflinks.py to use requests
* Create get_fake_user_agent function
* Create default fake user agent config variable (fake_user_agent)
* The removal of 'b' in self.CHARSET and the addition of str() at line 625 are Python 3 fixes
* Switching to requests means ftp links are no longer supported
Bug: T113596 Bug: T111300 Bug: T118674 Change-Id: I09e0954f37f8a0fa9dc7554693a8de7b27dfd500 --- M pywikibot/comms/http.py M pywikibot/config2.py M scripts/reflinks.py 3 files changed, 77 insertions(+), 53 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py index 42f48d8..0d20e5b 100644 --- a/pywikibot/comms/http.py +++ b/pywikibot/comms/http.py @@ -52,7 +52,12 @@ FatalServerError, Server504Error, Server414Error ) from pywikibot.logging import critical, debug, error, log, warning -from pywikibot.tools import deprecate_arg, issue_deprecation_warning, PY2 +from pywikibot.tools import ( + deprecate_arg, + issue_deprecation_warning, + PY2, + StringTypes, +)
import pywikibot.version
@@ -199,6 +204,33 @@ return formatted
+def get_fake_user_agent(): + """ + Return a user agent to be used when faking a web browser. + + @rtype: str + """ + # Check fake_user_agent configuration variable + if isinstance(config.fake_user_agent, StringTypes): + return pywikibot.config2.fake_user_agent + + if config.fake_user_agent is None or config.fake_user_agent is True: + try: + import browseragents + return browseragents.core.random() + except ImportError: + pass + + try: + import fake_useragent + return fake_useragent.fake.UserAgent().random + except ImportError: + pass + + # Use the default real user agent + return user_agent() + + @deprecate_arg('ssl', None) def request(site=None, uri=None, method='GET', body=None, headers=None, **kwargs): diff --git a/pywikibot/config2.py b/pywikibot/config2.py index 5547271..9b73444 100644 --- a/pywikibot/config2.py +++ b/pywikibot/config2.py @@ -137,6 +137,17 @@ user_agent_format = ('{script_product} ({script_comments}) {pwb} ({revision}) ' '{http_backend} {python}')
+# Fake user agent +# Used to retrieve pages in reflinks.py, +# to work around user-agent sniffing webpages +# When None or True, +# Use random user agent if either browseragents or fake_useragent +# packages are installed +# Otherwise use pywikibot.comms.http.user_agent() +# When set to False, +# disables use of automatic user agents +fake_user_agent = None + # The default interface for communicating with the site # currently the only defined interface is 'APISite', so don't change this! site_interface = 'APISite' diff --git a/scripts/reflinks.py b/scripts/reflinks.py index 27a3702..0474deb 100755 --- a/scripts/reflinks.py +++ b/scripts/reflinks.py @@ -47,8 +47,6 @@ #
import codecs -import gzip -import io import os import re import socket @@ -60,22 +58,22 @@
import pywikibot
-from pywikibot import i18n, pagegenerators, textlib, Bot +from pywikibot import comms, i18n, pagegenerators, textlib, Bot from pywikibot.pagegenerators import ( XMLDumpPageGenerator as _XMLDumpPageGenerator, ) from pywikibot.tools.formatter import color_format
+import requests + from scripts import noreferences
if sys.version_info[0] > 2: - from urllib.parse import quote - from urllib.request import urlopen - from urllib.error import HTTPError, URLError import http.client as httplib + from urllib.error import URLError else: - from urllib2 import quote, urlopen, HTTPError, URLError import httplib + from urllib2 import URLError
docuReplacements = { '&params;': pagegenerators.parameterHelp @@ -178,7 +176,7 @@ # Regex that match bare references linksInRef = re.compile( # bracketed URLs - r'(?i)<ref(?P<name>[^>]*)>\s*[?(?P<url>(?:http|https|ftp)://(?:' + + r'(?i)<ref(?P<name>[^>]*)>\s*[?(?P<url>(?:http|https)://(?:' + # unbracketed with() r'^[]\s<>"]+([^[]\s<>"]+[^[]\s.:;\,<>?"]+|' + # unbracketed without () @@ -394,6 +392,8 @@ super(ReferencesRobot, self).__init__(**kwargs) self.generator = generator self.site = pywikibot.Site() + self._user_agent = comms.http.get_fake_user_agent() + pywikibot.log('Using fake user agent: {0}'.format(self._user_agent)) # Check manual = 'mw:Manual:Pywikibot/refLinks' code = None @@ -428,7 +428,7 @@ # Regex to grasp content-type meta HTML tag in HTML source self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content-type[^>]*>') # Extract the encoding from a charset property (from content-type !) - self.CHARSET = re.compile(br'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)') + self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)') # Extract html title from page self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)') # Matches content inside <script>/<style>/HTML comments @@ -454,7 +454,8 @@ pywikibot.output(u'PDF file.') fd, infile = tempfile.mkstemp() urlobj = os.fdopen(fd, 'r+w') - urlobj.write(f.read()) + urlobj.write(f.content) + try: pdfinfo_out = subprocess.Popen([r"pdfinfo", "/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, @@ -488,8 +489,9 @@ 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz ' 'and to ungzip it in the same directory') raise - socket.setdefaulttimeout(30) + editedpages = 0 + headers = {'user-agent': self._user_agent} for page in self.generator: try: # Load the page's text from the wiki @@ -519,19 +521,12 @@
ref = RefLink(link, match.group('name')) f = None + try: - socket.setdefaulttimeout(20) - try: - f = urlopen(ref.url.decode("utf8")) - except UnicodeError: - ref.url = quote(ref.url.encode("utf8"), "://") - f = urlopen(ref.url) + f = requests.get(ref.url, headers=headers, timeout=60) + # Try to get Content-Type from server - headers = f.info() - if sys.version_info[0] > 2: - contentType = headers.get_content_type() - else: - contentType = headers.getheader('Content-Type') + contentType = f.headers.get('content-type') if contentType and not self.MIME.search(contentType): if ref.link.lower().endswith('.pdf') and \ not self.getOption('ignorepdf'): @@ -556,8 +551,9 @@ repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue + # Get the real url where we end (http redirects !) - redir = f.geturl() + redir = f.url if redir != ref.link and \ domain.findall(redir) == domain.findall(link): if soft404.search(redir) and \ @@ -573,37 +569,26 @@ u'Redirect to root : {0} ', ref.link)) continue
- # uncompress if necessary - if headers.get('Content-Encoding') in ('gzip', 'x-gzip'): - # XXX: small issue here: the whole page is downloaded - # through f.read(). It might fetch big files/pages. - # However, truncating an encoded gzipped stream is not - # an option, or unzipping will fail. - compressed = io.BytesIO(f.read()) - f = gzip.GzipFile(fileobj=compressed) + if f.status_code != requests.codes.ok: + pywikibot.output(u'HTTP error (%s) for %s on %s' + % (f.status_code, ref.url, + page.title(asLink=True)), + toStdout=True) + # 410 Gone, indicates that the resource has been purposely + # removed + if f.status_code == 410 or \ + (f.status_code == 404 and (u'\t%s\t' % ref.url in deadLinks)): + repl = ref.refDead() + new_text = new_text.replace(match.group(), repl) + continue
- # Read the first 1,000,000 bytes (0.95 MB) - linkedpagetext = f.read(1000000) - socket.setdefaulttimeout(None) - + linkedpagetext = f.content except UnicodeError: # example : http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html # in [[fr:Cyanure]] pywikibot.output(color_format( '{lightred}Bad link{default} : %s in %s', ref.url, page.title(asLink=True))) - continue - except HTTPError as e: - pywikibot.output(u'HTTP error (%s) for %s on %s' - % (e.code, ref.url, - page.title(asLink=True)), - toStdout=True) - # 410 Gone, indicates that the resource has been purposely - # removed - if e.code == 410 or \ - (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)): - repl = ref.refDead() - new_text = new_text.replace(match.group(), repl) continue except (URLError, socket.error, @@ -616,10 +601,6 @@ # Known bug of httplib, google for : # "httplib raises ValueError reading chunked content" continue - finally: - if f: - f.close() - # remove <script>/<style>/comments/CDATA tags linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
@@ -636,7 +617,7 @@ contentType = tag if not s: # use charset from html - s = self.CHARSET.search(tag) + s = self.CHARSET.search(str(tag)) if s: tmp = s.group('enc').strip("\"' ").lower() naked = re.sub(r'[ _-]', '', tmp)