jenkins-bot has submitted this change and it was merged.
Change subject: Set user-agent and convert reflinks.py to use requests
......................................................................
Set user-agent and convert reflinks.py to use requests
* Create get_fake_user_agent function
* Create default fake_user_agent config variable
* Removal of 'b' prefix in self.CHARSET and addition of str() in line 625 are Python 3 fixes
* The switch to requests means ftp links are no longer supported
Bug: T113596
Bug: T111300
Bug: T118674
Change-Id: I09e0954f37f8a0fa9dc7554693a8de7b27dfd500
---
M pywikibot/comms/http.py
M pywikibot/config2.py
M scripts/reflinks.py
3 files changed, 77 insertions(+), 53 deletions(-)
Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified
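For context, here is a minimal sketch (not part of this change) of how the new
fake_user_agent setting and the get_fake_user_agent() helper introduced below
fit together; the browser string is only an illustrative value, not a project
default:

    # Illustrative only: exercises the new config variable and helper
    # added by this change.  The user-agent string below is made up.
    from pywikibot import config2 as config
    from pywikibot.comms import http

    # A string value is returned verbatim ...
    config.fake_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) ExampleBot/1.0'
    print(http.get_fake_user_agent())  # -> the string set above

    # ... while the default (None) picks a random agent from the optional
    # browseragents / fake_useragent packages when installed, and otherwise
    # falls back to the regular pywikibot user_agent() string.
    config.fake_user_agent = None
    print(http.get_fake_user_agent())

Per the config2.py comment in the diff, setting fake_user_agent = False keeps
the automatic (random) user agents disabled.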
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 42f48d8..0d20e5b 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -52,7 +52,12 @@
     FatalServerError, Server504Error, Server414Error
 )
 from pywikibot.logging import critical, debug, error, log, warning
-from pywikibot.tools import deprecate_arg, issue_deprecation_warning, PY2
+from pywikibot.tools import (
+    deprecate_arg,
+    issue_deprecation_warning,
+    PY2,
+    StringTypes,
+)
import pywikibot.version
@@ -199,6 +204,33 @@
     return formatted
+def get_fake_user_agent():
+    """
+    Return a user agent to be used when faking a web browser.
+
+    @rtype: str
+    """
+    # Check fake_user_agent configuration variable
+    if isinstance(config.fake_user_agent, StringTypes):
+        return pywikibot.config2.fake_user_agent
+
+    if config.fake_user_agent is None or config.fake_user_agent is True:
+        try:
+            import browseragents
+            return browseragents.core.random()
+        except ImportError:
+            pass
+
+        try:
+            import fake_useragent
+            return fake_useragent.fake.UserAgent().random
+        except ImportError:
+            pass
+
+    # Use the default real user agent
+    return user_agent()
+
+
 @deprecate_arg('ssl', None)
 def request(site=None, uri=None, method='GET', body=None, headers=None,
             **kwargs):
diff --git a/pywikibot/config2.py b/pywikibot/config2.py
index 5547271..9b73444 100644
--- a/pywikibot/config2.py
+++ b/pywikibot/config2.py
@@ -137,6 +137,17 @@
 user_agent_format = ('{script_product} ({script_comments}) {pwb} ({revision}) '
                      '{http_backend} {python}')
+# Fake user agent
+# Used to retrieve pages in reflinks.py,
+# to work around user-agent sniffing webpages
+# When None or True,
+# Use random user agent if either browseragents or fake_useragent
+# packages are installed
+# Otherwise use pywikibot.comms.http.user_agent()
+# When set to False,
+# disables use of automatic user agents
+fake_user_agent = None
+
 # The default interface for communicating with the site
 # currently the only defined interface is 'APISite', so don't change this!
 site_interface = 'APISite'
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 27a3702..0474deb 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -47,8 +47,6 @@
 #
 import codecs
-import gzip
-import io
 import os
 import re
 import socket
@@ -60,22 +58,22 @@
import pywikibot
-from pywikibot import i18n, pagegenerators, textlib, Bot
+from pywikibot import comms, i18n, pagegenerators, textlib, Bot
 from pywikibot.pagegenerators import (
     XMLDumpPageGenerator as _XMLDumpPageGenerator,
 )
 from pywikibot.tools.formatter import color_format
+import requests
+
 from scripts import noreferences
 if sys.version_info[0] > 2:
-    from urllib.parse import quote
-    from urllib.request import urlopen
-    from urllib.error import HTTPError, URLError
     import http.client as httplib
+    from urllib.error import URLError
 else:
-    from urllib2 import quote, urlopen, HTTPError, URLError
     import httplib
+    from urllib2 import URLError
 docuReplacements = {
     '&params;': pagegenerators.parameterHelp
@@ -178,7 +176,7 @@
 # Regex that match bare references
 linksInRef = re.compile(
     # bracketed URLs
-    r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
+    r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https)://(?:' +
     # unbracketed with()
     r'^[]\s<>"]+([^[]\s<>"]+[^[]\s.:;\,<>?"]+|' +
     # unbracketed without ()
@@ -394,6 +392,8 @@
         super(ReferencesRobot, self).__init__(**kwargs)
         self.generator = generator
         self.site = pywikibot.Site()
+        self._user_agent = comms.http.get_fake_user_agent()
+        pywikibot.log('Using fake user agent: {0}'.format(self._user_agent))
         # Check
         manual = 'mw:Manual:Pywikibot/refLinks'
         code = None
@@ -428,7 +428,7 @@
         # Regex to grasp content-type meta HTML tag in HTML source
         self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content-type[^>]*>')
         # Extract the encoding from a charset property (from content-type !)
-        self.CHARSET = re.compile(br'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
+        self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
         # Extract html title from page
         self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
         # Matches content inside <script>/<style>/HTML comments
@@ -454,7 +454,8 @@
         pywikibot.output(u'PDF file.')
         fd, infile = tempfile.mkstemp()
         urlobj = os.fdopen(fd, 'r+w')
-        urlobj.write(f.read())
+        urlobj.write(f.content)
+
         try:
             pdfinfo_out = subprocess.Popen([r"pdfinfo", "/dev/stdin"],
                                            stdin=urlobj,
                                            stdout=subprocess.PIPE,
@@ -488,8 +489,9 @@
                 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
                 'and to ungzip it in the same directory')
             raise
-        socket.setdefaulttimeout(30)
+
         editedpages = 0
+        headers = {'user-agent': self._user_agent}
         for page in self.generator:
             try:
                 # Load the page's text from the wiki
@@ -519,19 +521,12 @@
                 ref = RefLink(link, match.group('name'))
                 f = None
+
                 try:
-                    socket.setdefaulttimeout(20)
-                    try:
-                        f = urlopen(ref.url.decode("utf8"))
-                    except UnicodeError:
-                        ref.url = quote(ref.url.encode("utf8"), "://")
-                        f = urlopen(ref.url)
+                    f = requests.get(ref.url, headers=headers, timeout=60)
+
                     # Try to get Content-Type from server
-                    headers = f.info()
-                    if sys.version_info[0] > 2:
-                        contentType = headers.get_content_type()
-                    else:
-                        contentType = headers.getheader('Content-Type')
+                    contentType = f.headers.get('content-type')
                     if contentType and not self.MIME.search(contentType):
                         if ref.link.lower().endswith('.pdf') and \
                            not self.getOption('ignorepdf'):
@@ -556,8 +551,9 @@
                             repl = ref.refLink()
                             new_text = new_text.replace(match.group(), repl)
                             continue
+
                     # Get the real url where we end (http redirects !)
-                    redir = f.geturl()
+                    redir = f.url
                     if redir != ref.link and \
                        domain.findall(redir) == domain.findall(link):
                         if soft404.search(redir) and \
@@ -573,37 +569,26 @@
                                 u'Redirect to root : {0} ', ref.link))
                             continue
-                    # uncompress if necessary
-                    if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
-                        # XXX: small issue here: the whole page is downloaded
-                        # through f.read(). It might fetch big files/pages.
-                        # However, truncating an encoded gzipped stream is not
-                        # an option, or unzipping will fail.
-                        compressed = io.BytesIO(f.read())
-                        f = gzip.GzipFile(fileobj=compressed)
+                    if f.status_code != requests.codes.ok:
+                        pywikibot.output(u'HTTP error (%s) for %s on %s'
+                                         % (f.status_code, ref.url,
+                                            page.title(asLink=True)),
+                                         toStdout=True)
+                        # 410 Gone, indicates that the resource has been purposely
+                        # removed
+                        if f.status_code == 410 or \
+                           (f.status_code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
+                            repl = ref.refDead()
+                            new_text = new_text.replace(match.group(), repl)
+                        continue
-                    # Read the first 1,000,000 bytes (0.95 MB)
-                    linkedpagetext = f.read(1000000)
-                    socket.setdefaulttimeout(None)
-
+                    linkedpagetext = f.content
                 except UnicodeError:
                     # example : http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html
                     # in [[fr:Cyanure]]
                     pywikibot.output(color_format(
                         '{lightred}Bad link{default} : %s in %s',
                         ref.url, page.title(asLink=True)))
-                    continue
-                except HTTPError as e:
-                    pywikibot.output(u'HTTP error (%s) for %s on %s'
-                                     % (e.code, ref.url,
-                                        page.title(asLink=True)),
-                                     toStdout=True)
-                    # 410 Gone, indicates that the resource has been purposely
-                    # removed
-                    if e.code == 410 or \
-                       (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
-                        repl = ref.refDead()
-                        new_text = new_text.replace(match.group(), repl)
                     continue
                 except (URLError,
                         socket.error,
@@ -616,10 +601,6 @@
                     # Known bug of httplib, google for :
                     # "httplib raises ValueError reading chunked content"
                     continue
-                finally:
-                    if f:
-                        f.close()
-
                 # remove <script>/<style>/comments/CDATA tags
                 linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
@@ -636,7 +617,7 @@
                     contentType = tag
                 if not s:
                     # use charset from html
-                    s = self.CHARSET.search(tag)
+                    s = self.CHARSET.search(str(tag))
                 if s:
                     tmp = s.group('enc').strip('"\' ').lower()
                     naked = re.sub(r'[ _-]', '', tmp)
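For readers unfamiliar with requests: the fetch loop above boils down to the
pattern sketched below (the URL is only a placeholder). requests follows
redirects and transparently decompresses gzip responses, which is why the
urllib, gzip and socket-timeout handling could be dropped from reflinks.py:

    # Standalone sketch of the new fetch pattern in reflinks.py; the URL
    # is just an example and not taken from the script.
    import requests

    from pywikibot.comms import http

    headers = {'user-agent': http.get_fake_user_agent()}
    f = requests.get('http://example.org/some/reference',
                     headers=headers, timeout=60)

    if f.status_code != requests.codes.ok:
        print('HTTP error (%s)' % f.status_code)
    else:
        print(f.headers.get('content-type'))  # server-reported MIME type
        print(f.url)                          # final URL after any redirects
        linkedpagetext = f.content            # body bytes, already ungzipped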