jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/636460 )
Change subject: [IMPR] Rename ReferencesRobot.run to treat
......................................................................
[IMPR] Rename ReferencesRobot.run to treat
This enables quitting the script without raising an exception.
Bug: T196851
Bug: T171713
Change-Id: I613507e22967b248a9c1c5a136ecc64d46e48bb6
---
M scripts/reflinks.py
1 file changed, 247 insertions(+), 252 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 7b9c621..881db61 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -504,263 +504,258 @@
                 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n'
                 'and to unzip it in the same directory')
-    def run(self):
-        """Run the Bot."""
-        self.setup()
-        editedpages = 0
-        for page in self.generator:
-            try:
-                # Load the page's text from the wiki
-                new_text = page.get()
-                if not page.has_permission():
-                    pywikibot.output("You can't edit page "
-                                     + page.title(as_link=True))
-                    continue
-            except pywikibot.NoPage:
-                pywikibot.output('Page {} not found'
-                                 .format(page.title(as_link=True)))
-                continue
-            except pywikibot.IsRedirectPage:
-                pywikibot.output('Page {} is a redirect'
-                                 .format(page.title(as_link=True)))
-                continue
+    def treat(self, page):
+        """Process one page."""
+        try:
+            # Load the page's text from the wiki
+            new_text = page.get()
+            if not page.has_permission():
+                pywikibot.output("You can't edit page "
+                                 + page.title(as_link=True))
+                return
+        except pywikibot.NoPage:
+            pywikibot.output('Page {} not found'
+                             .format(page.title(as_link=True)))
+            return
+        except pywikibot.IsRedirectPage:
+            pywikibot.output('Page {} is a redirect'
+                             .format(page.title(as_link=True)))
+            return
-            # for each link to change
-            for match in linksInRef.finditer(
-                    textlib.removeDisabledParts(page.get())):
+        # for each link to change
+        for match in linksInRef.finditer(
+                textlib.removeDisabledParts(page.get())):
-                link = match.group('url')
-                # debugging purpose
-                # print link
-                if 'jstor.org' in link:
-                    # TODO: Clean URL blacklist
-                    continue
-
-                ref = RefLink(link, match.group('name'), site=self.site)
-
-                try:
-                    f = comms.http.fetch(
-                        ref.url, use_fake_user_agent=self._use_fake_user_agent)
-
-                    # Try to get Content-Type from server
-                    content_type = f.response_headers.get('content-type')
-                    if content_type and not self.MIME.search(content_type):
-                        if ref.link.lower().endswith('.pdf') and \
-                                not self.opt.ignorepdf:
-                            # If file has a PDF suffix
-                            self.getPDFTitle(ref, f)
-                        else:
-                            pywikibot.output(color_format(
-                                '{lightyellow}WARNING{default} : '
-                                'media : {0} ', ref.link))
-                        if ref.title:
-                            if not re.match(
-                                    '(?i) *microsoft (word|excel|visio)',
-                                    ref.title):
-                                ref.transform(ispdf=True)
-                                repl = ref.refTitle()
-                            else:
-                                pywikibot.output(color_format(
-                                    '{lightyellow}WARNING{default} : '
-                                    'PDF title blacklisted : {0} ', ref.title))
-                                repl = ref.refLink()
-                        else:
-                            repl = ref.refLink()
-                        new_text = new_text.replace(match.group(), repl)
-                        continue
-
-                    # Get the real url where we end (http redirects !)
-                    redir = f.data.url
-                    if redir != ref.link and \
-                            domain.findall(redir) == domain.findall(link):
-                        if soft404.search(redir) and \
-                                not soft404.search(ref.link):
-                            pywikibot.output(color_format(
-                                '{lightyellow}WARNING{default} : '
-                                'Redirect 404 : {0} ', ref.link))
-                            continue
-                        if dirIndex.match(redir) and \
-                                not dirIndex.match(ref.link):
-                            pywikibot.output(color_format(
-                                '{lightyellow}WARNING{default} : '
-                                'Redirect to root : {0} ', ref.link))
-                            continue
-
-                    if f.status_code != codes.ok:
-                        pywikibot.stdout('HTTP error ({}) for {} on {}'
-                                         .format(f.status_code, ref.url,
-                                                 page.title(as_link=True)))
-                        # 410 Gone, indicates that the resource has been
-                        # purposely removed
-                        if f.status_code == 410 \
-                                or (f.status_code == 404
-                                    and '\t{}\t'.format(
-                                        ref.url) in self.dead_links):
-                            repl = ref.refDead()
-                            new_text = new_text.replace(match.group(), repl)
-                        continue
-
-                    linkedpagetext = f.raw
-                except UnicodeError:
-                    # example:
-                    # http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html
-                    # in [[fr:Cyanure]]
-                    pywikibot.output(color_format(
-                        '{lightred}Bad link{default} : {0} in {1}',
-                        ref.url, page.title(as_link=True)))
-                    continue
-                except (URLError,
-                        socket.error,
-                        IOError,
-                        httplib.error,
-                        pywikibot.FatalServerError,
-                        pywikibot.Server414Error,
-                        pywikibot.Server504Error) as e:
-                    pywikibot.output("Can't retrieve page {} : {}"
-                                     .format(ref.url, e))
-                    continue
-
-                # remove <script>/<style>/comments/CDATA tags
-                linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
-
-                meta_content = self.META_CONTENT.search(linkedpagetext)
-                enc = []
-                s = None
-                if content_type:
-                    # use charset from http header
-                    s = self.CHARSET.search(content_type)
-                if meta_content:
-                    tag = meta_content.group().decode()
-                    # Prefer the contentType from the HTTP header :
-                    if not content_type:
-                        content_type = tag
-                    if not s:
-                        # use charset from html
-                        s = self.CHARSET.search(tag)
-                if s:
-                    tmp = s.group('enc').strip('"\' ').lower()
-                    naked = re.sub(r'[ _-]', '', tmp)
-                    # Convert to python correct encoding names
-                    if naked == 'gb2312':
-                        enc.append('gbk')
-                    elif naked == 'shiftjis':
-                        enc.append('shift jis 2004')
-                        enc.append('cp932')
-                    elif naked == 'xeucjp':
-                        enc.append('euc-jp')
-                    else:
-                        enc.append(tmp)
-                else:
-                    pywikibot.output('No charset found for ' + ref.link)
-
-                if not content_type:
-                    pywikibot.output('No content-type found for ' + ref.link)
-                    continue
-
-                if not self.MIME.search(content_type):
-                    pywikibot.output(color_format(
-                        '{lightyellow}WARNING{default} : media : {0} ',
-                        ref.link))
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    continue
-
-                # Ugly hacks to try to survive when both server and page
-                # return no encoding.
-                # Uses most used encodings for each national suffix
-                if '.ru' in ref.link or '.su' in ref.link:
-                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
-                    # encoding, no page encoding
-                    enc = enc + ['koi8-r', 'windows-1251']
-                elif '.jp' in ref.link:
-                    enc.append('shift jis 2004')
-                    enc.append('cp932')
-                elif '.kr' in ref.link:
-                    enc.append('euc-kr')
-                    enc.append('cp949')
-                elif '.zh' in ref.link:
-                    enc.append('gbk')
-
-                if 'utf-8' not in enc:
-                    enc.append('utf-8')
-                try:
-                    u = linkedpagetext.decode(enc[0])   # Bug T69410
-                except (UnicodeDecodeError, LookupError) as e:
-                    pywikibot.output('{} : Decoding error - {}'
-                                     .format(ref.link, e))
-                    continue
-
-                # Retrieves the first non empty string inside <title> tags
-                for m in self.TITLE.finditer(u):
-                    t = m.group()
-                    if t:
-                        ref.title = t
-                        ref.transform()
-                        if ref.title:
-                            break
-
-                if not ref.title:
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    pywikibot.output('{} : No title found...'.format(ref.link))
-                    continue
-
-                # XXX Ugly hack
-                if 'é' in ref.title:
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    pywikibot.output('{} : Hybrid encoding...'
-                                     .format(ref.link))
-                    continue
-
-                if self.titleBlackList.match(ref.title):
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    pywikibot.output(color_format(
-                        '{lightred}WARNING{default} {0} : '
-                        'Blacklisted title ({1})', ref.link, ref.title))
-                    continue
-
-                # Truncate long titles. 175 is arbitrary
-                if len(ref.title) > 175:
-                    ref.title = ref.title[:175] + '...'
-
-                repl = ref.refTitle()
-                new_text = new_text.replace(match.group(), repl)
-
-            # Add <references/> when needed, but ignore templates !
-            if page.namespace != 10:
-                if self.norefbot.lacksReferences(new_text):
-                    new_text = self.norefbot.addReferences(new_text)
-
-            new_text = self.deduplicator.process(new_text)
-            old_text = page.text
-
-            self.userPut(page, old_text, new_text, summary=self.msg,
-                         ignore_save_related_errors=True,
-                         ignore_server_errors=True)
-
-            if new_text == old_text:
-                continue
-            else:
-                editedpages += 1
-
-            if self.opt.limit and editedpages >= self.opt.limit:
-                pywikibot.output('Edited {} pages, stopping.'
-                                 .format(self.opt.limit))
+            link = match.group('url')
+            # debugging purpose
+            # print link
+            if 'jstor.org' in link:
+                # TODO: Clean URL blacklist
                 return
-            if self.site_stop_page and editedpages % 20 == 0:
-                self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
-                if self.stop_page.exists():
-                    pywikibot.output(color_format(
-                        '{lightgreen}Checking stop page...{default}'))
-                    actual_rev = self.stop_page.latest_revision_id
-                    if actual_rev != self.stop_page_rev_id:
-                        pywikibot.output(
-                            '{} has been edited: Someone wants us to stop.'
-                            .format(self.stop_page.title(as_link=True)))
+            ref = RefLink(link, match.group('name'), site=self.site)
+
+            try:
+                f = comms.http.fetch(
+                    ref.url, use_fake_user_agent=self._use_fake_user_agent)
+
+                # Try to get Content-Type from server
+                content_type = f.response_headers.get('content-type')
+                if content_type and not self.MIME.search(content_type):
+                    if ref.link.lower().endswith('.pdf') and \
+                            not self.opt.ignorepdf:
+                        # If file has a PDF suffix
+                        self.getPDFTitle(ref, f)
+                    else:
+                        pywikibot.output(color_format(
+                            '{lightyellow}WARNING{default} : '
+                            'media : {0} ', ref.link))
+                    if ref.title:
+                        if not re.match(
+                                '(?i) *microsoft (word|excel|visio)',
+                                ref.title):
+                            ref.transform(ispdf=True)
+                            repl = ref.refTitle()
+                        else:
+                            pywikibot.output(color_format(
+                                '{lightyellow}WARNING{default} : '
+                                'PDF title blacklisted : {0} ', ref.title))
+                            repl = ref.refLink()
+                    else:
+                        repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    return
+
+                # Get the real url where we end (http redirects !)
+                redir = f.data.url
+                if redir != ref.link and \
+                        domain.findall(redir) == domain.findall(link):
+                    if soft404.search(redir) and \
+                            not soft404.search(ref.link):
+                        pywikibot.output(color_format(
+                            '{lightyellow}WARNING{default} : '
+                            'Redirect 404 : {0} ', ref.link))
                         return
+                    if dirIndex.match(redir) and \
+                            not dirIndex.match(ref.link):
+                        pywikibot.output(color_format(
+                            '{lightyellow}WARNING{default} : '
+                            'Redirect to root : {0} ', ref.link))
+                        return
+
+                if f.status_code != codes.ok:
+                    pywikibot.stdout('HTTP error ({}) for {} on {}'
+                                     .format(f.status_code, ref.url,
+                                             page.title(as_link=True)))
+                    # 410 Gone, indicates that the resource has been
+                    # purposely removed
+                    if f.status_code == 410 \
+                            or (f.status_code == 404
+                                and '\t{}\t'.format(
+                                    ref.url) in self.dead_links):
+                        repl = ref.refDead()
+                        new_text = new_text.replace(match.group(), repl)
+                    return
+
+                linkedpagetext = f.raw
+            except UnicodeError:
+                # example:
+                # http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html
+                # in [[fr:Cyanure]]
+                pywikibot.output(color_format(
+                    '{lightred}Bad link{default} : {0} in {1}',
+                    ref.url, page.title(as_link=True)))
+                return
+            except (URLError,
+                    socket.error,
+                    IOError,
+                    httplib.error,
+                    pywikibot.FatalServerError,
+                    pywikibot.Server414Error,
+                    pywikibot.Server504Error) as e:
+                pywikibot.output("Can't retrieve page {} : {}"
+                                 .format(ref.url, e))
+                return
+
+            # remove <script>/<style>/comments/CDATA tags
+            linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
+
+            meta_content = self.META_CONTENT.search(linkedpagetext)
+            enc = []
+            s = None
+            if content_type:
+                # use charset from http header
+                s = self.CHARSET.search(content_type)
+            if meta_content:
+                tag = meta_content.group().decode()
+                # Prefer the contentType from the HTTP header :
+                if not content_type:
+                    content_type = tag
+                if not s:
+                    # use charset from html
+                    s = self.CHARSET.search(tag)
+            if s:
+                tmp = s.group('enc').strip('"\' ').lower()
+                naked = re.sub(r'[ _-]', '', tmp)
+                # Convert to python correct encoding names
+                if naked == 'gb2312':
+                    enc.append('gbk')
+                elif naked == 'shiftjis':
+                    enc.append('shift jis 2004')
+                    enc.append('cp932')
+                elif naked == 'xeucjp':
+                    enc.append('euc-jp')
+                else:
+                    enc.append(tmp)
+            else:
+                pywikibot.output('No charset found for ' + ref.link)
+
+            if not content_type:
+                pywikibot.output('No content-type found for ' + ref.link)
+                return
+
+            if not self.MIME.search(content_type):
+                pywikibot.output(color_format(
+                    '{lightyellow}WARNING{default} : media : {0} ',
+                    ref.link))
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                return
+
+            # Ugly hacks to try to survive when both server and page
+            # return no encoding.
+            # Uses most used encodings for each national suffix
+            if '.ru' in ref.link or '.su' in ref.link:
+                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
+                # encoding, no page encoding
+                enc = enc + ['koi8-r', 'windows-1251']
+            elif '.jp' in ref.link:
+                enc.append('shift jis 2004')
+                enc.append('cp932')
+            elif '.kr' in ref.link:
+                enc.append('euc-kr')
+                enc.append('cp949')
+            elif '.zh' in ref.link:
+                enc.append('gbk')
+
+            if 'utf-8' not in enc:
+                enc.append('utf-8')
+            try:
+                u = linkedpagetext.decode(enc[0])   # Bug T69410
+            except (UnicodeDecodeError, LookupError) as e:
+                pywikibot.output('{} : Decoding error - {}'
+                                 .format(ref.link, e))
+                return
+
+            # Retrieves the first non empty string inside <title> tags
+            for m in self.TITLE.finditer(u):
+                t = m.group()
+                if t:
+                    ref.title = t
+                    ref.transform()
+                    if ref.title:
+                        break
+
+            if not ref.title:
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                pywikibot.output('{} : No title found...'.format(ref.link))
+                return
+
+            # XXX Ugly hack
+            if 'é' in ref.title:
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                pywikibot.output('{} : Hybrid encoding...'
+                                 .format(ref.link))
+                return
+
+            if self.titleBlackList.match(ref.title):
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                pywikibot.output(color_format(
+                    '{lightred}WARNING{default} {0} : '
+                    'Blacklisted title ({1})', ref.link, ref.title))
+                return
+
+            # Truncate long titles. 175 is arbitrary
+            if len(ref.title) > 175:
+                ref.title = ref.title[:175] + '...'
+
+            repl = ref.refTitle()
+            new_text = new_text.replace(match.group(), repl)
+
+        # Add <references/> when needed, but ignore templates !
+        if page.namespace != 10:
+            if self.norefbot.lacksReferences(new_text):
+                new_text = self.norefbot.addReferences(new_text)
+
+        new_text = self.deduplicator.process(new_text)
+        old_text = page.text
+
+        self.userPut(page, old_text, new_text, summary=self.msg,
+                     ignore_save_related_errors=True,
+                     ignore_server_errors=True)
+
+        if not self._save_counter:
+            return
+
+        if self.opt.limit and self._save_counter >= self.opt.limit:
+            pywikibot.output('Edited {} pages, stopping.'
+                             .format(self.opt.limit))
+            self.generator.close()
+
+        if self.site_stop_page and self._save_counter % 20 == 0:
+            self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
+            if self.stop_page.exists():
+                pywikibot.output(color_format(
+                    '{lightgreen}Checking stop page...{default}'))
+                actual_rev = self.stop_page.latest_revision_id
+                if actual_rev != self.stop_page_rev_id:
+                    pywikibot.output(
+                        '{} has been edited: Someone wants us to stop.'
+                        .format(self.stop_page.title(as_link=True)))
+                    self.generator.close()
def main(*args):
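For context on the commit message above: moving the per-page work into treat() lets the bot stop by closing the generator that the inherited run() loop iterates over, rather than raising an exception from inside a run() override. A minimal, self-contained sketch of that pattern follows (illustration only; DemoBot and pages() are made-up names, not pywikibot's actual classes):

    def pages():
        """Stand-in for a pywikibot page generator."""
        for title in ('Page A', 'Page B', 'Page C', 'Page D'):
            yield title


    class DemoBot:

        """Toy bot mimicking the treat()/generator.close() pattern."""

        def __init__(self):
            self.generator = pages()
            self._save_counter = 0

        def run(self):
            # Framework loop: call treat() for every generated page.
            # Once treat() closes the generator, the next iteration ends
            # the loop quietly; nothing propagates to the caller.
            for page in self.generator:
                self.treat(page)

        def treat(self, page):
            print('treating', page)
            self._save_counter += 1
            if self._save_counter >= 2:    # e.g. -limit reached or stop page edited
                self.generator.close()     # quit without raising an exception


    DemoBot().run()   # treats 'Page A' and 'Page B', then stops cleanly

The change above applies the same idea: the -limit and stop-page branches call self.generator.close() instead of returning from run().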