jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/636460 )
Change subject: [IMPR] Rename ReferencesRobot.run to treat
......................................................................
[IMPR] Rename ReferencesRobot.run to treat
This enables quitting the script without raising an exception.
Bug: T196851
Bug: T171713
Change-Id: I613507e22967b248a9c1c5a136ecc64d46e48bb6
---
M scripts/reflinks.py
1 file changed, 247 insertions(+), 252 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 7b9c621..881db61 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -504,263 +504,258 @@
                 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n'
                 'and to unzip it in the same directory')
-    def run(self):
-        """Run the Bot."""
-        self.setup()
-        editedpages = 0
-        for page in self.generator:
-            try:
-                # Load the page's text from the wiki
-                new_text = page.get()
-                if not page.has_permission():
-                    pywikibot.output("You can't edit page "
-                                     + page.title(as_link=True))
-                    continue
-            except pywikibot.NoPage:
-                pywikibot.output('Page {} not found'
-                                 .format(page.title(as_link=True)))
-                continue
-            except pywikibot.IsRedirectPage:
-                pywikibot.output('Page {} is a redirect'
-                                 .format(page.title(as_link=True)))
-                continue
+    def treat(self, page):
+        """Process one page."""
+        try:
+            # Load the page's text from the wiki
+            new_text = page.get()
+            if not page.has_permission():
+                pywikibot.output("You can't edit page "
+                                 + page.title(as_link=True))
+                return
+        except pywikibot.NoPage:
+            pywikibot.output('Page {} not found'
+                             .format(page.title(as_link=True)))
+            return
+        except pywikibot.IsRedirectPage:
+            pywikibot.output('Page {} is a redirect'
+                             .format(page.title(as_link=True)))
+            return
-            # for each link to change
-            for match in linksInRef.finditer(
-                    textlib.removeDisabledParts(page.get())):
+        # for each link to change
+        for match in linksInRef.finditer(
+                textlib.removeDisabledParts(page.get())):
-                link = match.group('url')
-                # debugging purpose
-                # print link
-                if 'jstor.org' in link:
-                    # TODO: Clean URL blacklist
-                    continue
-
-                ref = RefLink(link, match.group('name'), site=self.site)
-
-                try:
-                    f = comms.http.fetch(
-                        ref.url, use_fake_user_agent=self._use_fake_user_agent)
-
-                    # Try to get Content-Type from server
-                    content_type = f.response_headers.get('content-type')
-                    if content_type and not self.MIME.search(content_type):
-                        if ref.link.lower().endswith('.pdf') and \
-                                not self.opt.ignorepdf:
-                            # If file has a PDF suffix
-                            self.getPDFTitle(ref, f)
-                        else:
-                            pywikibot.output(color_format(
-                                '{lightyellow}WARNING{default} : '
-                                'media : {0} ', ref.link))
-                        if ref.title:
-                            if not re.match(
-                                    '(?i) *microsoft (word|excel|visio)',
-                                    ref.title):
-                                ref.transform(ispdf=True)
-                                repl = ref.refTitle()
-                            else:
-                                pywikibot.output(color_format(
-                                    '{lightyellow}WARNING{default} : '
-                                    'PDF title blacklisted : {0} ', ref.title))
-                                repl = ref.refLink()
-                        else:
-                            repl = ref.refLink()
-                        new_text = new_text.replace(match.group(), repl)
-                        continue
-
-                    # Get the real url where we end (http redirects !)
-                    redir = f.data.url
-                    if redir != ref.link and \
-                            domain.findall(redir) == domain.findall(link):
-                        if soft404.search(redir) and \
-                                not soft404.search(ref.link):
-                            pywikibot.output(color_format(
-                                '{lightyellow}WARNING{default} : '
-                                'Redirect 404 : {0} ', ref.link))
-                            continue
-                        if dirIndex.match(redir) and \
-                                not dirIndex.match(ref.link):
-                            pywikibot.output(color_format(
-                                '{lightyellow}WARNING{default} : '
-                                'Redirect to root : {0} ', ref.link))
-                            continue
-
-                    if f.status_code != codes.ok:
-                        pywikibot.stdout('HTTP error ({}) for {} on {}'
-                                         .format(f.status_code, ref.url,
-                                                 page.title(as_link=True)))
-                        # 410 Gone, indicates that the resource has been
-                        # purposely removed
-                        if f.status_code == 410 \
-                                or (f.status_code == 404
-                                    and '\t{}\t'.format(
-                                        ref.url) in self.dead_links):
-                            repl = ref.refDead()
-                            new_text = new_text.replace(match.group(), repl)
-                        continue
-
-                    linkedpagetext = f.raw
-                except UnicodeError:
-                    # example:
-                    # http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html
-                    # in [[fr:Cyanure]]
-                    pywikibot.output(color_format(
-                        '{lightred}Bad link{default} : {0} in {1}',
-                        ref.url, page.title(as_link=True)))
-                    continue
-                except (URLError,
-                        socket.error,
-                        IOError,
-                        httplib.error,
-                        pywikibot.FatalServerError,
-                        pywikibot.Server414Error,
-                        pywikibot.Server504Error) as e:
-                    pywikibot.output("Can't retrieve page {} : {}"
-                                     .format(ref.url, e))
-                    continue
-
-                # remove <script>/<style>/comments/CDATA tags
-                linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
-
-                meta_content = self.META_CONTENT.search(linkedpagetext)
-                enc = []
-                s = None
-                if content_type:
-                    # use charset from http header
-                    s = self.CHARSET.search(content_type)
-                if meta_content:
-                    tag = meta_content.group().decode()
-                    # Prefer the contentType from the HTTP header :
-                    if not content_type:
-                        content_type = tag
-                    if not s:
-                        # use charset from html
-                        s = self.CHARSET.search(tag)
-                if s:
-                    tmp = s.group('enc').strip('"\' ').lower()
-                    naked = re.sub(r'[ _-]', '', tmp)
-                    # Convert to python correct encoding names
-                    if naked == 'gb2312':
-                        enc.append('gbk')
-                    elif naked == 'shiftjis':
-                        enc.append('shift jis 2004')
-                        enc.append('cp932')
-                    elif naked == 'xeucjp':
-                        enc.append('euc-jp')
-                    else:
-                        enc.append(tmp)
-                else:
-                    pywikibot.output('No charset found for ' + ref.link)
-
-                if not content_type:
-                    pywikibot.output('No content-type found for ' + ref.link)
-                    continue
-
-                if not self.MIME.search(content_type):
-                    pywikibot.output(color_format(
-                        '{lightyellow}WARNING{default} : media : {0} ',
-                        ref.link))
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    continue
-
-                # Ugly hacks to try to survive when both server and page
-                # return no encoding.
-                # Uses most used encodings for each national suffix
-                if '.ru' in ref.link or '.su' in ref.link:
-                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
-                    # encoding, no page encoding
-                    enc = enc + ['koi8-r', 'windows-1251']
-                elif '.jp' in ref.link:
-                    enc.append('shift jis 2004')
-                    enc.append('cp932')
-                elif '.kr' in ref.link:
-                    enc.append('euc-kr')
-                    enc.append('cp949')
-                elif '.zh' in ref.link:
-                    enc.append('gbk')
-
-                if 'utf-8' not in enc:
-                    enc.append('utf-8')
-                try:
-                    u = linkedpagetext.decode(enc[0])   # Bug T69410
-                except (UnicodeDecodeError, LookupError) as e:
-                    pywikibot.output('{} : Decoding error - {}'
-                                     .format(ref.link, e))
-                    continue
-
-                # Retrieves the first non empty string inside <title> tags
-                for m in self.TITLE.finditer(u):
-                    t = m.group()
-                    if t:
-                        ref.title = t
-                        ref.transform()
-                        if ref.title:
-                            break
-
-                if not ref.title:
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    pywikibot.output('{} : No title found...'.format(ref.link))
-                    continue
-
-                # XXX Ugly hack
-                if 'é' in ref.title:
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    pywikibot.output('{} : Hybrid encoding...'
-                                     .format(ref.link))
-                    continue
-
-                if self.titleBlackList.match(ref.title):
-                    repl = ref.refLink()
-                    new_text = new_text.replace(match.group(), repl)
-                    pywikibot.output(color_format(
-                        '{lightred}WARNING{default} {0} : '
-                        'Blacklisted title ({1})', ref.link, ref.title))
-                    continue
-
-                # Truncate long titles. 175 is arbitrary
-                if len(ref.title) > 175:
-                    ref.title = ref.title[:175] + '...'
-
-                repl = ref.refTitle()
-                new_text = new_text.replace(match.group(), repl)
-
-            # Add <references/> when needed, but ignore templates !
-            if page.namespace != 10:
-                if self.norefbot.lacksReferences(new_text):
-                    new_text = self.norefbot.addReferences(new_text)
-
-            new_text = self.deduplicator.process(new_text)
-            old_text = page.text
-
-            self.userPut(page, old_text, new_text, summary=self.msg,
-                         ignore_save_related_errors=True,
-                         ignore_server_errors=True)
-
-            if new_text == old_text:
-                continue
-            else:
-                editedpages += 1
-
-            if self.opt.limit and editedpages >= self.opt.limit:
-                pywikibot.output('Edited {} pages, stopping.'
-                                 .format(self.opt.limit))
+            link = match.group('url')
+            # debugging purpose
+            # print link
+            if 'jstor.org' in link:
+                # TODO: Clean URL blacklist
                 return
-            if self.site_stop_page and editedpages % 20 == 0:
-                self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
-                if self.stop_page.exists():
-                    pywikibot.output(color_format(
-                        '{lightgreen}Checking stop page...{default}'))
-                    actual_rev = self.stop_page.latest_revision_id
-                    if actual_rev != self.stop_page_rev_id:
-                        pywikibot.output(
-                            '{} has been edited: Someone wants us to stop.'
-                            .format(self.stop_page.title(as_link=True)))
+            ref = RefLink(link, match.group('name'), site=self.site)
+
+            try:
+                f = comms.http.fetch(
+                    ref.url, use_fake_user_agent=self._use_fake_user_agent)
+
+                # Try to get Content-Type from server
+                content_type = f.response_headers.get('content-type')
+                if content_type and not self.MIME.search(content_type):
+                    if ref.link.lower().endswith('.pdf') and \
+                            not self.opt.ignorepdf:
+                        # If file has a PDF suffix
+                        self.getPDFTitle(ref, f)
+                    else:
+                        pywikibot.output(color_format(
+                            '{lightyellow}WARNING{default} : '
+                            'media : {0} ', ref.link))
+                    if ref.title:
+                        if not re.match(
+                                '(?i) *microsoft (word|excel|visio)',
+                                ref.title):
+                            ref.transform(ispdf=True)
+                            repl = ref.refTitle()
+                        else:
+                            pywikibot.output(color_format(
+                                '{lightyellow}WARNING{default} : '
+                                'PDF title blacklisted : {0} ', ref.title))
+                            repl = ref.refLink()
+                    else:
+                        repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    return
+
+                # Get the real url where we end (http redirects !)
+                redir = f.data.url
+                if redir != ref.link and \
+                        domain.findall(redir) == domain.findall(link):
+                    if soft404.search(redir) and \
+                            not soft404.search(ref.link):
+                        pywikibot.output(color_format(
+                            '{lightyellow}WARNING{default} : '
+                            'Redirect 404 : {0} ', ref.link))
                         return
+                    if dirIndex.match(redir) and \
+                            not dirIndex.match(ref.link):
+                        pywikibot.output(color_format(
+                            '{lightyellow}WARNING{default} : '
+                            'Redirect to root : {0} ', ref.link))
+                        return
+
+                if f.status_code != codes.ok:
+                    pywikibot.stdout('HTTP error ({}) for {} on {}'
+                                     .format(f.status_code, ref.url,
+                                             page.title(as_link=True)))
+                    # 410 Gone, indicates that the resource has been
+                    # purposely removed
+                    if f.status_code == 410 \
+                            or (f.status_code == 404
+                                and '\t{}\t'.format(
+                                    ref.url) in self.dead_links):
+                        repl = ref.refDead()
+                        new_text = new_text.replace(match.group(), repl)
+                    return
+
+                linkedpagetext = f.raw
+            except UnicodeError:
+                # example:
+                # http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html
+                # in [[fr:Cyanure]]
+                pywikibot.output(color_format(
+                    '{lightred}Bad link{default} : {0} in {1}',
+                    ref.url, page.title(as_link=True)))
+                return
+            except (URLError,
+                    socket.error,
+                    IOError,
+                    httplib.error,
+                    pywikibot.FatalServerError,
+                    pywikibot.Server414Error,
+                    pywikibot.Server504Error) as e:
+                pywikibot.output("Can't retrieve page {} : {}"
+                                 .format(ref.url, e))
+                return
+
+            # remove <script>/<style>/comments/CDATA tags
+            linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
+
+            meta_content = self.META_CONTENT.search(linkedpagetext)
+            enc = []
+            s = None
+            if content_type:
+                # use charset from http header
+                s = self.CHARSET.search(content_type)
+            if meta_content:
+                tag = meta_content.group().decode()
+                # Prefer the contentType from the HTTP header :
+                if not content_type:
+                    content_type = tag
+                if not s:
+                    # use charset from html
+                    s = self.CHARSET.search(tag)
+            if s:
+                tmp = s.group('enc').strip('"\' ').lower()
+                naked = re.sub(r'[ _-]', '', tmp)
+                # Convert to python correct encoding names
+                if naked == 'gb2312':
+                    enc.append('gbk')
+                elif naked == 'shiftjis':
+                    enc.append('shift jis 2004')
+                    enc.append('cp932')
+                elif naked == 'xeucjp':
+                    enc.append('euc-jp')
+                else:
+                    enc.append(tmp)
+            else:
+                pywikibot.output('No charset found for ' + ref.link)
+
+            if not content_type:
+                pywikibot.output('No content-type found for ' + ref.link)
+                return
+
+            if not self.MIME.search(content_type):
+                pywikibot.output(color_format(
+                    '{lightyellow}WARNING{default} : media : {0} ',
+                    ref.link))
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                return
+
+            # Ugly hacks to try to survive when both server and page
+            # return no encoding.
+            # Uses most used encodings for each national suffix
+            if '.ru' in ref.link or '.su' in ref.link:
+                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
+                # encoding, no page encoding
+                enc = enc + ['koi8-r', 'windows-1251']
+            elif '.jp' in ref.link:
+                enc.append('shift jis 2004')
+                enc.append('cp932')
+            elif '.kr' in ref.link:
+                enc.append('euc-kr')
+                enc.append('cp949')
+            elif '.zh' in ref.link:
+                enc.append('gbk')
+
+            if 'utf-8' not in enc:
+                enc.append('utf-8')
+            try:
+                u = linkedpagetext.decode(enc[0])   # Bug T69410
+            except (UnicodeDecodeError, LookupError) as e:
+                pywikibot.output('{} : Decoding error - {}'
+                                 .format(ref.link, e))
+                return
+
+            # Retrieves the first non empty string inside <title> tags
+            for m in self.TITLE.finditer(u):
+                t = m.group()
+                if t:
+                    ref.title = t
+                    ref.transform()
+                    if ref.title:
+                        break
+
+            if not ref.title:
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                pywikibot.output('{} : No title found...'.format(ref.link))
+                return
+
+            # XXX Ugly hack
+            if 'é' in ref.title:
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                pywikibot.output('{} : Hybrid encoding...'
+                                 .format(ref.link))
+                return
+
+            if self.titleBlackList.match(ref.title):
+                repl = ref.refLink()
+                new_text = new_text.replace(match.group(), repl)
+                pywikibot.output(color_format(
+                    '{lightred}WARNING{default} {0} : '
+                    'Blacklisted title ({1})', ref.link, ref.title))
+                return
+
+            # Truncate long titles. 175 is arbitrary
+            if len(ref.title) > 175:
+                ref.title = ref.title[:175] + '...'
+
+            repl = ref.refTitle()
+            new_text = new_text.replace(match.group(), repl)
+
+        # Add <references/> when needed, but ignore templates !
+        if page.namespace != 10:
+            if self.norefbot.lacksReferences(new_text):
+                new_text = self.norefbot.addReferences(new_text)
+
+        new_text = self.deduplicator.process(new_text)
+        old_text = page.text
+
+        self.userPut(page, old_text, new_text, summary=self.msg,
+                     ignore_save_related_errors=True,
+                     ignore_server_errors=True)
+
+        if not self._save_counter:
+            return
+
+        if self.opt.limit and self._save_counter >= self.opt.limit:
+            pywikibot.output('Edited {} pages, stopping.'
+                             .format(self.opt.limit))
+            self.generator.close()
+
+        if self.site_stop_page and self._save_counter % 20 == 0:
+            self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
+            if self.stop_page.exists():
+                pywikibot.output(color_format(
+                    '{lightgreen}Checking stop page...{default}'))
+                actual_rev = self.stop_page.latest_revision_id
+                if actual_rev != self.stop_page_rev_id:
+                    pywikibot.output(
+                        '{} has been edited: Someone wants us to stop.'
+                        .format(self.stop_page.title(as_link=True)))
+                    self.generator.close()
def main(*args):
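For context on the commit message above: moving the per-page work into treat() lets the bot stop by closing the generator that the inherited run() loop iterates over, rather than raising an exception from inside a run() override. A minimal, self-contained sketch of that pattern follows (illustration only; DemoBot and pages() are made-up names, not pywikibot's actual classes):

    def pages():
        """Stand-in for a pywikibot page generator."""
        for title in ('Page A', 'Page B', 'Page C', 'Page D'):
            yield title


    class DemoBot:

        """Toy bot mimicking the treat()/generator.close() pattern."""

        def __init__(self):
            self.generator = pages()
            self._save_counter = 0

        def run(self):
            # Framework loop: call treat() for every generated page.
            # Once treat() closes the generator, the next iteration ends
            # the loop quietly; nothing propagates to the caller.
            for page in self.generator:
                self.treat(page)

        def treat(self, page):
            print('treating', page)
            self._save_counter += 1
            if self._save_counter >= 2:    # e.g. -limit reached or stop page edited
                self.generator.close()     # quit without raising an exception


    DemoBot().run()   # treats 'Page A' and 'Page B', then stops cleanly

The change above applies the same idea: the -limit and stop-page branches call self.generator.close() instead of returning from run().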