jenkins-bot submitted this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
[IMPR] Rename ReferencesRobot.run to treat

Moving the per-page work from run() into treat() lets the base bot class drive the page loop, so the script can quit without raising an exception.
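As a rough sketch of the pattern (illustrative only, not Pywikibot's actual BaseBot code; the SketchBot class and pages() generator below are made up for the example), the base class owns the page loop and calls treat() once per page, so a subclass can stop the run simply by closing the generator:

    # Minimal sketch of the run()/treat() split (assumption: the real
    # base class behaves similarly; this is not Pywikibot's BaseBot API).
    class SketchBot:

        def __init__(self, generator, limit=2):
            self.generator = generator
            self.limit = limit
            self._save_counter = 0

        def run(self):
            # The base class drives the loop; it ends normally once the
            # generator is exhausted or closed.
            for page in self.generator:
                self.treat(page)

        def treat(self, page):
            # Per-page work goes here.  Quitting is just closing the
            # generator; no exception has to propagate out of run().
            print('treating', page)
            self._save_counter += 1
            if self.limit and self._save_counter >= self.limit:
                print('Edited {} pages, stopping.'.format(self.limit))
                self.generator.close()

    def pages():
        """Stand-in page generator for the sketch."""
        yield from ('Page one', 'Page two', 'Page three')

    SketchBot(pages()).run()

Closing the generator makes the next iteration of the base loop raise StopIteration, so run() returns normally instead of unwinding through an exception.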

Bug: T196851
Bug: T171713
Change-Id: I613507e22967b248a9c1c5a136ecc64d46e48bb6
---
M scripts/reflinks.py
1 file changed, 247 insertions(+), 252 deletions(-)

diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 7b9c621..881db61 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -504,263 +504,258 @@
'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n'
'and to unzip it in the same directory')

- def run(self):
- """Run the Bot."""
- self.setup()
- editedpages = 0
- for page in self.generator:
- try:
- # Load the page's text from the wiki
- new_text = page.get()
- if not page.has_permission():
- pywikibot.output("You can't edit page "
- + page.title(as_link=True))
- continue
- except pywikibot.NoPage:
- pywikibot.output('Page {} not found'
- .format(page.title(as_link=True)))
- continue
- except pywikibot.IsRedirectPage:
- pywikibot.output('Page {} is a redirect'
- .format(page.title(as_link=True)))
- continue
+ def treat(self, page):
+ """Process one page."""
+ try:
+ # Load the page's text from the wiki
+ new_text = page.get()
+ if not page.has_permission():
+ pywikibot.output("You can't edit page "
+ + page.title(as_link=True))
+ return
+ except pywikibot.NoPage:
+ pywikibot.output('Page {} not found'
+ .format(page.title(as_link=True)))
+ return
+ except pywikibot.IsRedirectPage:
+ pywikibot.output('Page {} is a redirect'
+ .format(page.title(as_link=True)))
+ return

- # for each link to change
- for match in linksInRef.finditer(
- textlib.removeDisabledParts(page.get())):
+ # for each link to change
+ for match in linksInRef.finditer(
+ textlib.removeDisabledParts(page.get())):

- link = match.group('url')
- # debugging purpose
- # print link
- if 'jstor.org' in link:
- # TODO: Clean URL blacklist
- continue
-
- ref = RefLink(link, match.group('name'), site=self.site)
-
- try:
- f = comms.http.fetch(
- ref.url, use_fake_user_agent=self._use_fake_user_agent)
-
- # Try to get Content-Type from server
- content_type = f.response_headers.get('content-type')
- if content_type and not self.MIME.search(content_type):
- if ref.link.lower().endswith('.pdf') and \
- not self.opt.ignorepdf:
- # If file has a PDF suffix
- self.getPDFTitle(ref, f)
- else:
- pywikibot.output(color_format(
- '{lightyellow}WARNING{default} : '
- 'media : {0} ', ref.link))
- if ref.title:
- if not re.match(
- '(?i) *microsoft (word|excel|visio)',
- ref.title):
- ref.transform(ispdf=True)
- repl = ref.refTitle()
- else:
- pywikibot.output(color_format(
- '{lightyellow}WARNING{default} : '
- 'PDF title blacklisted : {0} ', ref.title))
- repl = ref.refLink()
- else:
- repl = ref.refLink()
- new_text = new_text.replace(match.group(), repl)
- continue
-
- # Get the real url where we end (http redirects !)
- redir = f.data.url
- if redir != ref.link and \
- domain.findall(redir) == domain.findall(link):
- if soft404.search(redir) and \
- not soft404.search(ref.link):
- pywikibot.output(color_format(
- '{lightyellow}WARNING{default} : '
- 'Redirect 404 : {0} ', ref.link))
- continue
- if dirIndex.match(redir) and \
- not dirIndex.match(ref.link):
- pywikibot.output(color_format(
- '{lightyellow}WARNING{default} : '
- 'Redirect to root : {0} ', ref.link))
- continue
-
- if f.status_code != codes.ok:
- pywikibot.stdout('HTTP error ({}) for {} on {}'
- .format(f.status_code, ref.url,
- page.title(as_link=True)))
- # 410 Gone, indicates that the resource has been
- # purposely removed
- if f.status_code == 410 \
- or (f.status_code == 404
- and '\t{}\t'.format(
- ref.url) in self.dead_links):
- repl = ref.refDead()
- new_text = new_text.replace(match.group(), repl)
- continue
-
- linkedpagetext = f.raw
- except UnicodeError:
- # example:
- # http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
- # in [[fr:Cyanure]]
- pywikibot.output(color_format(
- '{lightred}Bad link{default} : {0} in {1}',
- ref.url, page.title(as_link=True)))
- continue
- except (URLError,
- socket.error,
- IOError,
- httplib.error,
- pywikibot.FatalServerError,
- pywikibot.Server414Error,
- pywikibot.Server504Error) as e:
- pywikibot.output("Can't retrieve page {} : {}"
- .format(ref.url, e))
- continue
-
- # remove <script>/<style>/comments/CDATA tags
- linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
-
- meta_content = self.META_CONTENT.search(linkedpagetext)
- enc = []
- s = None
- if content_type:
- # use charset from http header
- s = self.CHARSET.search(content_type)
- if meta_content:
- tag = meta_content.group().decode()
- # Prefer the contentType from the HTTP header :
- if not content_type:
- content_type = tag
- if not s:
- # use charset from html
- s = self.CHARSET.search(tag)
- if s:
- tmp = s.group('enc').strip('"\' ').lower()
- naked = re.sub(r'[ _\-]', '', tmp)
- # Convert to python correct encoding names
- if naked == 'gb2312':
- enc.append('gbk')
- elif naked == 'shiftjis':
- enc.append('shift jis 2004')
- enc.append('cp932')
- elif naked == 'xeucjp':
- enc.append('euc-jp')
- else:
- enc.append(tmp)
- else:
- pywikibot.output('No charset found for ' + ref.link)
-
- if not content_type:
- pywikibot.output('No content-type found for ' + ref.link)
- continue
-
- if not self.MIME.search(content_type):
- pywikibot.output(color_format(
- '{lightyellow}WARNING{default} : media : {0} ',
- ref.link))
- repl = ref.refLink()
- new_text = new_text.replace(match.group(), repl)
- continue
-
- # Ugly hacks to try to survive when both server and page
- # return no encoding.
- # Uses most used encodings for each national suffix
- if '.ru' in ref.link or '.su' in ref.link:
- # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
- # encoding, no page encoding
- enc = enc + ['koi8-r', 'windows-1251']
- elif '.jp' in ref.link:
- enc.append('shift jis 2004')
- enc.append('cp932')
- elif '.kr' in ref.link:
- enc.append('euc-kr')
- enc.append('cp949')
- elif '.zh' in ref.link:
- enc.append('gbk')
-
- if 'utf-8' not in enc:
- enc.append('utf-8')
- try:
- u = linkedpagetext.decode(enc[0]) # Bug T69410
- except (UnicodeDecodeError, LookupError) as e:
- pywikibot.output('{} : Decoding error - {}'
- .format(ref.link, e))
- continue
-
- # Retrieves the first non empty string inside <title> tags
- for m in self.TITLE.finditer(u):
- t = m.group()
- if t:
- ref.title = t
- ref.transform()
- if ref.title:
- break
-
- if not ref.title:
- repl = ref.refLink()
- new_text = new_text.replace(match.group(), repl)
- pywikibot.output('{} : No title found...'.format(ref.link))
- continue
-
- # XXX Ugly hack
- if 'Ã©' in ref.title:
- repl = ref.refLink()
- new_text = new_text.replace(match.group(), repl)
- pywikibot.output('{} : Hybrid encoding...'
- .format(ref.link))
- continue
-
- if self.titleBlackList.match(ref.title):
- repl = ref.refLink()
- new_text = new_text.replace(match.group(), repl)
- pywikibot.output(color_format(
- '{lightred}WARNING{default} {0} : '
- 'Blacklisted title ({1})', ref.link, ref.title))
- continue
-
- # Truncate long titles. 175 is arbitrary
- if len(ref.title) > 175:
- ref.title = ref.title[:175] + '...'
-
- repl = ref.refTitle()
- new_text = new_text.replace(match.group(), repl)
-
- # Add <references/> when needed, but ignore templates !
- if page.namespace != 10:
- if self.norefbot.lacksReferences(new_text):
- new_text = self.norefbot.addReferences(new_text)
-
- new_text = self.deduplicator.process(new_text)
- old_text = page.text
-
- self.userPut(page, old_text, new_text, summary=self.msg,
- ignore_save_related_errors=True,
- ignore_server_errors=True)
-
- if new_text == old_text:
- continue
- else:
- editedpages += 1
-
- if self.opt.limit and editedpages >= self.opt.limit:
- pywikibot.output('Edited {} pages, stopping.'
- .format(self.opt.limit))
+ link = match.group('url')
+ # debugging purpose
+ # print link
+ if 'jstor.org' in link:
+ # TODO: Clean URL blacklist
return

- if self.site_stop_page and editedpages % 20 == 0:
- self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
- if self.stop_page.exists():
- pywikibot.output(color_format(
- '{lightgreen}Checking stop page...{default}'))
- actual_rev = self.stop_page.latest_revision_id
- if actual_rev != self.stop_page_rev_id:
- pywikibot.output(
- '{} has been edited: Someone wants us to stop.'
- .format(self.stop_page.title(as_link=True)))
+ ref = RefLink(link, match.group('name'), site=self.site)
+
+ try:
+ f = comms.http.fetch(
+ ref.url, use_fake_user_agent=self._use_fake_user_agent)
+
+ # Try to get Content-Type from server
+ content_type = f.response_headers.get('content-type')
+ if content_type and not self.MIME.search(content_type):
+ if ref.link.lower().endswith('.pdf') and \
+ not self.opt.ignorepdf:
+ # If file has a PDF suffix
+ self.getPDFTitle(ref, f)
+ else:
+ pywikibot.output(color_format(
+ '{lightyellow}WARNING{default} : '
+ 'media : {0} ', ref.link))
+ if ref.title:
+ if not re.match(
+ '(?i) *microsoft (word|excel|visio)',
+ ref.title):
+ ref.transform(ispdf=True)
+ repl = ref.refTitle()
+ else:
+ pywikibot.output(color_format(
+ '{lightyellow}WARNING{default} : '
+ 'PDF title blacklisted : {0} ', ref.title))
+ repl = ref.refLink()
+ else:
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ return
+
+ # Get the real url where we end (http redirects !)
+ redir = f.data.url
+ if redir != ref.link and \
+ domain.findall(redir) == domain.findall(link):
+ if soft404.search(redir) and \
+ not soft404.search(ref.link):
+ pywikibot.output(color_format(
+ '{lightyellow}WARNING{default} : '
+ 'Redirect 404 : {0} ', ref.link))
return
+ if dirIndex.match(redir) and \
+ not dirIndex.match(ref.link):
+ pywikibot.output(color_format(
+ '{lightyellow}WARNING{default} : '
+ 'Redirect to root : {0} ', ref.link))
+ return
+
+ if f.status_code != codes.ok:
+ pywikibot.stdout('HTTP error ({}) for {} on {}'
+ .format(f.status_code, ref.url,
+ page.title(as_link=True)))
+ # 410 Gone, indicates that the resource has been
+ # purposely removed
+ if f.status_code == 410 \
+ or (f.status_code == 404
+ and '\t{}\t'.format(
+ ref.url) in self.dead_links):
+ repl = ref.refDead()
+ new_text = new_text.replace(match.group(), repl)
+ return
+
+ linkedpagetext = f.raw
+ except UnicodeError:
+ # example:
+ # http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
+ # in [[fr:Cyanure]]
+ pywikibot.output(color_format(
+ '{lightred}Bad link{default} : {0} in {1}',
+ ref.url, page.title(as_link=True)))
+ return
+ except (URLError,
+ socket.error,
+ IOError,
+ httplib.error,
+ pywikibot.FatalServerError,
+ pywikibot.Server414Error,
+ pywikibot.Server504Error) as e:
+ pywikibot.output("Can't retrieve page {} : {}"
+ .format(ref.url, e))
+ return
+
+ # remove <script>/<style>/comments/CDATA tags
+ linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
+
+ meta_content = self.META_CONTENT.search(linkedpagetext)
+ enc = []
+ s = None
+ if content_type:
+ # use charset from http header
+ s = self.CHARSET.search(content_type)
+ if meta_content:
+ tag = meta_content.group().decode()
+ # Prefer the contentType from the HTTP header :
+ if not content_type:
+ content_type = tag
+ if not s:
+ # use charset from html
+ s = self.CHARSET.search(tag)
+ if s:
+ tmp = s.group('enc').strip('"\' ').lower()
+ naked = re.sub(r'[ _\-]', '', tmp)
+ # Convert to python correct encoding names
+ if naked == 'gb2312':
+ enc.append('gbk')
+ elif naked == 'shiftjis':
+ enc.append('shift jis 2004')
+ enc.append('cp932')
+ elif naked == 'xeucjp':
+ enc.append('euc-jp')
+ else:
+ enc.append(tmp)
+ else:
+ pywikibot.output('No charset found for ' + ref.link)
+
+ if not content_type:
+ pywikibot.output('No content-type found for ' + ref.link)
+ return
+
+ if not self.MIME.search(content_type):
+ pywikibot.output(color_format(
+ '{lightyellow}WARNING{default} : media : {0} ',
+ ref.link))
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ return
+
+ # Ugly hacks to try to survive when both server and page
+ # return no encoding.
+ # Uses most used encodings for each national suffix
+ if '.ru' in ref.link or '.su' in ref.link:
+ # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
+ # encoding, no page encoding
+ enc = enc + ['koi8-r', 'windows-1251']
+ elif '.jp' in ref.link:
+ enc.append('shift jis 2004')
+ enc.append('cp932')
+ elif '.kr' in ref.link:
+ enc.append('euc-kr')
+ enc.append('cp949')
+ elif '.zh' in ref.link:
+ enc.append('gbk')
+
+ if 'utf-8' not in enc:
+ enc.append('utf-8')
+ try:
+ u = linkedpagetext.decode(enc[0]) # Bug T69410
+ except (UnicodeDecodeError, LookupError) as e:
+ pywikibot.output('{} : Decoding error - {}'
+ .format(ref.link, e))
+ return
+
+ # Retrieves the first non empty string inside <title> tags
+ for m in self.TITLE.finditer(u):
+ t = m.group()
+ if t:
+ ref.title = t
+ ref.transform()
+ if ref.title:
+ break
+
+ if not ref.title:
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ pywikibot.output('{} : No title found...'.format(ref.link))
+ return
+
+ # XXX Ugly hack
+ if 'Ã©' in ref.title:
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ pywikibot.output('{} : Hybrid encoding...'
+ .format(ref.link))
+ return
+
+ if self.titleBlackList.match(ref.title):
+ repl = ref.refLink()
+ new_text = new_text.replace(match.group(), repl)
+ pywikibot.output(color_format(
+ '{lightred}WARNING{default} {0} : '
+ 'Blacklisted title ({1})', ref.link, ref.title))
+ return
+
+ # Truncate long titles. 175 is arbitrary
+ if len(ref.title) > 175:
+ ref.title = ref.title[:175] + '...'
+
+ repl = ref.refTitle()
+ new_text = new_text.replace(match.group(), repl)
+
+ # Add <references/> when needed, but ignore templates !
+ if page.namespace != 10:
+ if self.norefbot.lacksReferences(new_text):
+ new_text = self.norefbot.addReferences(new_text)
+
+ new_text = self.deduplicator.process(new_text)
+ old_text = page.text
+
+ self.userPut(page, old_text, new_text, summary=self.msg,
+ ignore_save_related_errors=True,
+ ignore_server_errors=True)
+
+ if not self._save_counter:
+ return
+
+ if self.opt.limit and self._save_counter >= self.opt.limit:
+ pywikibot.output('Edited {} pages, stopping.'
+ .format(self.opt.limit))
+ self.generator.close()
+
+ if self.site_stop_page and self._save_counter % 20 == 0:
+ self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
+ if self.stop_page.exists():
+ pywikibot.output(color_format(
+ '{lightgreen}Checking stop page...{default}'))
+ actual_rev = self.stop_page.latest_revision_id
+ if actual_rev != self.stop_page_rev_id:
+ pywikibot.output(
+ '{} has been edited: Someone wants us to stop.'
+ .format(self.stop_page.title(as_link=True)))
+ self.generator.close()


def main(*args):

To view, visit change 636460. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I613507e22967b248a9c1c5a136ecc64d46e48bb6
Gerrit-Change-Number: 636460
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Rubin <rubin@wikimedia.ru>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged