Strainu has submitted this change and it was merged.
Change subject: Port spamremove.py to core
......................................................................
Port spamremove.py to core
Change-Id: I6c60bc70d5771080168f7df62192151ed16e83a3
---
A scripts/spamremove.py
1 file changed, 130 insertions(+), 0 deletions(-)
Approvals:
Strainu: Verified; Looks good to me, approved
jenkins-bot: Checked
diff --git a/scripts/spamremove.py b/scripts/spamremove.py
new file mode 100755
index 0000000..8ee5397
--- /dev/null
+++ b/scripts/spamremove.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Script to remove links that are being or have been spammed.
+Usage:
+
+spamremove.py www.spammedsite.com
+
+It will use Special:Linksearch to find the pages on the wiki that link to
+that site, then for each page make a proposed change consisting of removing
+all the lines where that url occurs. You can choose to:
+* accept the changes as proposed
+* edit the page yourself to remove the offending link
+* not change the page in question
+
+Command line options:
+-automatic: Do not ask, but remove the lines automatically. Be very careful
+ in using this option!
+
+-namespace: Filters the search to a given namespace. If this is specified
+            multiple times, it will search all given namespaces.
+
+"""
+
+#
+# (C) Pywikipedia bot team, 2007-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+#
+
+import pywikibot
+from pywikibot import config
+from pywikibot import pagegenerators
+import editarticle
+import sys
+
+
+def main():
+ automatic = False
+ namespaces = []
+ msg = {
+ 'ar': u'إزالة الوصلات إلى موقع سبام %s',
+ 'de': u'Entferne in Spam-Blacklist eingetragenen Weblink auf %s',
+ 'en': u'Removing links to spamming site %s',
+ 'es': u'Removiendo enlaces a sitio publicitario %s',
+ 'fa': u'حذف پیوند به وبگاه هرزنگاری %s',
+ 'he': u'מסיר קישורים לאתר ספאם %s',
+ 'fr': u'Suppression du lien blacklisté %s',
+ 'it': u'Rimuovo link contenuto nella Spam-Blacklist %s',
+ 'ja': u'ロボットによる: 迷惑リンク削除 %s',
+ 'nl': u'Links naar gespamde site: %s verwijderd',
+ 'pt': u'Removendo links de spam do site %s',
+ 'ta': u'எரிதமாக இணைக்கப்பட்ட %s இணையத்தளம் நீக்கப்பட்டது',
+ 'vi': u'xóa các liên kết đến website spam %s',
+ 'zh': u'機器人: 移除廣告黑名單連結 %s',
+ }
+ spamSite = ''
+ for arg in pywikibot.handleArgs():
+ if arg.startswith("-automatic"):
+ automatic = True
+ elif arg.startswith('-namespace:'):
+ try:
+ namespaces.append(int(arg[len('-namespace:'):]))
+ except ValueError:
+ namespaces.append(arg[len('-namespace:'):])
+ else:
+ spamSite = arg
+ if not automatic:
+ config.put_throttle = 1
+ if not spamSite:
+ pywikibot.showHelp('spamremove')
+ pywikibot.output(u"No spam site specified.")
+ sys.exit()
+ mysite = pywikibot.getSite()
+ pages = list(set(mysite.exturlusage(spamSite)))
+ if namespaces:
+ pages = list(set(pagegenerators.NamespaceFilterPageGenerator(pages,
+ namespaces)))
+ if len(pages) == 0:
+ pywikibot.output('No page found.')
+ else:
+ pywikibot.output('%d pages found.' % len(pages))
+ for p in pages:
+ text = p.get()
+ if not spamSite in text:
+ continue
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % p.title())
+ lines = text.split('\n')
+ newpage = []
+ lastok = ""
+ for line in lines:
+ if spamSite in line:
+ if lastok:
+ pywikibot.output(lastok)
+ pywikibot.output('\03{lightred}%s\03{default}' % line)
+ lastok = None
+ else:
+ newpage.append(line)
+ if line.strip():
+ if lastok is None:
+ pywikibot.output(line)
+ lastok = line
+ if automatic:
+ answer = "y"
+ else:
+ answer = pywikibot.inputChoice(u'\nDelete the red lines?',
+ ['yes', 'no', 'edit'],
+ ['y', 'N', 'e'], 'n')
+ if answer == "n":
+ continue
+ elif answer == "e":
+ editor = editarticle.TextEditor()
+ newtext = editor.edit(text, highlight=spamSite,
+ jumpIndex=text.find(spamSite))
+ else:
+ newtext = "\n".join(newpage)
+ if newtext != text:
+ p.put(newtext, pywikibot.translate(mysite, msg) % spamSite)
+
+try:
+ main()
+finally:
+ pywikibot.stopme()
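
For orientation, a minimal sketch of the flow the script above builds on:
site.exturlusage() enumerates the pages linking to the spammed site, and every
line mentioning it is dropped. It assumes the pywikibot-core API used in this
port (getSite, Page.get/put); the site name and summary are placeholders only,
and the real script adds namespace filtering, an interactive prompt and an
editor option:

    import pywikibot

    def strip_spam_lines(site, spam_site, summary):
        # Pages carrying an external link to the spammed site
        # (the API equivalent of Special:Linksearch).
        for page in set(site.exturlusage(spam_site)):
            text = page.get()
            # Keep every line that does not mention the spammed site.
            new_text = '\n'.join(line for line in text.split('\n')
                                 if spam_site not in line)
            if new_text != text:
                page.put(new_text, summary % spam_site)

    strip_spam_lines(pywikibot.getSite(), 'www.spammedsite.com',
                     u'Removing links to spamming site %s')
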
--
To view, visit https://gerrit.wikimedia.org/r/102555
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I6c60bc70d5771080168f7df62192151ed16e83a3
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Gerrit Patch Uploader <gerritpatchuploader(a)gmail.com>
Gerrit-Reviewer: Gerrit Patch Uploader <gerritpatchuploader(a)gmail.com>
Gerrit-Reviewer: Guoguo12 <Guoguo12(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Strainu <wiki(a)strainu.ro>
Gerrit-Reviewer: jenkins-bot
Strainu has submitted this change and it was merged.
Change subject: Port makecat.py to core
......................................................................
Port makecat.py to core
Change-Id: Icb7c690bf3e8625ad4a10e0f3e9f7b523f54c059
---
A scripts/makecat.py
1 file changed, 304 insertions(+), 0 deletions(-)
Approvals:
Strainu: Verified; Looks good to me, approved
diff --git a/scripts/makecat.py b/scripts/makecat.py
new file mode 100644
index 0000000..e269620
--- /dev/null
+++ b/scripts/makecat.py
@@ -0,0 +1,304 @@
+# -*- coding: UTF-8 -*-
+"""
+This bot takes as its argument (or, if no argument is given, asks for it) the
+name of a new or existing category. It will then try to find new articles for
+this category (pages linked to and from pages already in the category), asking
+the user which pages to include and which not.
+
+Arguments:
+ -nodates automatically skip all pages that are years or dates (years
+ only work AD, dates only for certain languages)
+ -forward only check pages linked from pages already in the category,
+ not pages linking to them. Is less precise but quite a bit
+ faster.
+ -exist only ask about pages that do actually exist; drop any
+ titles of non-existing pages silently. If -forward is chosen,
+ -exist is automatically implied.
+ -keepparent do not remove parent categories of the category to be
+ worked on.
+ -all work on all pages (default: only main namespace)
+
+When running the bot, you will be shown a number of pages one by one. You can
+choose:
+Y(es) - include the page
+N(o) - do not include the page or
+I(gnore) - do not include the page, but if you meet it again, ask again.
+X - add the page, but do not check links to and from it
+Other possibilities:
+A(dd) - add another page, which may have been one that was included before
+C(heck) - check links to and from the page, but do not add the page itself
+R(emove) - remove a page that is already in the list
+L(ist) - show current list of pages to include or to check
+"""
+
+# (C) Andre Engels, 2004
+# (C) Pywikipedia bot team 2005-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+
+import codecs
+import pywikibot
+from pywikibot import date, catlib, pagegenerators, i18n
+
+
+def rawtoclean(c):
+ #Given the 'raw' category, provides the 'clean' category
+ c2 = c.title().split('|')[0]
+ return pywikibot.Page(mysite, c2)
+
+
+def isdate(s):
+ """returns true iff s is a date or year
+ """
+ dict, val = date.getAutoFormat(pywikibot.getSite().language(), s)
+ return dict is not None
+
+
+def needcheck(pl):
+ if main:
+ if pl.namespace() != 0:
+ return False
+ if pl in checked:
+ return False
+ if skipdates:
+ if isdate(pl.title()):
+ return False
+ return True
+
+
+def include(pl, checklinks=True, realinclude=True, linkterm=None):
+ cl = checklinks
+ if linkterm:
+ actualworkingcat = catlib.Category(mysite, workingcat.title(),
+ sortKey=linkterm)
+ else:
+ actualworkingcat = workingcat
+ if realinclude:
+ try:
+ text = pl.get()
+ except pywikibot.NoPage:
+ pass
+ except pywikibot.IsRedirectPage:
+ cl = True
+ pass
+ else:
+ cats = [x for x in pl.categories()]
+ if not workingcat in cats:
+ cats = [x for x in pl.categories()]
+ for c in cats:
+ if c in parentcats:
+ if removeparent:
+ catlib.change_category(pl, c, actualworkingcat)
+ break
+ else:
+ pl.put(pywikibot.replaceCategoryLinks(
+ text, cats + [actualworkingcat]))
+ if cl:
+ if checkforward:
+ for page2 in pl.linkedPages():
+ if needcheck(page2):
+ tocheck.append(page2)
+ checked[page2] = page2
+ if checkbackward:
+ for refPage in pl.getReferences():
+ if needcheck(refPage):
+ tocheck.append(refPage)
+ checked[refPage] = refPage
+
+
+def exclude(pl, real_exclude=True):
+ if real_exclude:
+ excludefile.write('%s\n' % pl.title())
+
+
+def asktoadd(pl):
+ if pl.site != mysite:
+ return
+ if pl.isRedirectPage():
+ pl2 = pl.getRedirectTarget()
+ if needcheck(pl2):
+ tocheck.append(pl2)
+ checked[pl2] = pl2
+ return
+ ctoshow = 500
+ pywikibot.output(u'')
+ pywikibot.output(u"==%s==" % pl.title())
+ while 1:
+ answer = raw_input("y(es)/n(o)/i(gnore)/(o)ther options? ")
+ if answer == 'y':
+ include(pl)
+ break
+ if answer == 'c':
+ include(pl, realinclude=False)
+ break
+ if answer == 'z':
+ if pl.exists():
+ if not pl.isRedirectPage():
+ linkterm = pywikibot.input(
+ u"In what manner should it be alphabetized?")
+ include(pl, linkterm=linkterm)
+ break
+ include(pl)
+ break
+ elif answer == 'n':
+ exclude(pl)
+ break
+ elif answer == 'i':
+ exclude(pl, real_exclude=False)
+ break
+ elif answer == 'o':
+ pywikibot.output(u"t: Give the beginning of the text of the page")
+ pywikibot.output(
+ u"z: Add under another title (as [[Category|Title]])")
+ pywikibot.output(
+ u"x: Add the page, but do not check links to and from it")
+ pywikibot.output(u"c: Do not add the page, but do check links")
+ pywikibot.output(u"a: Add another page")
+ pywikibot.output(u"l: Give a list of the pages to check")
+ elif answer == 'a':
+ pagetitle = raw_input("Specify page to add:")
+ page = pywikibot.Page(pywikibot.getSite(), pagetitle)
+ if not page in checked.keys():
+ include(page)
+ elif answer == 'x':
+ if pl.exists():
+ if pl.isRedirectPage():
+ pywikibot.output(
+ u"Redirect page. Will be included normally.")
+ include(pl, realinclude=False)
+ else:
+ include(pl, checklinks=False)
+ else:
+ pywikibot.output(u"Page does not exist; not added.")
+ exclude(pl, real_exclude=False)
+ break
+ elif answer == 'l':
+ pywikibot.output(u"Number of pages still to check: %s"
+ % len(tocheck))
+ pywikibot.output(u"Pages to be checked:")
+ pywikibot.output(u" - ".join(page.title() for page in tocheck))
+ pywikibot.output(u"==%s==" % pl.title())
+ elif answer == 't':
+ pywikibot.output(u"==%s==" % pl.title())
+ try:
+ pywikibot.output(u'' + pl.get(get_redirect=True)[0:ctoshow])
+ except pywikibot.NoPage:
+ pywikibot.output(u"Page does not exist.")
+ ctoshow += 500
+ else:
+ pywikibot.output(u"Not understood.")
+
+try:
+ checked = {}
+ skipdates = False
+ checkforward = True
+ checkbackward = True
+ checkbroken = True
+ removeparent = True
+ main = True
+ workingcatname = []
+ tocheck = []
+ for arg in pywikibot.handleArgs():
+ if arg.startswith('-nodate'):
+ skipdates = True
+ elif arg.startswith('-forward'):
+ checkbackward = False
+ checkbroken = False
+ elif arg.startswith('-exist'):
+ checkbroken = False
+ elif arg.startswith('-keepparent'):
+ removeparent = False
+ elif arg.startswith('-all'):
+ main = False
+ else:
+ workingcatname.append(arg)
+
+ if len(workingcatname) == 0:
+ workingcatname = raw_input("Which page to start with? ")
+ else:
+ workingcatname = ' '.join(workingcatname)
+ mysite = pywikibot.getSite()
+ workingcatname = unicode(workingcatname, 'utf-8')
+ pywikibot.setAction(i18n.twtranslate(mysite, 'makecat-create') + u' ' + workingcatname)
+ workingcat = catlib.Category(mysite,
+ u'%s:%s'
+ % (mysite.category_namespace(),
+ workingcatname))
+ filename = pywikibot.config.datafilepath('category',
+ workingcatname.encode('ascii', 'xmlcharrefreplace') + '_exclude.txt')
+ try:
+ f = codecs.open(filename, 'r', encoding=mysite.encoding())
+ for line in f.readlines():
+ # remove trailing newlines and carriage returns
+ try:
+ while line[-1] in ['\n', '\r']:
+ line = line[:-1]
+ except IndexError:
+ pass
+ exclude(line, real_exclude=False)
+ pl = pywikibot.Page(mysite, line)
+ checked[pl] = pl
+ f.close()
+ excludefile = codecs.open(filename, 'a', encoding=mysite.encoding())
+ except IOError:
+ # File does not exist
+ excludefile = codecs.open(filename, 'w', encoding=mysite.encoding())
+ try:
+ parentcats = workingcat.categories()
+ except pywikibot.Error:
+ parentcats = []
+ # Do not include articles already in subcats; only checking direct subcats
+ subcatlist = list(workingcat.subcategories())
+ if subcatlist:
+ subcatlist = pagegenerators.PreloadingGenerator(subcatlist)
+ for cat in subcatlist:
+ artlist = list(cat.articles())
+ for page in artlist:
+ exclude(page.title(), real_exclude=False)
+ checked[page] = page
+ list = [x for x in workingcat.articles()]
+ if list:
+ for pl in list:
+ checked[pl] = pl
+ list = pagegenerators.PreloadingGenerator(list)
+ for pl in list:
+ include(pl)
+ else:
+ pywikibot.output(
+ u"Category %s does not exist or is empty. Which page to start with?"
+ % workingcatname)
+ answer = pywikibot.input(u"(Default is [[%s]]):" % workingcatname)
+ if not answer:
+ answer = workingcatname
+ pywikibot.output(u'' + answer)
+ pl = pywikibot.Page(mysite, answer)
+ tocheck = []
+ checked[pl] = pl
+ include(pl)
+ loaded = 0
+ while tocheck:
+ if loaded == 0:
+ if len(tocheck) < 50:
+ loaded = len(tocheck)
+ else:
+ loaded = 50
+ tocheck = [x for x in pagegenerators.PreloadingGenerator(tocheck[:loaded])]
+ if not checkbroken:
+ if not tocheck[0].exists():
+ pass
+ else:
+ asktoadd(tocheck[0])
+ else:
+ asktoadd(tocheck[0])
+ tocheck = tocheck[1:]
+ loaded -= 1
+
+finally:
+ pywikibot.stopme()
+ try:
+ excludefile.close()
+ except:
+ pass
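
A compressed sketch of the crawl loop the script above implements: articles
already in the working category are taken as accepted seeds, and every page
linked from an accepted page is offered to the user. It assumes the
compat-style API this port still uses (catlib.Category, Page.linkedPages,
pywikibot.inputChoice); the category name is a made-up example, and the real
script additionally checks backlinks, skips dates, keeps an exclusion file and
edits accepted pages to add the category:

    import pywikibot
    from pywikibot import catlib

    site = pywikibot.getSite()
    workingcat = catlib.Category(site, u'Category:Example topic')  # hypothetical
    checked = {}
    tocheck = []

    # Seed the queue with pages linked from current category members.
    for seed in workingcat.articles():
        checked[seed] = seed
        tocheck.extend(seed.linkedPages())

    while tocheck:
        page = tocheck.pop(0)
        if page in checked or page.namespace() != 0:
            continue
        checked[page] = page
        answer = pywikibot.inputChoice(u'Include [[%s]]?' % page.title(),
                                       ['yes', 'no'], ['y', 'n'], 'n')
        if answer == 'y':
            # An accepted page contributes its outgoing links to the queue.
            tocheck.extend(page.linkedPages())
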
--
To view, visit https://gerrit.wikimedia.org/r/102837
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icb7c690bf3e8625ad4a10e0f3e9f7b523f54c059
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: M4tx <m4tx(a)m4tx.pl>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: M4tx <m4tx(a)m4tx.pl>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Strainu <wiki(a)strainu.ro>
Gerrit-Reviewer: jenkins-bot
Strainu has submitted this change and it was merged.
Change subject: Port weblinkchecker.py from pywikibot/compat
......................................................................
Port weblinkchecker.py from pywikibot/compat
Change-Id: I6df31a11e31e570128a6a16397395fc9b95729bc
---
A scripts/weblinkchecker.py
1 file changed, 870 insertions(+), 0 deletions(-)
Approvals:
Strainu: Verified; Looks good to me, approved
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
new file mode 100644
index 0000000..fe138c7
--- /dev/null
+++ b/scripts/weblinkchecker.py
@@ -0,0 +1,870 @@
+# -*- coding: utf-8 -*-
+"""
+This bot is used for checking external links found at the wiki. It checks
+several pages at once, with a limit set by the config variable
+max_external_links, which defaults to 50.
+
+The bot won't change any wiki pages; it will only report dead links so that
+people can fix or remove the links themselves.
+
+The bot will store all links found dead in a .dat file in the deadlinks
+subdirectory. To avoid the removal of links which are only temporarily
+unavailable, the bot ONLY reports links which were reported dead at least
+two times, with a time lag of at least one week. Such links will be logged to a
+.txt file in the deadlinks subdirectory.
+
+After running the bot and waiting for at least one week, you can re-check those
+pages where dead links were found, using the -repeat parameter.
+
+In addition to the logging step, it is possible to automatically report dead
+links to the talk page of the article where the link was found. To use this
+feature, set report_dead_links_on_talk = True in your user-config.py, or
+specify "-talk" on the command line. Adding "-notalk" switches this off
+irrespective of the configuration variable.
+
+When a link is found alive, it will be removed from the .dat file.
+
+These command line parameters can be used to specify which pages to work on:
+
+&params;
+
+-repeat      Work on all pages where dead links were found before. This is
+ useful to confirm that the links are dead after some time (at
+ least one week), which is required before the script will report
+ the problem.
+
+-namespace   Only process pages in the namespace with the given number or
+ name. This parameter may be used multiple times.
+
+-ignore      HTTP return codes to ignore. Can be provided several times:
+ -ignore:401 -ignore:500
+
+Furthermore, the following command line parameters are supported:
+
+-talk Overrides the report_dead_links_on_talk config variable, enabling
+ the feature.
+
+-notalk Overrides the report_dead_links_on_talk config variable, disabling
+ the feature.
+-day         Only report links first found dead more than x days ago; these
+             should probably be fixed or removed. If not set, default is 7 days.
+
+All other parameters will be regarded as part of the title of a single page,
+and the bot will only work on that single page.
+
+The following config variables are supported:
+
+max_external_links - The maximum number of web pages that should be
+ loaded simultaneously. You should change this
+ according to your Internet connection speed.
+ Be careful: if it is set too high, the script
+ might get socket errors because your network
+ is congested, and will then think that the page
+ is offline.
+
+report_dead_links_on_talk - If set to true, causes the script to report dead
+ links on the article's talk page if (and ONLY if)
+ the linked page has been unavailable at least two
+ times during a timespan of at least one week.
+
+Syntax examples:
+ python weblinkchecker.py -start:!
+ Loads all wiki pages in alphabetical order using the Special:Allpages
+ feature.
+
+ python weblinkchecker.py -start:Example_page
+ Loads all wiki pages using the Special:Allpages feature, starting at
+ "Example page"
+
+ python weblinkchecker.py -weblink:www.example.org
+ Loads all wiki pages that link to www.example.org
+
+ python weblinkchecker.py Example page
+ Only checks links found in the wiki page "Example page"
+
+ python weblinkchecker.py -repeat
+ Loads all wiki pages where dead links were found during a prior run
+"""
+
+#
+# (C) Daniel Herding, 2005
+# (C) Pywikibot team, 2005-2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import sys
+import re
+import codecs
+import pickle
+import httplib
+import socket
+import urlparse
+import urllib
+import urllib2
+import threading
+import time
+import gzip
+import StringIO
+
+import pywikibot
+from pywikibot import i18n
+from pywikibot import config
+from pywikibot import pagegenerators
+
+docuReplacements = {
+    '&params;': pagegenerators.parameterHelp
+}
+
+ignorelist = [
+    # Officially reserved for testing, documentation, etc. in
+ # http://tools.ietf.org/html/rfc2606#page-2
+ # top-level domains:
+    re.compile('.*[\./@]test(/.*)?'),
+    re.compile('.*[\./@]example(/.*)?'),
+    re.compile('.*[\./@]invalid(/.*)?'),
+    re.compile('.*[\./@]localhost(/.*)?'),
+    # second-level domains:
+    re.compile('.*[\./@]example\.com(/.*)?'),
+    re.compile('.*[\./@]example\.net(/.*)?'),
+    re.compile('.*[\./@]example\.org(/.*)?'),
+
+ # Other special cases
+    re.compile('.*[\./@]gso\.gbv\.de(/.*)?'), # bot somehow can't handle their redirects
+    re.compile('.*[\./@]berlinonline\.de(/.*)?'), # a de: user wants to fix them by hand and doesn't want them to be deleted, see [[de:Benutzer:BLueFiSH.as/BZ]].
+    re.compile('.*[\./@]bodo\.kommune\.no(/.*)?'), # bot can't handle their redirects
+    re.compile('.*[\./@]jpl\.nasa\.gov(/.*)?'), # bot rejected on the site
+    re.compile('.*[\./@]itis\.gov(/.*)?'), # bot rejected on the site
+    re.compile('.*[\./@]cev\.lu(/.*)?'), # bot rejected on the site
+    re.compile('.*[\./@]science\.ksc\.nasa\.gov(/.*)?'), # very slow response resulting in bot error
+    re.compile('.*[\./@]britannica\.com(/.*)?'), # HTTP redirect loop
+    re.compile('.*[\./@]quickfacts\.census\.gov(/.*)?'), # bot rejected on the site
+]
+
+
+def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
+ text = pywikibot.removeDisabledParts(text)
+
+ # MediaWiki parses templates before parsing external links. Thus, there
+ # might be a | or a } directly after a URL which does not belong to
+ # the URL itself.
+
+ # First, remove the curly braces of inner templates:
+ nestedTemplateR = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
+ while nestedTemplateR.search(text):
+ text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)
+
+ # Then blow up the templates with spaces so that the | and }} will not
+ # be regarded as part of the link:.
+ templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
+ re.DOTALL)
+ while templateWithParamsR.search(text):
+ text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
+
+ # Add <blank> at the end of a template
+ # URL as last param of multiline template would not be correct
+ text = text.replace('}}', ' }}')
+
+ # Remove HTML comments in URLs as well as URLs in HTML comments.
+ # Also remove text inside nowiki links etc.
+ text = pywikibot.removeDisabledParts(text)
+ linkR = pywikibot.compileLinkR(withoutBracketed, onlyBracketed)
+ for m in linkR.finditer(text):
+ if m.group('url'):
+ yield m.group('url')
+ else:
+ yield m.group('urlb')
+
+
+class InternetArchiveConsulter:
+ def __init__(self, url):
+ self.url = url
+
+ def getArchiveURL(self):
+ pywikibot.output(u'Consulting the Internet Archive for %s' % self.url)
+ archiveURL = 'http://web.archive.org/web/*/%s' % self.url
+ try:
+ f = urllib2.urlopen(archiveURL)
+ except urllib2.HTTPError:
+ # The Internet Archive yields a 403 error when the site was not
+ # archived due to robots.txt restrictions.
+ return
+ except UnicodeEncodeError:
+ return
+ data = f.read()
+ if f.headers.get('content-encoding', None) == 'gzip':
+ # Since 2008, the Internet Archive returns pages in GZIPed
+            # compression format. Unfortunately urllib2 doesn't handle
+ # the decompression for us, so we have to do it ourselves.
+ data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+ if "Search Results for " in data:
+ return archiveURL
+
+
+class LinkChecker(object):
+ """
+    Given an HTTP URL, tries to load the page from the Internet and checks if it
+ is still online.
+
+ Returns a (boolean, string) tuple saying if the page is online and including
+ a status reason.
+
+ Warning: Also returns false if your Internet connection isn't working
+ correctly! (This will give a Socket Error)
+
+ """
+ def __init__(self, url, redirectChain=[], serverEncoding=None,
+ HTTPignore=[]):
+ """
+ redirectChain is a list of redirects which were resolved by
+ resolveRedirect(). This is needed to detect redirect loops.
+ """
+ self.url = url
+ self.serverEncoding = serverEncoding
+ self.header = {
+ # 'User-agent': pywikibot.useragent,
+ # we fake being Firefox because some webservers block unknown
+ # clients, e.g. http://images.google.de/images?q=Albit gives a 403
+ # when using the PyWikipediaBot user agent.
+ 'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5',
+ 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
+ 'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
+ 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
+ 'Keep-Alive': '30',
+ 'Connection': 'keep-alive',
+ }
+ self.redirectChain = redirectChain + [url]
+ self.changeUrl(url)
+ self.HTTPignore = HTTPignore
+
+ def getConnection(self):
+ if self.scheme == 'http':
+ return httplib.HTTPConnection(self.host)
+ elif self.scheme == 'https':
+ return httplib.HTTPSConnection(self.host)
+
+ def getEncodingUsedByServer(self):
+ if not self.serverEncoding:
+ try:
+ pywikibot.output(
+ u'Contacting server %s to find out its default encoding...'
+ % self.host)
+ conn = self.getConnection()
+ conn.request('HEAD', '/', None, self.header)
+ self.response = conn.getresponse()
+
+                self.readEncodingFromResponse(self.response)
+ except:
+ pass
+ if not self.serverEncoding:
+ # TODO: We might also load a page, then check for an encoding
+ # definition in a HTML meta tag.
+ pywikibot.output(u'Error retrieving server\'s default charset. '
+ u'Using ISO 8859-1.')
+ # most browsers use ISO 8859-1 (Latin-1) as the default.
+ self.serverEncoding = 'iso8859-1'
+ return self.serverEncoding
+
+ def readEncodingFromResponse(self, response):
+ if not self.serverEncoding:
+ try:
+ ct = response.getheader('Content-Type')
+ charsetR = re.compile('charset=(.+)')
+ charset = charsetR.search(ct).group(1)
+ self.serverEncoding = charset
+ except:
+ pass
+
+ def changeUrl(self, url):
+ self.url = url
+ # we ignore the fragment
+ (self.scheme, self.host, self.path, self.query,
+ self.fragment) = urlparse.urlsplit(self.url)
+ if not self.path:
+ self.path = '/'
+ if self.query:
+ self.query = '?' + self.query
+ self.protocol = url.split(':', 1)[0]
+ # check if there are non-ASCII characters inside path or query, and if
+ # so, encode them in an encoding that hopefully is the right one.
+ try:
+ self.path.encode('ascii')
+ self.query.encode('ascii')
+ except UnicodeEncodeError:
+ encoding = self.getEncodingUsedByServer()
+ self.path = unicode(urllib.quote(self.path.encode(encoding)))
+ self.query = unicode(urllib.quote(self.query.encode(encoding), '=&'))
+
+ def resolveRedirect(self, useHEAD=False):
+ """
+ Requests the header from the server. If the page is an HTTP redirect,
+ returns the redirect target URL as a string. Otherwise returns None.
+
+ If useHEAD is true, uses the HTTP HEAD method, which saves bandwidth
+ by not downloading the body. Otherwise, the HTTP GET method is used.
+
+ """
+ conn = self.getConnection()
+ try:
+ if useHEAD:
+ conn.request('HEAD', '%s%s' % (self.path, self.query), None,
+ self.header)
+ else:
+ conn.request('GET', '%s%s' % (self.path, self.query), None,
+ self.header)
+ self.response = conn.getresponse()
+ # read the server's encoding, in case we need it later
+ self.readEncodingFromResponse(self.response)
+ except httplib.BadStatusLine:
+ # Some servers don't seem to handle HEAD requests properly,
+ # e.g. http://www.radiorus.ru/ which is running on a very old
+ # Apache server. Using GET instead works on these (but it uses
+ # more bandwidth).
+ if useHEAD:
+ return self.resolveRedirect(useHEAD=False)
+ else:
+ raise
+ if self.response.status >= 300 and self.response.status <= 399:
+ #print response.getheaders()
+ redirTarget = self.response.getheader('Location')
+ if redirTarget:
+ try:
+ redirTarget.encode('ascii')
+ except UnicodeError:
+ redirTarget = redirTarget.decode(
+ self.getEncodingUsedByServer())
+ if redirTarget.startswith('http://') or \
+ redirTarget.startswith('https://'):
+ self.changeUrl(redirTarget)
+ return True
+ elif redirTarget.startswith('/'):
+ self.changeUrl(u'%s://%s%s'
+ % (self.protocol, self.host, redirTarget))
+ return True
+ else: # redirect to relative position
+ # cut off filename
+ directory = self.path[:self.path.rindex('/') + 1]
+ # handle redirect to parent directory
+ while redirTarget.startswith('../'):
+ redirTarget = redirTarget[3:]
+ # some servers redirect to .. although we are already
+ # in the root directory; ignore this.
+ if directory != '/':
+ # change /foo/bar/ to /foo/
+ directory = directory[:-1]
+ directory = directory[:directory.rindex('/') + 1]
+ self.changeUrl('%s://%s%s%s'
+ % (self.protocol, self.host, directory,
+ redirTarget))
+ return True
+ else:
+ return False # not a redirect
+
+ def check(self, useHEAD=False):
+ """
+ Returns True and the server status message if the page is alive.
+        Otherwise returns False and an error message.
+ """
+ try:
+ wasRedirected = self.resolveRedirect(useHEAD=useHEAD)
+ except UnicodeError, error:
+ return False, u'Encoding Error: %s (%s)' % (
+ error.__class__.__name__, unicode(error))
+ except httplib.error, error:
+ return False, u'HTTP Error: %s' % error.__class__.__name__
+ except socket.error, error:
+ # http://docs.python.org/lib/module-socket.html :
+ # socket.error :
+ # The accompanying value is either a string telling what went
+ # wrong or a pair (errno, string) representing an error
+ # returned by a system call, similar to the value
+ # accompanying os.error
+ if isinstance(error, basestring):
+ msg = error
+ else:
+ try:
+ msg = error[1]
+ except IndexError:
+ print u'### DEBUG information for #2972249'
+ raise IndexError(type(error))
+ # TODO: decode msg. On Linux, it's encoded in UTF-8.
+ # How is it encoded in Windows? Or can we somehow just
+ # get the English message?
+ return False, u'Socket Error: %s' % repr(msg)
+ if wasRedirected:
+ if self.url in self.redirectChain:
+ if useHEAD:
+ # Some servers don't seem to handle HEAD requests properly,
+ # which leads to a cyclic list of redirects.
+ # We simply start from the beginning, but this time,
+ # we don't use HEAD, but GET requests.
+ redirChecker = LinkChecker(
+ self.redirectChain[0],
+ serverEncoding=self.serverEncoding,
+ HTTPignore=self.HTTPignore)
+ return redirChecker.check(useHEAD=False)
+ else:
+ urlList = ['[%s]' % url
+ for url in self.redirectChain + [self.url]]
+ return (False,
+ u'HTTP Redirect Loop: %s' % ' -> '.join(urlList))
+ elif len(self.redirectChain) >= 19:
+ if useHEAD:
+ # Some servers don't seem to handle HEAD requests properly,
+ # which leads to a long (or infinite) list of redirects.
+ # We simply start from the beginning, but this time,
+ # we don't use HEAD, but GET requests.
+ redirChecker = LinkChecker(
+ self.redirectChain[0],
+ serverEncoding=self.serverEncoding,
+ HTTPignore=self.HTTPignore)
+ return redirChecker.check(useHEAD=False)
+ else:
+ urlList = ['[%s]' % url
+ for url in self.redirectChain + [self.url]]
+ return (False,
+ u'Long Chain of Redirects: %s'
+ % ' -> '.join(urlList))
+ else:
+ redirChecker = LinkChecker(self.url, self.redirectChain,
+ self.serverEncoding,
+ HTTPignore=self.HTTPignore)
+ return redirChecker.check(useHEAD=useHEAD)
+ else:
+ try:
+ conn = self.getConnection()
+ except httplib.error, error:
+ return False, u'HTTP Error: %s' % error.__class__.__name__
+ try:
+ conn.request('GET', '%s%s'
+ % (self.path, self.query), None, self.header)
+ except socket.error, error:
+ return False, u'Socket Error: %s' % repr(error[1])
+ try:
+ self.response = conn.getresponse()
+ except Exception, error:
+ return False, u'Error: %s' % error
+ # read the server's encoding, in case we need it later
+ self.readEncodingFromResponse(self.response)
+ # site down if the server status is between 400 and 499
+ alive = self.response.status not in range(400, 500)
+ if self.response.status in self.HTTPignore:
+ alive = False
+ return alive, '%s %s' % (self.response.status, self.response.reason)
+
+
+class LinkCheckThread(threading.Thread):
+ """ A thread responsible for checking one URL. After checking the page, it
+ will die.
+
+ """
+ def __init__(self, page, url, history, HTTPignore):
+ threading.Thread.__init__(self)
+ self.page = page
+ self.url = url
+ self.history = history
+ # identification for debugging purposes
+ self.setName((u'%s - %s' % (page.title(), url)).encode('utf-8',
+ 'replace'))
+ self.HTTPignore = HTTPignore
+
+ def run(self):
+ linkChecker = LinkChecker(self.url, HTTPignore=self.HTTPignore)
+ try:
+ ok, message = linkChecker.check()
+ except:
+ pywikibot.output('Exception while processing URL %s in page %s'
+ % (self.url, self.page.title()))
+ raise
+ if ok:
+ if self.history.setLinkAlive(self.url):
+ pywikibot.output('*Link to %s in [[%s]] is back alive.'
+ % (self.url, self.page.title()))
+ else:
+ pywikibot.output('*[[%s]] links to %s - %s.'
+ % (self.page.title(), self.url, message))
+ self.history.setLinkDead(self.url, message, self.page, day)
+
+
+class History:
+ """ Stores previously found dead links. The URLs are dictionary keys, and
+ values are lists of tuples where each tuple represents one time the URL was
+ found dead. Tuples have the form (title, date, error) where title is the
+ wiki page where the URL was found, date is an instance of time, and error is
+ a string with error code and message.
+
+ We assume that the first element in the list represents the first time we
+ found this dead link, and the last element represents the last time.
+
+ Example:
+
+ dict = {
+ 'http://www.example.org/page': [
+ ('WikiPageTitle', DATE, '404: File not found'),
+ ('WikiPageName2', DATE, '404: File not found'),
+ ]
+    }
+ """
+
+ def __init__(self, reportThread):
+ self.reportThread = reportThread
+ site = pywikibot.getSite()
+ self.semaphore = threading.Semaphore()
+ self.datfilename = pywikibot.config.datafilepath(
+ 'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.code))
+ # Count the number of logged links, so that we can insert captions
+ # from time to time
+ self.logCount = 0
+ try:
+ datfile = open(self.datfilename, 'r')
+ self.historyDict = pickle.load(datfile)
+ datfile.close()
+ except (IOError, EOFError):
+ # no saved history exists yet, or history dump broken
+ self.historyDict = {}
+
+ def log(self, url, error, containingPage, archiveURL):
+ """
+ Logs an error report to a text file in the deadlinks subdirectory.
+ """
+ site = pywikibot.getSite()
+ if archiveURL:
+ errorReport = u'* %s ([%s archive])\n' % (url, archiveURL)
+ else:
+ errorReport = u'* %s\n' % url
+ for (pageTitle, date, error) in self.historyDict[url]:
+ # ISO 8601 formulation
+ isoDate = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(date))
+ errorReport += "** In [[%s]] on %s, %s\n" % (pageTitle, isoDate,
+ error)
+ pywikibot.output(u"** Logging link for deletion.")
+ txtfilename = pywikibot.config.datafilepath('deadlinks',
+ 'results-%s-%s.txt'
+ % (site.family.name,
+ site.lang))
+ txtfile = codecs.open(txtfilename, 'a', 'utf-8')
+ self.logCount += 1
+ if self.logCount % 30 == 0:
+ # insert a caption
+ txtfile.write('=== %s ===\n' % containingPage.title()[:3])
+ txtfile.write(errorReport)
+ txtfile.close()
+
+ if self.reportThread and not containingPage.isTalkPage():
+ self.reportThread.report(url, errorReport, containingPage,
+ archiveURL)
+
+ def setLinkDead(self, url, error, page, day):
+ """
+ Adds the fact that the link was found dead to the .dat file.
+ """
+ self.semaphore.acquire()
+ now = time.time()
+ if url in self.historyDict:
+ timeSinceFirstFound = now - self.historyDict[url][0][1]
+ timeSinceLastFound = now - self.historyDict[url][-1][1]
+ # if the last time we found this dead link is less than an hour
+ # ago, we won't save it in the history this time.
+ if timeSinceLastFound > 60 * 60:
+ self.historyDict[url].append((page.title(), now, error))
+ # if the first time we found this link longer than x day ago
+ # (default is a week), it should probably be fixed or removed.
+ # We'll list it in a file so that it can be removed manually.
+ if timeSinceFirstFound > 60 * 60 * 24 * day:
+ # search for archived page
+ iac = InternetArchiveConsulter(url)
+ archiveURL = iac.getArchiveURL()
+ self.log(url, error, page, archiveURL)
+ else:
+ self.historyDict[url] = [(page.title(), now, error)]
+ self.semaphore.release()
+
+ def setLinkAlive(self, url):
+ """
+ If the link was previously found dead, removes it from the .dat file
+ and returns True, else returns False.
+ """
+ if url in self.historyDict:
+ self.semaphore.acquire()
+ try:
+ del self.historyDict[url]
+ except KeyError:
+ # Not sure why this can happen, but I guess we can ignore this.
+ pass
+ self.semaphore.release()
+ return True
+ else:
+ return False
+
+ def save(self):
+ """
+ Saves the .dat file to disk.
+ """
+ datfile = open(self.datfilename, 'w')
+ pickle.dump(self.historyDict, datfile)
+ datfile.close()
+
+
+class DeadLinkReportThread(threading.Thread):
+ '''
+ A Thread that is responsible for posting error reports on talk pages. There
+ will only be one DeadLinkReportThread, and it is using a semaphore to make
+ sure that two LinkCheckerThreads can not access the queue at the same time.
+ '''
+ def __init__(self):
+ threading.Thread.__init__(self)
+ self.semaphore = threading.Semaphore()
+ self.queue = []
+ self.finishing = False
+ self.killed = False
+
+ def report(self, url, errorReport, containingPage, archiveURL):
+ """ Tries to add an error report to the talk page belonging to the page
+ containing the dead link.
+
+ """
+ self.semaphore.acquire()
+ self.queue.append((url, errorReport, containingPage, archiveURL))
+ self.semaphore.release()
+
+ def shutdown(self):
+ self.finishing = True
+
+ def kill(self):
+ # TODO: remove if unneeded
+ self.killed = True
+
+ def run(self):
+ while not self.killed:
+ if len(self.queue) == 0:
+ if self.finishing:
+ break
+ else:
+ time.sleep(0.1)
+ else:
+ self.semaphore.acquire()
+ (url, errorReport, containingPage, archiveURL) = self.queue[0]
+ self.queue = self.queue[1:]
+ talkPage = containingPage.toggleTalkPage()
+ pywikibot.output(
+ u'\03{lightaqua}** Reporting dead link on %s...\03{default}'
+ % talkPage.title(asLink=True))
+ try:
+ content = talkPage.get() + "\n\n"
+ if url in content:
+ pywikibot.output(
+ u'\03{lightaqua}** Dead link seems to have already '
+ u'been reported on %s\03{default}'
+ % talkPage.title(asLink=True))
+ self.semaphore.release()
+ continue
+ except (pywikibot.NoPage, pywikibot.IsRedirectPage):
+ content = u''
+
+ if archiveURL:
+ archiveMsg = u'\n' + \
+ i18n.twtranslate(pywikibot.getSite(),
+ 'weblinkchecker-archive_msg',
+ {'URL': archiveURL})
+ else:
+ archiveMsg = u''
+ # The caption will default to "Dead link". But if there is
+ # already such a caption, we'll use "Dead link 2",
+ # "Dead link 3", etc.
+ caption = i18n.twtranslate(pywikibot.getSite(),
+ 'weblinkchecker-caption')
+ i = 1
+ count = u''
+ # Check if there is already such a caption on the talk page.
+ while re.search('= *%s%s *=' % (caption, count),
+ content) is not None:
+ i += 1
+ count = u' ' + str(i)
+ caption += count
+ content += '\n\n== %s ==\n\n%s\n\n%s%s--~~~~' % \
+ (caption,
+ i18n.twtranslate(pywikibot.getSite(),
+ 'weblinkchecker-report'),
+ errorReport,
+ archiveMsg)
+ comment = u'[[%s#%s|→]] %s' % \
+ (talkPage.title(), caption,
+ i18n.twtranslate(pywikibot.getSite(),
+ 'weblinkchecker-summary'))
+ try:
+ talkPage.put(content, comment)
+ except pywikibot.SpamfilterError, error:
+ pywikibot.output(
+ u'\03{lightaqua}** SpamfilterError while trying to '
+ u'change %s: %s\03{default}'
+ % (talkPage.title(asLink=True), error.url))
+
+ self.semaphore.release()
+
+
+class WeblinkCheckerRobot:
+ """
+ Robot which will use several LinkCheckThreads at once to search for dead
+ weblinks on pages provided by the given generator.
+
+ """
+ def __init__(self, generator, HTTPignore=[]):
+ self.generator = generator
+ if config.report_dead_links_on_talk:
+ #pywikibot.output("Starting talk page thread")
+ reportThread = DeadLinkReportThread()
+ # thread dies when program terminates
+ # reportThread.setDaemon(True)
+ reportThread.start()
+ else:
+ reportThread = None
+ self.history = History(reportThread)
+ self.HTTPignore = HTTPignore
+
+ def run(self):
+ for page in self.generator:
+ self.checkLinksIn(page)
+
+ def checkLinksIn(self, page):
+ try:
+ text = page.get()
+ except pywikibot.NoPage:
+ pywikibot.output(u'%s does not exist.' % page.title())
+ return
+ for url in weblinksIn(text):
+ ignoreUrl = False
+ for ignoreR in ignorelist:
+ if ignoreR.match(url):
+ ignoreUrl = True
+ if not ignoreUrl:
+ # Limit the number of threads started at the same time. Each
+ # thread will check one page, then die.
+ while threading.activeCount() >= config.max_external_links:
+ # wait 100 ms
+ time.sleep(0.1)
+ thread = LinkCheckThread(page, url, self.history,
+ self.HTTPignore)
+ # thread dies when program terminates
+ thread.setDaemon(True)
+ thread.start()
+
+
+def RepeatPageGenerator():
+ history = History(None)
+ pageTitles = set()
+ for (key, value) in history.historyDict.iteritems():
+ for entry in value:
+ pageTitle = entry[0]
+ pageTitles.add(pageTitle)
+ pageTitles = list(pageTitles)
+ pageTitles.sort()
+ for pageTitle in pageTitles:
+ page = pywikibot.Page(pywikibot.getSite(), pageTitle)
+ yield page
+
+
+def countLinkCheckThreads():
+ i = 0
+ for thread in threading.enumerate():
+ if isinstance(thread, LinkCheckThread):
+ i += 1
+ return i
+
+
+def check(url):
+    """Perform a check on URL"""
+ c = LinkChecker(url)
+ return c.check()
+
+
+def main():
+ gen = None
+ singlePageTitle = []
+ # Which namespaces should be processed?
+ # default to [] which means all namespaces will be processed
+ namespaces = []
+ HTTPignore = []
+ # This factory is responsible for processing command line arguments
+    # that are also used by other scripts and that determine which pages
+ # to work on.
+ genFactory = pagegenerators.GeneratorFactory()
+ global day
+ day = 7
+ for arg in pywikibot.handleArgs():
+ if arg == '-talk':
+ config.report_dead_links_on_talk = True
+ elif arg == '-notalk':
+ config.report_dead_links_on_talk = False
+ elif arg.startswith('-namespace:'):
+ try:
+ namespaces.append(int(arg[11:]))
+ except ValueError:
+ namespaces.append(arg[11:])
+ elif arg == '-repeat':
+ gen = RepeatPageGenerator()
+ elif arg.startswith('-ignore:'):
+ HTTPignore.append(int(arg[8:]))
+ elif arg.startswith('-day:'):
+ day = int(arg[5:])
+ else:
+ if not genFactory.handleArg(arg):
+ singlePageTitle.append(arg)
+
+ if singlePageTitle:
+ singlePageTitle = ' '.join(singlePageTitle)
+ page = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
+ gen = iter([page])
+
+ if not gen:
+ gen = genFactory.getCombinedGenerator()
+ if gen:
+ if namespaces != []:
+ gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
+ # fetch at least 240 pages simultaneously from the wiki, but more if
+ # a high thread number is set.
+ pageNumber = max(240, config.max_external_links * 2)
+ gen = pagegenerators.PreloadingGenerator(gen, pageNumber=pageNumber)
+ gen = pagegenerators.RedirectFilterPageGenerator(gen)
+ bot = WeblinkCheckerRobot(gen, HTTPignore)
+ try:
+ bot.run()
+ finally:
+ waitTime = 0
+ # Don't wait longer than 30 seconds for threads to finish.
+ while countLinkCheckThreads() > 0 and waitTime < 30:
+ try:
+ pywikibot.output(u"Waiting for remaining %i threads to "
+ u"finish, please wait..."
+ % countLinkCheckThreads())
+ # wait 1 second
+ time.sleep(1)
+ waitTime += 1
+ except KeyboardInterrupt:
+ pywikibot.output(u'Interrupted.')
+ break
+ if countLinkCheckThreads() > 0:
+ pywikibot.output(u'Remaining %i threads will be killed.'
+ % countLinkCheckThreads())
+ # Threads will die automatically because they are daemonic.
+ if bot.history.reportThread:
+ bot.history.reportThread.shutdown()
+ # wait until the report thread is shut down; the user can
+ # interrupt it by pressing CTRL-C.
+ try:
+ while bot.history.reportThread.isAlive():
+ time.sleep(0.1)
+ except KeyboardInterrupt:
+ pywikibot.output(u'Report thread interrupted.')
+ bot.history.reportThread.kill()
+ pywikibot.output(u'Saving history...')
+ bot.history.save()
+ else:
+ pywikibot.showHelp()
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ pywikibot.stopme()
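
A minimal sketch of driving the checker above from another script: the
module-level check() helper wraps LinkChecker.check() and returns an
(alive, message) pair. The URL is an illustration only, and the import assumes
the scripts directory is on the Python path:

    import weblinkchecker

    alive, message = weblinkchecker.check('http://www.example.org/some/page')
    if alive:
        print 'Link is alive: %s' % message    # e.g. '200 OK'
    else:
        print 'Link seems dead: %s' % message  # e.g. '404 Not Found'
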
--
To view, visit https://gerrit.wikimedia.org/r/102692
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I6df31a11e31e570128a6a16397395fc9b95729bc
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Gerrit Patch Uploader <gerritpatchuploader(a)gmail.com>
Gerrit-Reviewer: Gerrit Patch Uploader <gerritpatchuploader(a)gmail.com>
Gerrit-Reviewer: Kyle <brownkylej(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Strainu <wiki(a)strainu.ro>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: [PEP8] changes
......................................................................
[PEP8] changes
Change-Id: I7e5b91177806e19100e8e140f4e8674397b9a522
---
M rciw.py
M rcsort.py
M replace.py
M replicate_wiki.py
M revertbot.py
5 files changed, 137 insertions(+), 122 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/rciw.py b/rciw.py
index 1bdfa9f..38d01ae 100644
--- a/rciw.py
+++ b/rciw.py
@@ -21,7 +21,7 @@
# http://hu.wikipedia.org/wiki/User:Kisbes
# License : GFDL
#
-# (C) Pywikipedia bot team, 2008, 2010
+# (C) Pywikibot team, 2008-2013
#
# Distributed under the terms of the MIT license.
#
@@ -35,8 +35,10 @@
import wikipedia as pywikibot
import interwiki
+
class IWRCBot():
- def __init__(self, site, safe = True):
+
+ def __init__(self, site, safe=True):
self.other_ns = re.compile(u'14\[\[07(' + u'|'.join(site.namespaces()) + u')')
interwiki.globalvar.autonomous = True
self.site = site
@@ -69,8 +71,11 @@
# it is a simple atomic append(), no need to acquire a semaphore
self.queue.put_nowait(page)
+
def main():
- pywikibot.warning('this script can not be run manually/directly, but automatically by maintainer.py')
+ pywikibot.warning('this script can not be run manually/directly, but '
+ 'automatically by maintainer.py')
+
if __name__ == "__main__":
main()
diff --git a/rcsort.py b/rcsort.py
index 70f6087..f338404 100644
--- a/rcsort.py
+++ b/rcsort.py
@@ -9,7 +9,7 @@
usable.
Permission has been asked to run this on the toolserver.
"""
-# (C) Pywikipedia bot team, 2007-2012
+# (C) Pywikibot team, 2007-2013
#
# Distributed under the terms of the MIT license.
#
@@ -19,7 +19,7 @@
import cgi
import cgitb
import re
-import wikipedia as pywikibot
+import pywikibot
cgitb.enable()
@@ -69,17 +69,18 @@
except AttributeError:
user = None
count += 1
- lines.append((user,count,line))
+ lines.append((user, count, line))
elif 'rcoptions' in line:
- print line.replace(mysite.path() + "?title=Speciaal:RecenteWijzigingen&",
+ print line.replace(mysite.path() +
+ "?title=Speciaal:RecenteWijzigingen&",
"rcsort.py?")
rcoptions = True
elif newbies and 'Nieuwste' in line:
- line = line.replace(mysite.path() + "?title=Speciaal:Bijdragen&",
- "rcsort.py?").replace("target=newbies",
- "newbies=true")
+ line = line.replace(mysite.path() + "?title=Speciaal:Bijdragen&",
+ "rcsort.py?").replace("target=newbies",
+ "newbies=true")
if '</fieldset>' in line:
- line = line[line.find('</fieldset>')+11:]
+ line = line[line.find('</fieldset>') + 11:]
print line
rcoptions = True
lines.sort()
@@ -88,13 +89,15 @@
for line in lines:
if line[0] != last:
print "</ul>"
- if line[0] == None:
+ if line[0] is None:
print "<h2>Gebruiker onbekend</h2>"
else:
- pywikibot.output(u"<h2>%s</h2>"%line[0],toStdout=True)
+ pywikibot.output(u"<h2>%s</h2>" % line[0], toStdout=True)
print "<ul>"
last = line[0]
- pywikibot.output(line[2].replace('href="/w','href="http://nl.wikipedia.org/w'), toStdout = True)
+ pywikibot.output(line[2].replace('href="/w',
+ 'href="http://nl.wikipedia.org/w'),
+ toStdout=True)
print
print "</ul>"
diff --git a/replace.py b/replace.py
index b347062..76adcfa 100644
--- a/replace.py
+++ b/replace.py
@@ -160,15 +160,19 @@
Please type "replace.py -help | more" if you can't read the top of the help.
"""
#
-# (C) Daniel Herding & the Pywikipedia team, 2004-2012
+# (c) Daniel Herding, 2004-2007
+# (c) Pywikibot team, 2004-2013
#
-__version__='$Id$'
+__version__ = '$Id$'
#
# Distributed under the terms of the MIT license.
#
-import sys, re, time, codecs
-import wikipedia as pywikibot
+import sys
+import re
+import time
+import codecs
+import pywikibot
import pagegenerators
import editarticle
from pywikibot import i18n
@@ -249,7 +253,7 @@
return True
if "require-title" in self.exceptions:
for req in self.exceptions['require-title']:
- if not req.search(title): # if not all requirements are met:
+ if not req.search(title): # if not all requirements are met:
return True
return False
@@ -263,9 +267,8 @@
class ReplaceRobot:
- """
- A bot that can do text replacements.
- """
+ """ A bot that can do text replacements. """
+
def __init__(self, generator, replacements, exceptions={},
acceptall=False, allowoverlap=False, recursive=False,
addedCat=None, sleep=None, editSummary='', articles=None,
@@ -419,8 +422,8 @@
if self.editcounter % 100:
return ''
else:
- return (u'<!-- ***** %dth title is above this line. ***** -->\n' %
- self.editcounter)
+ return (u'<!-- ***** %dth title is above this line. ***** -->\n'
+ % self.editcounter)
def run(self):
"""
@@ -447,8 +450,8 @@
new_text = original_text
while True:
if self.isTextExcepted(new_text):
- pywikibot.output(
- u'Skipping %s because it contains text that is on the exceptions list.'
+ pywikibot.output(u'Skipping %s because it contains text '
+ u'that is on the exceptions list.'
% page.title(asLink=True))
break
new_text = self.doReplacements(new_text)
@@ -458,7 +461,7 @@
break
if self.recursive:
newest_text = self.doReplacements(new_text)
- while (newest_text!=new_text):
+ while newest_text != new_text:
new_text = newest_text
newest_text = self.doReplacements(new_text)
if hasattr(self, "addedCat"):
@@ -476,16 +479,15 @@
break
if self.exctitles:
choice = pywikibot.inputChoice(
- u'Do you want to accept these changes?',
- ['Yes', 'No', 'no+eXcept', 'Edit',
- 'open in Browser', 'All', 'Quit'],
- ['y', 'N', 'x', 'e', 'b', 'a', 'q'], 'N')
+ u'Do you want to accept these changes?',
+ ['Yes', 'No', 'no+eXcept', 'Edit',
+ 'open in Browser', 'All', 'Quit'],
+ ['y', 'N', 'x', 'e', 'b', 'a', 'q'], 'N')
else:
choice = pywikibot.inputChoice(
- u'Do you want to accept these changes?',
- ['Yes', 'No', 'Edit', 'open in Browser', 'All',
- 'Quit'],
- ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
+ u'Do you want to accept these changes?',
+ ['Yes', 'No', 'Edit', 'open in Browser', 'All', 'Quit'],
+ ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
if choice == 'e':
editor = editarticle.TextEditor()
as_edited = editor.edit(original_text)
@@ -513,7 +515,7 @@
return
if choice == 'a':
self.acceptall = True
- if choice == 'x': #May happen only if self.exctitles isn't None
+ if choice == 'x': # May happen only if self.exctitles isn't None
self.exctitles.write(
u"ur'^%s$',\n" % re.escape(page.title()))
self.exctitles.flush()
@@ -527,12 +529,13 @@
# This is separately in two clauses of if for
# future purposes to get feedback form put_async
else:
- #Save the title for later processing instead of editing
+ # Save the title for later processing instead of editing
self.editcounter += 1
self.articles.write(u'#%s\n%s'
- % (page.title(asLink=True, textlink=True),
- self.splitLine()))
- self.articles.flush() # For the peace of our soul :-)
+ % (page.title(asLink=True,
+ textlink=True),
+ self.splitLine()))
+ self.articles.flush() # For the peace of our soul :-)
# choice must be 'N'
break
if self.acceptall and new_text != original_text:
@@ -540,7 +543,7 @@
#Primary behaviour: working on wiki
try:
page.put(new_text, self.editSummary)
- self.editcounter += 1 #increment only on success
+ self.editcounter += 1 # increment only on success
except pywikibot.EditConflict:
pywikibot.output(u'Skipping %s because of edit conflict'
% (page.title(),))
@@ -558,19 +561,20 @@
#Save the title for later processing instead of editing
self.editcounter += 1
self.articles.write(u'#%s\n%s'
- % (page.title(asLink=True, textlink=True),
- self.splitLine()))
+ % (page.title(asLink=True,
+ textlink=True),
+ self.splitLine()))
self.articles.flush()
#Finally:
self.writeEditCounter()
self.writeExceptCounter()
+
def prepareRegexForMySQL(pattern):
pattern = pattern.replace('\s', '[:space:]')
pattern = pattern.replace('\d', '[:digit:]')
pattern = pattern.replace('\w', '[:alnum:]')
-
pattern = pattern.replace("'", "\\" + "'")
#pattern = pattern.replace('\\', '\\\\')
#for char in ['[', ']', "'"]:
@@ -594,8 +598,8 @@
'text-contains': [],
'inside': [],
'inside-tags': [],
- 'require-title': [], # using a seperate requirements dict needs some
- } # major refactoring of code.
+        'require-title': [],  # using a separate requirements dict needs some
+ } # major refactoring of code.
# Should the elements of 'replacements' and 'exceptions' be interpreted
# as regular expressions?
@@ -634,17 +638,16 @@
# too much CPU
sleep = None
# Do not save the page titles, rather work on wiki
- filename = None # The name of the file to save titles
- titlefile = None # The file object itself
+ filename = None # The name of the file to save titles
+ titlefile = None # The file object itself
# If we save, primary behaviour is append rather then new file
append = True
# Default: don't write titles to exception file and don't read them.
- excoutfilename = None # The name of the file to save exceptions
- excoutfile = None # The file object itself
+ excoutfilename = None # The name of the file to save exceptions
+ excoutfile = None # The file object itself
# excinfilename: reserved for later use (reading back exceptions)
# If we save exceptions, primary behaviour is append
excappend = True
-
# Read commandline parameters.
for arg in pywikibot.handleArgs(*args):
@@ -661,7 +664,7 @@
xmlFilename = i18n.input('pywikibot-enter-xml-filename')
else:
xmlFilename = arg[5:]
- elif arg =='-sql':
+ elif arg == '-sql':
useSql = True
elif arg.startswith('-page'):
if len(arg) == 5:
@@ -706,7 +709,7 @@
try:
commandline_replacements.extend(
[x.lstrip(u'\uFEFF').rstrip('\r\n')
- for x in codecs.open(replacefile, 'r', 'utf-8')])
+ for x in codecs.open(replacefile, 'r', 'utf-8')])
except IOError:
raise pywikibot.Error(
'\n%s cannot be opened. Try again :-)' % replacefile)
@@ -764,12 +767,12 @@
commandline_replacements[1])})
elif (len(commandline_replacements) > 1):
if (fix is None):
- for i in xrange (0, len(commandline_replacements), 2):
+ for i in xrange(0, len(commandline_replacements), 2):
replacements.append((commandline_replacements[i],
commandline_replacements[i + 1]))
if not summary_commandline:
- pairs = [( commandline_replacements[i],
- commandline_replacements[i + 1] )
+ pairs = [(commandline_replacements[i],
+ commandline_replacements[i + 1])
for i in range(0, len(commandline_replacements), 2)]
replacementsDescription = '(%s)' % ', '.join(
[('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
@@ -778,8 +781,8 @@
{'description':
replacementsDescription})
else:
- raise pywikibot.Error(
- 'Specifying -fix with replacements is undefined')
+ raise pywikibot.Error(
+ 'Specifying -fix with replacements is undefined')
elif fix is None:
old = pywikibot.input(u'Please enter the text that should be replaced:')
new = pywikibot.input(u'Please enter the new text:')
@@ -787,8 +790,8 @@
replacements.append((old, new))
while True:
old = pywikibot.input(
- u'Please enter another text that should be replaced,' +
- u'\nor press Enter to start:')
+ u'Please enter another text that should be replaced,\n'
+ u'or press Enter to start:')
if old == '':
change += ')'
break
@@ -810,7 +813,7 @@
else:
# Perform one of the predefined actions.
- fixname = fix # Save the name for passing to exceptions function.
+ fixname = fix # Save the name for passing to exceptions function.
try:
fix = fixes.fixes[fix]
except KeyError:
@@ -839,8 +842,8 @@
baseExcDict = incl
except NameError:
pywikibot.output(
- u'\nIncluded exceptions dictionary does not exist.' +
- u' Continuing with the exceptions\ngiven in fix.\n')
+ u'\nIncluded exceptions dictionary does not exist.'
+ u' Continuing with the exceptions\ngiven in fix.\n')
baseExcDict = None
if baseExcDict:
for l in baseExcDict:
@@ -879,8 +882,8 @@
oldR = re.compile(old, flags)
replacements[i] = oldR, new
- for exceptionCategory in [
- 'title', 'require-title', 'text-contains', 'inside']:
+ for exceptionCategory in ['title', 'require-title',
+ 'text-contains', 'inside']:
if exceptionCategory in exceptions:
patterns = exceptions[exceptionCategory]
if not regex:
@@ -943,8 +946,8 @@
if excoutfilename:
try:
excoutfile = codecs.open(
- excoutfilename, encoding='utf-8',
- mode=(lambda x: x and 'a' or 'w')(excappend))
+ excoutfilename, encoding='utf-8',
+ mode=(lambda x: x and 'a' or 'w')(excappend))
except IOError:
pywikibot.output("%s cannot be opened for writing." %
excoutfilename)
diff --git a/replicate_wiki.py b/replicate_wiki.py
index d6e9ba2..17f9383 100644
--- a/replicate_wiki.py
+++ b/replicate_wiki.py
@@ -1,26 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-'''
-This bot replicates all pages (from specific namespaces) in a wiki to a second wiki within one family.
+"""
+This bot replicates all pages (from specific namespaces) in a wiki to a second
+wiki within one family.
Example:
python replicate_wiki.py [-r] -ns 10 -f wikipedia -o nl li fy
-to copy all templates from an nlwiki to liwiki and fywiki. It will show which pages have to be changed
+to copy all templates from an nlwiki to liwiki and fywiki. It will show which
+pages have to be changed
if -r is not present, and will only actually write pages if -r /is/ present.
-You can add replicate_replace to your user_config.py, which has the following format:
+You can add replicate_replace to your user_config.py, which has the following
+format:
replicate_replace = {
'wikipedia:li': {'Hoofdpagina': 'Veurblaad'}
}
-to replace all occurences of 'Hoofdpagina' with 'Veurblaad' when writing to liwiki. Note that this does
-not take the origin wiki into account.
-'''
+to replace all occurrences of 'Hoofdpagina' with 'Veurblaad' when writing to
+liwiki. Note that this does not take the origin wiki into account.
+"""
#
# (C) Kasper Souren 2012-2013
-# (C) 2013 Pywikipediabot team
+# (C) 2013 Pywikibot team
#
# Distributed under the terms of the MIT license.
#
@@ -32,26 +35,27 @@
from wikipedia import *
from itertools import imap
+
def namespaces(site):
- '''dict from namespace number to prefix'''
- ns = dict(map(lambda n: (site.getNamespaceIndex(n), n),
+ """dict from namespace number to prefix"""
+ ns = dict(map(lambda n: (site.getNamespaceIndex(n), n),
site.namespaces()))
ns[0] = ''
return ns
def multiple_replace(text, word_dict):
- '''Replace all occurrences in text of key value pairs in word_dict'''
+ """Replace all occurrences in text of key value pairs in word_dict"""
for key in word_dict:
text = text.replace(key, word_dict[key])
return text
class SyncSites:
- '''Work is done in here.'''
+ """Work is done in here."""
def __init__(self, options):
- self.options = options
+ self.options = options
if options.original_wiki:
original_wiki = options.original_wiki
@@ -61,11 +65,9 @@
print "Syncing from " + original_wiki
family = options.family or config.family
-
sites = options.destination_wiki
-
self.original = getSite(original_wiki, family)
-
+
if options.namespace and 'help' in options.namespace:
nsd = namespaces(self.original)
for k in nsd:
@@ -76,7 +78,7 @@
self.differences = {}
self.user_diff = {}
- print 'Syncing to',
+ print 'Syncing to',
for s in self.sites:
self.differences[s] = []
self.user_diff[s] = []
@@ -84,16 +86,17 @@
print
def check_sysops(self):
- '''Check if sysops are the same
+ """Check if sysops are the same
TODO: make optional
- '''
+
+ """
def get_users(site):
userlist = site.getUrl(site.get_address('Special:Userlist&group=sysop'))
# Hackery but working. At least on MW 1.15.0
# User namespace is number 2
return set(re.findall(site.namespace(2) + ':(\w+)["\&]', userlist))
-
+
ref_users = get_users(self.original)
for site in self.sites:
users = get_users(site)
@@ -102,18 +105,18 @@
self.user_diff[site] = diff
def check_namespaces(self):
- '''Check all namespaces, to be ditched for clarity'''
+ """Check all namespaces, to be ditched for clarity"""
namespaces = [
- 0, # Main
- 8, # MediaWiki
- 152, # DPL
- 102, # Eigenschap
- 104, # Type
- 106, # Formulier
- 108, # Concept
- 10, # Sjabloon
- ]
-
+ 0, # Main
+ 8, # MediaWiki
+ 152, # DPL
+ 102, # Eigenschap
+ 104, # Type
+ 106, # Formulier
+ 108, # Concept
+ 10, # Sjabloon
+ ]
+
if self.options.namespace:
print options.namespace
namespaces = [int(options.namespace)]
@@ -123,13 +126,13 @@
self.check_namespace(ns)
def check_namespace(self, namespace):
- '''Check an entire namespace'''
+ """Check an entire namespace"""
print "\nCHECKING NAMESPACE", namespace
pages = imap(lambda p: p.title(),
- self.original.allpages('!', namespace))
+ self.original.allpages('!', namespace))
for p in pages:
- if not p in ['MediaWiki:Sidebar', 'MediaWiki:Mainpage',
+ if not p in ['MediaWiki:Sidebar', 'MediaWiki:Mainpage',
'MediaWiki:Sitenotice', 'MediaWiki:MenuSidebar']:
try:
self.check_page(p)
@@ -139,32 +142,32 @@
print 'error: Redirectpage - todo: handle gracefully'
print
-
def generate_overviews(self):
- '''Create page on wikis with overview of bot results'''
+ """Create page on wikis with overview of bot results"""
for site in self.sites:
sync_overview_page = Page(site, 'User:' + site.loggedInAs() + '/sync.py overview')
output = "== Pages that differ from original ==\n\n"
if self.differences[site]:
- output += "".join(map(lambda l: '* [[:' + l + "]]\n", self.differences[site]))
+ output += "".join(map(lambda l: '* [[:' + l + "]]\n",
+ self.differences[site]))
else:
output += "All important pages are the same"
-
+
output += "\n\n== Admins from original that are missing here ==\n\n"
if self.user_diff[site]:
- output += "".join(map(lambda l: '* ' + l.replace('_', ' ') + "\n", self.user_diff[site]))
+ output += "".join(map(lambda l: '* ' + l.replace('_', ' ') + "\n",
+ self.user_diff[site]))
else:
output += "All users from original are also present on this wiki"
print output
sync_overview_page.put(output, self.put_message(site))
-
def put_message(self, site):
return site.loggedInAs() + ' sync.py synchronization from ' + str(self.original)
def check_page(self, pagename):
- '''Check one page'''
+ """Check one page"""
print "\nChecking", pagename,
sys.stdout.flush()
@@ -180,16 +183,16 @@
print "\nCross namespace, new title: ", new_pagename
else:
new_pagename = pagename
-
+
page2 = Page(site, new_pagename)
if page2.exists():
txt2 = page2.get()
-
else:
txt2 = ''
-
- if config.replicate_replace.has_key(str(site)):
- txt_new = multiple_replace(txt1, config.replicate_replace[str(site)])
+
+ if str(site) in config.replicate_replace:
+ txt_new = multiple_replace(txt1,
+ config.replicate_replace[str(site)])
if txt1 != txt_new:
print 'NOTE: text replaced using config.sync_replace'
print txt1, txt_new, txt2
@@ -198,9 +201,8 @@
if txt1 != txt2:
print "\n", site, 'DIFFERS'
self.differences[site].append(pagename)
-
- if self.options.replace:
- page2.put(txt1, self.put_message(site))
+ if self.options.replace:
+ page2.put(txt1, self.put_message(site))
else:
sys.stdout.write('.')
sys.stdout.flush()
@@ -223,7 +225,7 @@
help="specify namespace")
parser.add_argument("-dns", "--dest-namespace", dest="dest_namespace",
help="destination namespace (if different)")
-
+
(options, args) = parser.parse_known_args()
# sync is global for convenient IPython debugging
@@ -231,4 +233,3 @@
sync.check_sysops()
sync.check_namespaces()
sync.generate_overviews()
-
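
The replicate_wiki.py hunks above drop dict.has_key() in favour of the in operator when consulting replicate_replace. A rough, self-contained sketch of that lookup, reusing the file's multiple_replace helper and the mapping from the docstring; the page text and site key are stand-ins, not real wiki data.

    # Hedged sketch, assuming the replicate_replace mapping from the docstring.
    replicate_replace = {
        'wikipedia:li': {'Hoofdpagina': 'Veurblaad'},
    }

    def multiple_replace(text, word_dict):
        """Replace all occurrences in text of key value pairs in word_dict."""
        for key in word_dict:
            text = text.replace(key, word_dict[key])
        return text

    site_key = 'wikipedia:li'                    # stand-in for str(site)
    txt1 = 'Zie [[Hoofdpagina]].'                # invented sample page text
    if site_key in replicate_replace:            # preferred over .has_key(site_key)
        txt_new = multiple_replace(txt1, replicate_replace[site_key])
        # txt_new == 'Zie [[Veurblaad]].'
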
diff --git a/revertbot.py b/revertbot.py
index 7f54cf5..dc58164 100644
--- a/revertbot.py
+++ b/revertbot.py
@@ -33,7 +33,8 @@
'uclimit': '500',
'ucuser': self.site.username(),
}
- if ns: predata['ucnamespace'] = ns
+ if ns:
+ predata['ucnamespace'] = ns
if max < 500 and max != -1:
predata['uclimit'] = str(max)
@@ -103,8 +104,10 @@
rev = page['revisions'][1]
comment = u'Reverted to revision %s by %s on %s' % (rev['revid'],
- rev['user'], rev['timestamp'])
- if self.comment: comment += ': ' + self.comment
+ rev['user'],
+ rev['timestamp'])
+ if self.comment:
+ comment += ': ' + self.comment
page = pywikibot.Page(self.site, item['title'])
pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
@@ -125,7 +128,7 @@
def callback(self, item):
if 'top' in item:
page = pywikibot.Page(self.site, item['title'])
- text=page.get()
+ text = page.get()
pattern = re.compile(u'\[\[.+?:.+?\..+?\]\]', re.UNICODE)
return pattern.search(text) >= 0
return False
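
In the revertbot.py hunk above, the summary string itself is unchanged; only the one-line "if self.comment: ..." is split onto two lines. A hedged sketch of how that summary is assembled, with made-up revision data standing in for the API response and for self.comment:

    # Illustrative values only; rev and extra mimic the real revision dict
    # and self.comment from the hunk above.
    rev = {'revid': 12345, 'user': 'Example', 'timestamp': '2013-12-20T00:00:00Z'}
    extra = 'rv linkspam'
    comment = u'Reverted to revision %s by %s on %s' % (rev['revid'],
                                                        rev['user'],
                                                        rev['timestamp'])
    if extra:
        comment += ': ' + extra
    # comment == u'Reverted to revision 12345 by Example on 2013-12-20T00:00:00Z: rv linkspam'
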
--
To view, visit https://gerrit.wikimedia.org/r/103055
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7e5b91177806e19100e8e140f4e8674397b9a522
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot