Revision: 3934
Author: valhallasw
Date: 2007-08-01 19:53:05 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
Changed Site('de', 'wikipedia') comparison to 'wikipedia:de'. The latter is fine with wikipedia_family.py missing, the former is not.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-01 16:37:37 UTC (rev 3933)
+++ trunk/pywikipedia/wikipedia.py 2007-08-01 19:53:05 UTC (rev 3934)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
"""
Library to get and put pages on a MediaWiki.
@@ -2850,7 +2850,7 @@
if site is None:
site = getSite()
- if site == Site('de', 'wikipedia'):
+ if site.sitename() == 'wikipedia:de':
raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv2#Positio…')
s = categoryFormat(new, insite = site)
Revision: 3933
Author: wikipedian
Date: 2007-08-01 16:37:37 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
fixed handling of URLs in multi-line templates, e.g. the infobox in
http://de.wikipedia.org/wiki/Camon_%28Ari%C3%A8ge%29
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-08-01 16:24:02 UTC (rev 3932)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-01 16:37:37 UTC (rev 3933)
@@ -107,7 +107,7 @@
# might be a | or a } directly after a URL which does not belong to
# the URL itself.
# Blow up templates with spaces to avoid these problems.
- templateWithParamsR = re.compile(r'{{(.*?[^ ])\|([^ ].*?)}}')
+ templateWithParamsR = re.compile(r'{{(.*?[^ ])\|([^ ].*?)}}', re.DOTALL)
while templateWithParamsR.search(text):
text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
Revision: 3932
Author: wikipedian
Date: 2007-08-01 16:24:02 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
don't crash when the Internet Archive gives a 403, e.g. on
http://web.archive.org/web/*/http://highmarkfunds.stockpoint.com/highmarkfu…
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-08-01 16:17:20 UTC (rev 3931)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-01 16:24:02 UTC (rev 3932)
@@ -147,7 +147,12 @@
def getArchiveURL(self):
wikipedia.output(u'Consulting the Internet Archive for %s' % self.url)
archiveURL = 'http://web.archive.org/web/*/%s' % self.url
- f = urllib2.urlopen(archiveURL)
+ try:
+ f = urllib2.urlopen(archiveURL)
+ except urllib2.HTTPError:
+ # The Internet Archive yields a 403 error when the site was not
+ # archived due to robots.txt restrictions.
+ return None
text = f.read()
if text.find("Search Results for ") != -1:
return archiveURL
Revision: 3931
Author: wikipedian
Date: 2007-08-01 16:17:20 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
added -repeat parameter: Loads all wiki pages where dead links were
found during a prior run
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-08-01 16:05:20 UTC (rev 3930)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-01 16:17:20 UTC (rev 3931)
@@ -13,6 +13,9 @@
two times, with a time lag of at least one week. Such links will be logged to a
.txt file in the deadlinks subdirectory.
+After running the bot and waiting for at least one week, you can re-check those
+pages where dead links were found, using the -repeat parameter.
+
In addition to the logging step, it is possible to automatically report dead
links to the talk page of the article where the link was found. To use this
feature, set report_dead_links_on_talk = True in your user-config.py, or
@@ -30,8 +33,14 @@
Loads all wiki pages using the Special:Allpages feature, starting at
"Example page"
+ python weblinkchecker.py -weblink:www.example.org
+ Loads all wiki pages that link to www.example.org
+
python weblinkchecker.py Example page
Only checks links found in the wiki page "Example page"
+
+ python weblinkchecker.py -repeat
+ Loads all wiki pages where dead links were found during a prior run
"""
#
@@ -571,6 +580,19 @@
thread.setDaemon(True)
thread.start()
+def RepeatPageGenerator():
+ history = History(None)
+ pageTitles = set()
+ for (key, value) in history.historyDict.iteritems():
+ for entry in value:
+ pageTitle = entry[0]
+ pageTitles.add(pageTitle)
+ pageTitles = list(pageTitles)
+ pageTitles.sort()
+ for pageTitle in pageTitles:
+ page = wikipedia.Page(wikipedia.getSite(), pageTitle)
+ yield page
+
def countLinkCheckThreads():
i = 0
for thread in threading.enumerate():
@@ -597,6 +619,8 @@
config.report_dead_links_on_talk = False
elif arg.startswith('-namespace:'):
namespaces.append(int(arg[11:]))
+ elif arg == '-repeat':
+ gen = RepeatPageGenerator()
else:
generator = genFactory.handleArg(arg)
if generator:
Revision: 3928
Author: wikipedian
Date: 2007-08-01 00:30:33 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
made it possible to use all the typical parameters such as -ref:,
-links:, -file:, and -weblink:.
added -namespace: parameter.
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-07-31 16:03:31 UTC (rev 3927)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-01 00:30:33 UTC (rev 3928)
@@ -9,7 +9,7 @@
The bot will store all links found dead in a .dat file in the deadlinks
subdirectory. To avoid the removing of links which are only temporarily
-unavailable, the bot only reports links which were reported dead at least
+unavailable, the bot ONLY reports links which were reported dead at least
two times, with a time lag of at least one week. Such links will be logged to a
.txt file in the deadlinks subdirectory.
@@ -29,7 +29,7 @@
python weblinkchecker.py -start:Example_page
Loads all wiki pages using the Special:Allpages feature, starting at
"Example page"
-
+
python weblinkchecker.py Example page
Only checks links found in the wiki page "Example page"
"""
@@ -91,22 +91,6 @@
re.compile('.*[\./(a)]berlinonline.de(/.*)?'), # a de: user wants to fix them by hand and doesn't want them to be deleted, see [[de:Benutzer:BLueFiSH.as/BZ]].
]
-class Global(object):
- talk = config.report_dead_links_on_talk
-
- def handleArgs(self, args):
- unhandledArguments = []
- for arg in args:
- if arg == '-talk':
- self.talk = True
- elif arg == '-notalk':
- self.talk = False
- else:
- unhandledArguments.append(arg)
- return unhandledArguments
-
-globalvar = Global()
-
def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
text = wikipedia.removeDisabledParts(text)
@@ -546,10 +530,9 @@
Robot which will use several LinkCheckThreads at once to search for dead
weblinks on pages provided by the given generator.
'''
- def __init__(self, generator, start ='!'):
+ def __init__(self, generator):
self.generator = generator
- self.start = start
- if globalvar.talk:
+ if config.report_dead_links_on_talk:
#wikipedia.output("Starting talk page thread")
reportThread = DeadLinkReportThread()
# thread dies when program terminates
@@ -598,25 +581,37 @@
def main():
gen = None
- start = '!'
- pageTitle = []
- args = wikipedia.handleArgs()
- args = globalvar.handleArgs(args)
-
- for arg in args:
- if arg.startswith('-start:'):
- start = arg[7:]
+ singlePageTitle = []
+ # Which namespaces should be processed?
+ # default to [] which means all namespaces will be processed
+ namespaces = []
+ # This factory is responsible for processing command line arguments
+ # that are also used by other scripts and that determine on which pages
+ # to work on.
+ genFactory = pagegenerators.GeneratorFactory()
+
+ for arg in wikipedia.handleArgs():
+ if arg == '-talk':
+ config.report_dead_links_on_talk = True
+ elif arg == '-notalk':
+ config.report_dead_links_on_talk = False
+ elif arg.startswith('-namespace:'):
+ namespaces.append(int(arg[11:]))
else:
- pageTitle.append(arg)
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ else:
+ singlePageTitle.append(arg)
- if pageTitle:
- pageTitle = ' '.join(pageTitle)
- page = wikipedia.Page(wikipedia.getSite(), pageTitle)
+ if singlePageTitle:
+ singlePageTitle = ' '.join(singlePageTitle)
+ page = wikipedia.Page(wikipedia.getSite(), singlePageTitle)
gen = iter([page])
- else:
- gen = pagegenerators.AllpagesPageGenerator(start)
if gen:
+ if namespaces != []:
+ gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
gen = pagegenerators.PreloadingGenerator(gen, pageNumber = 240)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
bot = WeblinkCheckerRobot(gen)
@@ -651,7 +646,7 @@
bot.history.save()
else:
wikipedia.showHelp()
-
+
if __name__ == "__main__":
try:
main()