[Pywikipedia-l] SVN: [5300] trunk/pywikipedia/reflinks.py
nicdumz at svn.wikimedia.org
nicdumz at svn.wikimedia.org
Sat May 3 12:48:25 UTC 2008
Revision: 5300
Author: nicdumz
Date: 2008-05-03 12:48:24 +0000 (Sat, 03 May 2008)
Log Message:
-----------
* Correcting linklist URL
* Adding -ignorepdf
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2008-05-03 07:59:59 UTC (rev 5299)
+++ trunk/pywikipedia/reflinks.py 2008-05-03 12:48:24 UTC (rev 5300)
@@ -27,6 +27,9 @@
-xmlstart Page to start with when using an XML dump
+-ignorepdf Do not handle PDF files (handy if you use Windows and
+ can't get pdfinfo)
+
Basic pagegenerators commands, -page, etc...
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
@@ -87,7 +90,7 @@
ur'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'+
# unbracketed without ()
ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
-listof404pages = 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz'
+listof404pages = 'http://www.twoevils.org/files/wikipedia/404-links.txt'
class XmlDumpPageGenerator:
def __init__(self, xmlFilename, xmlStart, namespaces):
@@ -183,10 +186,11 @@
self.title = self.title.title()
class ReferencesRobot:
- def __init__(self, generator, acceptall = False, limit = None):
+ def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
self.generator = generator
self.acceptall = acceptall
self.limit = limit
+ self.ignorepdf = ignorepdf
self.site = wikipedia.getSite()
self.stopPage = wikipedia.translate(self.site, stopPage)
self.stopPageRevId = wikipedia.Page(self.site,
@@ -318,7 +322,7 @@
headers = f.info()
contentType = headers.getheader('Content-Type')
if contentType and not self.MIME.search(contentType):
- if ref.link.lower().endswith('.pdf'):
+ if ref.link.lower().endswith('.pdf') and not ignorepdf:
# If file has a PDF suffix
self.getPDFTitle(ref, f)
else:
@@ -490,6 +494,7 @@
PageTitles = []
xmlFilename = None
always = False
+ ignorepdf = False
limit = None
namespaces = []
generator = None
@@ -505,6 +510,8 @@
wikipedia.setAction(arg[9:])
elif arg == '-always':
always = True
+ elif arg == '-ignorepdf':
+ ignorepdf= True
elif arg.startswith('-limit:'):
limit = int(arg[7:])
elif arg.startswith('-xmlstart'):
@@ -538,7 +545,7 @@
sys.exit()
generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
generator = pagegenerators.RedirectFilterPageGenerator(generator)
- bot = ReferencesRobot(generator, always, limit)
+ bot = ReferencesRobot(generator, always, limit, ignorepdf)
bot.run()
if __name__ == "__main__":
More information about the Pywikipedia-l
mailing list