[Pywikipedia-l] SVN: [5300] trunk/pywikipedia/reflinks.py

nicdumz at svn.wikimedia.org nicdumz at svn.wikimedia.org
Sat May 3 12:48:25 UTC 2008


Revision: 5300
Author:   nicdumz
Date:     2008-05-03 12:48:24 +0000 (Sat, 03 May 2008)

Log Message:
-----------
* Correcting linklist URL
* adding -ignorepdf

Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py

Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	2008-05-03 07:59:59 UTC (rev 5299)
+++ trunk/pywikipedia/reflinks.py	2008-05-03 12:48:24 UTC (rev 5300)
@@ -27,6 +27,9 @@
 
 -xmlstart               Page to start with when using an XML dump
 
+-ignorepdf              Do not handle PDF files (handy if you use Windows and 
+                        can't get pdfinfo)
+
 Basic pagegenerators commands, -page, etc...
 """
 # (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
@@ -87,7 +90,7 @@
 	ur'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'+
 	# unbracketed without ()
 	ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
-listof404pages = 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz'
+listof404pages = 'http://www.twoevils.org/files/wikipedia/404-links.txt'
 
 class XmlDumpPageGenerator:
     def __init__(self, xmlFilename, xmlStart, namespaces):
@@ -183,10 +186,11 @@
             self.title = self.title.title()
 
 class ReferencesRobot:
-    def __init__(self, generator, acceptall = False, limit = None):
+    def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
         self.generator = generator
         self.acceptall = acceptall
         self.limit = limit
+        self.ignorepdf = ignorepdf
         self.site = wikipedia.getSite()
         self.stopPage = wikipedia.translate(self.site, stopPage)
         self.stopPageRevId = wikipedia.Page(self.site, 
@@ -318,7 +322,7 @@
                     headers = f.info()
                     contentType = headers.getheader('Content-Type')
                     if contentType and not self.MIME.search(contentType):
-                        if ref.link.lower().endswith('.pdf'):
+                        if ref.link.lower().endswith('.pdf') and not ignorepdf:
                             # If file has a PDF suffix
                             self.getPDFTitle(ref, f)
                         else:
@@ -490,6 +494,7 @@
     PageTitles = []
     xmlFilename = None
     always = False
+    ignorepdf = False
     limit = None
     namespaces = []
     generator = None
@@ -505,6 +510,8 @@
             wikipedia.setAction(arg[9:])
         elif arg == '-always':
             always = True
+        elif arg == '-ignorepdf':
+            ignorepdf= True
         elif arg.startswith('-limit:'):
             limit = int(arg[7:])
         elif arg.startswith('-xmlstart'):
@@ -538,7 +545,7 @@
         sys.exit()
     generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
     generator = pagegenerators.RedirectFilterPageGenerator(generator)
-    bot = ReferencesRobot(generator, always, limit)
+    bot = ReferencesRobot(generator, always, limit, ignorepdf)
     bot.run()
 
 if __name__ == "__main__":





More information about the Pywikipedia-l mailing list