[Pywikipedia-svn] SVN: [8283] trunk/pywikipedia/imagecopy_enwp.py

12 Jun 2010

Revision: 8283
Author:   multichill
Date:     2010-06-12 22:36:27 +0000 (Sat, 12 Jun 2010)
Log Message:
-----------
* Added a template skip list
* Fixed Licensing header detection
* Improved the {{information}} detection.
Modified Paths:
--------------
    trunk/pywikipedia/imagecopy_enwp.py
Modified: trunk/pywikipedia/imagecopy_enwp.py
===================================================================

--- trunk/pywikipedia/imagecopy_enwp.py	2010-06-12 13:39:27 UTC (rev 8282)
+++ trunk/pywikipedia/imagecopy_enwp.py	2010-06-12 22:36:27 UTC (rev 8283)
@@ -73,6 +73,41 @@
     'en': u'[[:File:%s|File]] moved to [[:commons:File:%s|commons]].',
 }
+skipTemplates = [u'Db-f1',
+                 u'Db-f2',
+                 u'Db-f3',
+                 u'Db-f7',
+                 u'Db-f8',
+                 u'Db-f9',
+                 u'Db-f10',
+                 u'NowCommons',
+                 u'CommonsNow',
+                 u'Nowcommons',
+                 u'NowCommonsThis',
+                 u'Nowcommons2',
+                 u'NCT',
+                 u'Nowcommonsthis',
+                 u'Moved to commons',
+                 u'Now Commons',
+                 u'Now at commons',
+                 u'Db-nowcommons',
+                 u'WikimediaCommons',
+                 u'Now commons',
+                 u'Di-no source',
+                 u'Di-no license',
+                 u'Di-no permission',
+                 u'Di-orphaned fair use',
+                 u'Di-no source no license',
+                 u'Di-replaceable fair use',
+                 u'Di-no fair use rationale',
+                 u'Di-disputed fair use rationale',
+                 u'Puf',
+                 u'PUI',
+                 u'Pui',
+                 u'Ffd',
+                 ]
+                 
+
 licenseTemplates = [(u'{{(self|self2)|([^}]+)}}', u'{{Self|\2|author=[[:%(lang)s:User:%(author)s|%(author)s]] at [http://%(lang)s.%(family)s.org %(lang)s.%(family)s]}}'),
                     (u'{{(GFDL-self|GFDL-self-no-disclaimers)|([^}]+)}}', u'{{Self|GFDL|\2|author=[[:%(lang)s:User:%(author)s|%(author)s]] at [http://%(lang)s.%(family)s.org %(lang)s.%(family)s]}}'),
                     (u'{{GFDL-self-with-disclaimers|([^}]+)}}', u'{{Self|GFDL-with-disclaimers|\1|author=[[:%(lang)s:User:%(author)s|%(author)s]] at [http://%(lang)s.%(family)s.org %(lang)s.%(family)s]}}'),
@@ -82,7 +117,7 @@
                     ]
sourceGarbage =     [u'== Summary ==',
-                     u'== Licensing ==',
+                     u'== Licensing:? ==',
                     ]
class Tkdialog:
@@ -227,19 +262,15 @@
         return (self.filename, self.description, self.date, self.source, self.author, self.licensetemplate, self.categories, self.skip)
-def doiskip(pagetext):
+def doiskip(imagepage):
     '''
     Skip this image or not.
     Returns True if the image is on the skip list, otherwise False
-    
-    saltos=getautoskip()
-    #print saltos
-    for salto in saltos:
-        rex=ur'{{\s*['+salto[0].upper()+salto[0].lower()+']'+salto[1:]+'(}}||)'
-        #print rex
-        if re.search(rex, pagetext):
+    '''
+    for template in imagepage.templates():
+        if template in skipTemplates:
+            wikipedia.output(u'Found ' + template + u' which is on the template skip list')
             return True
-    '''
     return False
def getNewFields(imagepage):
@@ -266,20 +297,26 @@
     other_versions = u''
     text = imagepage.get()
     # Need to add the permission field
-    regex =u'{{Information[\s\r\n]*|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)(|[\s\r\n]*permission.*=(?P<permission>[^}]*))?(|[\s\r\n]*other_versions.*=(?P<other_versions>[^}]*))?}}'
+    regexes =[u'{{Information[\s\r\n]*|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)|[\s\r\n]*permission.*=(?P<permission>[^}]*)|[\s\r\n]*other_versions.*=(?P<other_versions>[^}]*)}}',
+              u'{{Information[\s\r\n]*|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)|[\s\r\n]*other_versions.*=(?P<other_versions>[^}]*)}}',              
+              ]
+
-    match =re.search(regex, text, re.IGNORECASE|re.DOTALL)
-    if match:
-        description = convertLinks(match.group(u'description').strip(), imagepage.site())
-        date = match.group(u'date').strip()
-        source = getSource(imagepage, source=convertLinks(match.group(u'source').strip(), imagepage.site()))
-        author = convertLinks(match.group(u'author').strip(), imagepage.site())
-        if match.group(u'permission'):
-            print u'permission'
-            permission = convertLinks(match.group(u'permission').strip(), imagepage.site())
-        if match.group(u'other_versions'):
-            print u'other_versions'
-            other_versions = convertLinks(match.group(u'other_versions').strip(), imagepage.site())       
+    for regex in regexes:
+        match =re.search(regex, text, re.IGNORECASE|re.DOTALL)
+        if match:
+            description = convertLinks(match.group(u'description').strip(), imagepage.site())
+            date = match.group(u'date').strip()
+            source = getSource(imagepage, source=convertLinks(match.group(u'source').strip(), imagepage.site()))
+            author = convertLinks(match.group(u'author').strip(), imagepage.site())
+            if u'permission' in match.groupdict():
+                permission = convertLinks(match.group(u'permission').strip(), imagepage.site())
+            if  u'other_versions' in match.groupdict():
+                other_versions = convertLinks(match.group(u'other_versions').strip(), imagepage.site())
+            # Return the stuff we found
+            return (description, date, source, author)
+    
+    #We didn't find anything, return the empty strings
     return (description, date, source, author)
def getNewFieldsFromFreetext(imagepage):
@@ -417,7 +454,7 @@
         imagepage = wikipedia.ImagePage(page.site(), page.title())
#First do autoskip.
-        if doiskip(imagepage.get()):
+        if doiskip(imagepage):
             wikipedia.output("Skipping " + page.title())
             skip = True
         else:

    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

[Pywikipedia-svn] SVN: [8283] trunk/pywikipedia/imagecopy_enwp.py