Revision: 4087 Author: cosoleto Date: 2007-08-22 14:40:30 +0000 (Wed, 22 Aug 2007)
Log Message: ----------- code cleanup, improvements in cleanwikicode()
Modified Paths: -------------- trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-08-22 08:50:02 UTC (rev 4086) +++ trunk/pywikipedia/copyright.py 2007-08-22 14:40:30 UTC (rev 4087) @@ -71,7 +71,6 @@
__version__='$Id$'
-#search_in_msn = False exclude_quote = True
appdir = "copyright/" @@ -250,12 +249,12 @@ print 'Getting page failed' return
-def check_list(text, cl, debug=False): +def check_list(text, cl, verbose = False): for entry in cl: if entry: if text.find(entry) != -1: #print entry - if debug: + if verbose: print 'SKIP URL ' + text return True
@@ -299,9 +298,11 @@ text = u"" f = codecs.open(filename, 'r','utf-8') text = f.read() + f.close() + if cut_comment: text = re.sub(" ?#.*", "", text) - f.close() + return text
def write_log(text, filename = output_file): @@ -329,10 +330,10 @@ def cleanwikicode(text): if not text: return "" - #write_log(text+'\n', "debug_cleanwikicode1.txt") - text = re.sub('(?i)<p.*?>' ,'', text) - text = re.sub('(?i)</?div.*?>' ,'', text) - text = re.sub("(?i)</*small>", "", text) + + #write_log(text+'\n', "copyright/debug_cleanwikicode1.txt") + + text = re.sub('(?i)</?(p|u|i|b|em|div|span|font|small|big|code|tt).*?>', '', text) text = re.sub('(?i)<(/\s*)?br(\s*/)?>', '', text) text = re.sub('<!--.*?-->', '', text) text = re.sub('&lt;', '<', text) @@ -345,34 +346,38 @@ text = re.sub('^[:*]?\s*[«][^»]+[»]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text) text = re.sub('^[:*]?\s*[“][^”]+[”]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
- # exclude <ref> notes - text = re.sub ("<ref.*?>.*?</ref>", "", text) - # exclude wikitable - text = re.sub('(?m)^[ \t]*({\||[|!]).*', "", text) # remove URL text = re.sub('https?://[\w/.,;:@&=%#\?_!~*\'|()"+-]+', ' ', text) + # remove Image tags text = reImageC.sub("", text) + # replace piped wikilink text = re.sub("\[\[[^\]]*?\|(.*?)\]\]", "\\1", text) + # remove unicode and polytonic template text = re.sub("(?i){{(unicode|polytonic)\|(.*?)}}", "\\1", text) - # remove <nowiki> tags - text = re.sub("</*nowiki>", "", text) - # remove template - text = re.sub('{{.*?}}', '', text) - # remove LaTeX staff - text = re.sub('<math>.*?</math>', '', text) - #text = text.replace("''", "") - text = text.replace("[", "") - text = text.replace("]", "") - text = re.sub('^[*:;]', '', text)
- text = text.replace("<!--", "") - text = text.replace("-->", "") + text = re.sub(""" + (?xim) + ( + <ref.*?>.*?</ref> | # exclude <ref> notes + ^[\ \t]*({\||[|!]).* | # exclude wikitable + </*nowiki> | # remove <nowiki> tags + {{.*?}} | # remove template + <math>.*?</math> | # remove LaTeX staff + [\[\]] | # remove [, ] + ^[*:;]+ | # remove *, :, ; in begin of line + <!-- | + --> | + ) + """, "", text)
+ # remove useless spaces + text = re.sub("(?m)(^[ \t]+|[ \t]+$)", "", text) + #if text: - # write_log(text+'\n', "debug_cleanwikicode2.txt") + # write_log(text+'\n', "copyright/debug_cleanwikicode2.txt") return text
excl_list = exclusion_list() @@ -522,7 +527,7 @@ if config.copyright_google: import google google.LICENSE_KEY = config.google_key - print " google query..." + print " Google query..." search_request_retry = config.copyright_connection_tries while search_request_retry: try: @@ -543,7 +548,7 @@ search_request_retry -= 1 if config.copyright_yahoo: import yahoo.search.web - print " yahoo query..." + print " Yahoo query..." data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"' + query.encode('utf_8') + '" -Wikipedia', results=numresults) @@ -588,7 +593,7 @@
offset = 0 for i in range(len(url)): - if check_list(url[i + offset][0], excl_list, debug = True): + if check_list(url[i + offset][0], excl_list, verbose = True): url.pop(i + offset) offset += -1 return url