Revision: 4087 Author: cosoleto Date: 2007-08-22 14:40:30 +0000 (Wed, 22 Aug 2007)
Log Message: ----------- code cleanup, improvements in cleanwikicode()
Modified Paths: -------------- trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-08-22 08:50:02 UTC (rev 4086) +++ trunk/pywikipedia/copyright.py 2007-08-22 14:40:30 UTC (rev 4087) @@ -71,7 +71,6 @@
__version__='$Id$'
-#search_in_msn = False exclude_quote = True
appdir = "copyright/" @@ -250,12 +249,12 @@ print 'Getting page failed' return
-def check_list(text, cl, debug=False): +def check_list(text, cl, verbose = False): for entry in cl: if entry: if text.find(entry) != -1: #print entry - if debug: + if verbose: print 'SKIP URL ' + text return True
@@ -299,9 +298,11 @@ text = u"" f = codecs.open(filename, 'r','utf-8') text = f.read() + f.close() + if cut_comment: text = re.sub(" ?#.*", "", text) - f.close() + return text
def write_log(text, filename = output_file): @@ -329,10 +330,10 @@ def cleanwikicode(text): if not text: return "" - #write_log(text+'\n', "debug_cleanwikicode1.txt") - text = re.sub('(?i)<p.*?>' ,'', text) - text = re.sub('(?i)</?div.*?>' ,'', text) - text = re.sub("(?i)</*small>", "", text) + + #write_log(text+'\n', "copyright/debug_cleanwikicode1.txt") + + text = re.sub('(?i)</?(p|u|i|b|em|div|span|font|small|big|code|tt).*?>', '', text) text = re.sub('(?i)<(/\s*)?br(\s*/)?>', '', text) text = re.sub('<!--.*?-->', '', text) text = re.sub('&lt;', '<', text) @@ -345,34 +346,38 @@ text = re.sub('^[:*]?\s*[«][^»]+[»]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text) text = re.sub('^[:*]?\s*[“][^”]+[”]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
- # exclude <ref> notes - text = re.sub ("<ref.*?>.*?</ref>", "", text) - # exclude wikitable - text = re.sub('(?m)^[ \t]*({\||[|!]).*', "", text) # remove URL text = re.sub('https?://[\w/.,;:@&=%#\?_!~*\'|()"+-]+', ' ', text) + # remove Image tags text = reImageC.sub("", text) + # replace piped wikilink text = re.sub("\[\[[^\]]*?\|(.*?)\]\]", "\\1", text) + # remove unicode and polytonic template text = re.sub("(?i){{(unicode|polytonic)\|(.*?)}}", "\\1", text) - # remove <nowiki> tags - text = re.sub("</*nowiki>", "", text) - # remove template - text = re.sub('{{.*?}}', '', text) - # remove LaTeX staff - text = re.sub('<math>.*?</math>', '', text) - #text = text.replace("''", "") - text = text.replace("[", "") - text = text.replace("]", "") - text = re.sub('^[*:;]', '', text)
- text = text.replace("<!--", "") - text = text.replace("-->", "") + text = re.sub(""" + (?xim) + ( + <ref.*?>.*?</ref> | # exclude <ref> notes + ^[\ \t]*({\||[|!]).* | # exclude wikitable + </*nowiki> | # remove <nowiki> tags + {{.*?}} | # remove template + <math>.*?</math> | # remove LaTeX staff + [\[\]] | # remove [, ] + ^[*:;]+ | # remove *, :, ; in begin of line + <!-- | + --> | + ) + """, "", text)
+ # remove useless spaces + text = re.sub("(?m)(^[ \t]+|[ \t]+$)", "", text) + #if text: - # write_log(text+'\n', "debug_cleanwikicode2.txt") + # write_log(text+'\n', "copyright/debug_cleanwikicode2.txt") return text
excl_list = exclusion_list() @@ -522,7 +527,7 @@ if config.copyright_google: import google google.LICENSE_KEY = config.google_key - print " google query..." + print " Google query..." search_request_retry = config.copyright_connection_tries while search_request_retry: try: @@ -543,7 +548,7 @@ search_request_retry -= 1 if config.copyright_yahoo: import yahoo.search.web - print " yahoo query..." + print " Yahoo query..." data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"' + query.encode('utf_8') + '" -Wikipedia', results=numresults) @@ -588,7 +593,7 @@
offset = 0 for i in range(len(url)): - if check_list(url[i + offset][0], excl_list, debug = True): + if check_list(url[i + offset][0], excl_list, verbose = True): url.pop(i + offset) offset += -1 return url