Revision: 4092 Author: cosoleto Date: 2007-08-23 08:30:49 +0000 (Thu, 23 Aug 2007)
Log Message: ----------- economize_query() to skip sequence of numbers or wide comma separated lists.
Modified Paths: -------------- trunk/pywikipedia/config.py trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py =================================================================== --- trunk/pywikipedia/config.py 2007-08-22 21:52:43 UTC (rev 4091) +++ trunk/pywikipedia/config.py 2007-08-23 08:30:49 UTC (rev 4092) @@ -344,6 +344,14 @@ # Append length of URL to script result copyright_show_length = True
+# By default the script tries to identify and skip text that contains a wide +# comma separated list or only numbers. But sometimes that might be the +# only unmodified part of a slightly edited and not otherwise reported +# copyright violation. You can disable this feature to try to increase +# accuracy. + +copyright_economize_query = True + ############## FURTHER SETTINGS ##############
# The bot can make some additional changes to each page it edits, e.g. fix
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-08-22 21:52:43 UTC (rev 4091) +++ trunk/pywikipedia/copyright.py 2007-08-23 08:30:49 UTC (rev 4092) @@ -75,10 +75,15 @@
__version__='$Id$'
-# Try to skip quoted text +# Try to skip quoted text. exclude_quote = True
-# No checks if the page is a disambiguation page +# If the number of ', ' separators per 100 characters of query text is greater +# than or equal to 'comma_ratio' then the script identifies a comma separated +# list and doesn't send data to the search engine. +comma_ratio = 5 + +# No checks if the page is a disambiguation page. skip_disambig = True
appdir = "copyright/" @@ -323,6 +328,27 @@ f.close()
#
+# Ignore text that contains a comma separated list, only numbers,
+# punctuation...
+
+def economize_query(text):
+    """Return True if text should be skipped (not sent to a search engine)."""
+    # Comma separated list: more than four ', ' separators and at least
+    # 'comma_ratio' of them per 100 characters of text.
+    c = text.count(', ')
+    if c > 4:
+        l = len(text)
+        r = 100 * c / l
+        if r >= comma_ratio:
+            return True
+        # write_log("%d/%d/%d: %s\n" % (l,c,r,text), "copyright/skip" + str(r) + ".txt")
+
+    # Numbers: keep the text only if it has a run of five characters that
+    if re.search(r"[^0-9'*/,. +?:;-]{5}", text):  # are not digits/punctuation
+        return False
+    return True
+
+#
 # Set regex used in cleanwikicode() to remove [[Image:]] tags
 # and regex used in check_in_source() to reject pages with
 # 'Wikipedia'.
@@ -442,6 +468,11 @@
             line = cleanwikicode(line)
         for search_words in mysplit(line, 31, " "):
             if len(search_words) > 120:
+                if config.copyright_economize_query:
+                    if economize_query(search_words):
+                        wikipedia.output('SKIP TEXT: ' + search_words)
+                        consecutive = False
+                        continue
                 n_query += 1
                 #wikipedia.output(search_words)
                 if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page: