Revision: 4092
Author: cosoleto
Date: 2007-08-23 08:30:49 +0000 (Thu, 23 Aug 2007)
Log Message:
-----------
economize_query() to skip sequence of numbers or wide comma separated lists.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/config.py 2007-08-23 08:30:49 UTC (rev 4092)
@@ -344,6 +344,14 @@
# Append length of URL to script result
copyright_show_length = True
+# By default the script tries to identify and skip text that contains a long
+# comma-separated list or only numbers. But sometimes that might be the
+# only unmodified part of a slightly edited and not otherwise reported
+# copyright violation. You can disable this feature to try to increase
+# accuracy.
+
+copyright_economize_query = True
+
############## FURTHER SETTINGS ##############
# The bot can make some additional changes to each page it edits, e.g. fix
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/copyright.py 2007-08-23 08:30:49 UTC (rev 4092)
@@ -75,10 +75,15 @@
__version__='$Id$'
-# Try to skip quoted text
+# Try to skip quoted text.
exclude_quote = True
-# No checks if the page is a disambiguation page
+# If the number of ', ' separators per 100 characters of query text is
+# greater than or equal to 'comma_ratio', the script treats the text as a
+# comma-separated list and doesn't send it to the search engine.
+comma_ratio = 5
+
+# No checks if the page is a disambiguation page.
skip_disambig = True
appdir = "copyright/"
@@ -323,6 +328,27 @@
f.close()
#
+# Ignore text that contains a comma-separated list, only numbers,
+# punctuation...
+
+def economize_query(text):
+    """Return True if 'text' should be skipped instead of being queried.
+
+    The text is skipped in two cases:
+
+    * it looks like a comma separated list: it has more than four ', '
+      separators and 100 * separators / len(text) is greater than or
+      equal to the module-level 'comma_ratio';
+    * it contains no run of five consecutive characters other than
+      digits, spaces and the punctuation listed in the regex below
+      (i.e. it is made only of numbers and punctuation).
+
+    In every other case False is returned.
+    """
+    # Comma separated list
+    c = text.count(', ')
+    if c > 4:
+        l = len(text)
+        r = 100 * c / l
+
+        if r >= comma_ratio:
+            return True
+
+        # Disabled debug log of the lists that were not skipped:
+        # write_log("%d/%d/%d: %s\n" % (l,c,r,text),
+        #           "copyright/skip" + str(r) + ".txt")
+
+    # Numbers: five consecutive "ordinary" characters mean real text.
+    if re.search('[^0-9\'*/,. +?:;-]{5}', text):
+        return False
+    return True
+
+#
# Set regex used in cleanwikicode() to remove [[Image:]] tags
# and regex used in check_in_source() to reject pages with
# 'Wikipedia'.
@@ -442,6 +468,11 @@
line = cleanwikicode(line)
for search_words in mysplit(line, 31, " "):
if len(search_words) > 120:
+ if config.copyright_economize_query:
+ if economize_query(search_words):
+ wikipedia.output('SKIP TEXT: ' + search_words)
+ consecutive = False
+ continue
n_query += 1
#wikipedia.output(search_words)
if config.copyright_max_query_for_page and n_query >
config.copyright_max_query_for_page:
Show replies by date