[Pywikipedia-l] SVN: [4092] trunk/pywikipedia - pywikibot

23 Aug 2007

Revision: 4092
Author:   cosoleto
Date:     2007-08-23 08:30:49 +0000 (Thu, 23 Aug 2007)

Log Message:
-----------
economize_query() to skip sequence of numbers or wide comma separated lists.

Modified Paths:
--------------
    trunk/pywikipedia/config.py
    trunk/pywikipedia/copyright.py

Modified: trunk/pywikipedia/config.py
===================================================================

--- trunk/pywikipedia/config.py	2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/config.py	2007-08-23 08:30:49 UTC (rev 4092)
@@ -344,6 +344,14 @@
 # Append length of URL to script result
 copyright_show_length = True
 
+# By default the script tries to identify and skip text that contains a wide
+# comma separated list or only numbers. But sometimes that might be the
+# only unmodified part of a slightly edited and not otherwise reported
+# copyright violation. You can disable this feature to try to increase
+# accuracy.
+
+copyright_economize_query = True
+
 ############## FURTHER SETTINGS ##############
 
 # The bot can make some additional changes to each page it edits, e.g. fix

Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py	2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/copyright.py	2007-08-23 08:30:49 UTC (rev 4092)
@@ -75,10 +75,15 @@
 
 __version__='$Id$'
 
-# Try to skip quoted text
+# Try to skip quoted text.
 exclude_quote = True
 
-# No checks if the page is a disambiguation page
+# If the number of ', ' separators per 100 characters of query text is
+# greater than or equal to 'comma_ratio', the script treats the text as a
+# comma separated list and doesn't send it to the search engine.
+comma_ratio = 5
+
+# No checks if the page is a disambiguation page.
 skip_disambig = True
 
 appdir = "copyright/"
@@ -323,6 +328,27 @@
     f.close()
 
 #
+# Ignore text that contains a comma separated list, only numbers,
+# punctuation...
+
+def economize_query(text):
+    # Comma separated list
+    if text.count(', ') > 4:
+        l = len(text)
+        c = text.count(', ')
+        r = 100 * c / l
+
+        if r >= comma_ratio:
+            return True
+
+        # write_log("%d/%d/%d: %s\n" % (l,c,r,text), "copyright/skip" + str(r) + ".txt")
+
+    # Numbers
+    if re.search('[^0-9\'*/,. +?:;-]{5}', text):
+        return False
+    return True
+
+#
 # Set regex used in cleanwikicode() to remove [[Image:]] tags
 # and regex used in check_in_source() to reject pages with
 # 'Wikipedia'.
@@ -442,6 +468,11 @@
         line = cleanwikicode(line)
         for search_words in mysplit(line, 31, " "):
             if len(search_words) > 120:
+                if config.copyright_economize_query:
+                    if economize_query(search_words):
+                        wikipedia.output('SKIP TEXT: ' + search_words)
+                        consecutive = False
+                        continue
                 n_query += 1
                 #wikipedia.output(search_words)
                 if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page: