Revision: 4126
Author: cosoleto
Date: 2007-08-29 06:31:34 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Fixed a bug in check_in_source() related to PDF files. Added support for excluding web pages that contain the '[edit]' tag. Restored the -repeat parameter and added -text so users can supply text to check from the command line.
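
A rough illustration of the new options (the invocations below are assumptions pieced together from the argument handling in copyright.py, not taken from the commit itself; note that -repeat has to precede -new, since arguments are processed in order):

    python copyright.py -repeat -new:60
    python copyright.py -text:"Some paragraph to check for copyright violations"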
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-28 19:56:41 UTC (rev 4125)
+++ trunk/pywikipedia/config.py 2007-08-29 06:31:34 UTC (rev 4126)
@@ -317,6 +317,13 @@
copyright_check_in_source_yahoo = False
copyright_check_in_source_msn = False
+# Web pages may contain Wikipedia text without the word 'Wikipedia' itself but
+# with the typical '[edit]' section links left over from a copy & paste. You may
+# prefer not to have these URLs reported, even though they are copyright
+# violations. When this option is enabled, such URLs are still logged to a file.
+
+copyright_check_in_source_section_names = False
+
# Limit number of queries for page.
copyright_max_query_for_page = 25
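
A minimal sketch of how the new flag might be enabled, assuming the usual pywikipedia convention of overriding config.py defaults in user-config.py:

    # user-config.py (assumed location): exclude pages whose only Wikipedia
    # fingerprint is an '[edit]' section link; such URLs are still logged to a file.
    copyright_check_in_source_section_names = True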
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-28 19:56:41 UTC (rev 4125)
+++ trunk/pywikipedia/copyright.py 2007-08-29 06:31:34 UTC (rev 4126)
@@ -195,6 +195,14 @@
'zh-yue': u'維基百科',
}
+editsection_names = {
+ 'en': u'\[edit\]',
+ 'fr': u'\[modifier\]',
+ 'de': u'\[Bearbeiten\]',
+ 'es,pt': u'\[editar\]',
+ 'it': u'\[modifica\]',
+}
+
sections_to_skip = {
'en':['References', 'Further reading', 'Citations', 'External links'],
'it':['Bibliografia', 'Riferimenti bibliografici', 'Collegamenti esterni', 'Pubblicazioni principali'],
@@ -364,6 +372,7 @@
reImageC = re.compile('\[\[' + join_family_data('Image', 6) + ':.*?\]\]', re.I)
reWikipediaC = re.compile('(' + '|'.join(wikipedia_names.values()) + ')', re.I)
+reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')')
def cleanwikicode(text):
if not text:
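
A standalone sketch (not part of the bot, sample text invented) of how the compiled section-name pattern is expected to behave on text copied from a rendered Wikipedia page:

    # -*- coding: utf-8 -*-
    import re

    editsection_names = {
        'en': u'\[edit\]',
        'it': u'\[modifica\]',
    }
    # Same construction as reSectionNamesC above: alternation over all localized tags.
    reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')')

    sample = u"History [edit]\nText pasted verbatim from a Wikipedia article..."
    m = reSectionNamesC.search(sample)
    if m:
        print "matched:", m.group()    # prints: matched: [edit]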
@@ -545,9 +554,11 @@
if err.code >= 400:
raise NoWebPage
return None
- #except urllib2.URLError:
+ except urllib2.URLError, arg:
+ print "URL error: %s / %s" % (url, arg)
+ return None
except Exception, err:
- print "ERROR: %s" % (err)
+ print "ERROR: %s" % (err)
self._lastmodified = self._urldata.info().getdate('Last-Modified')
self._length = self._urldata.info().getheader('Content-Length')
@@ -580,16 +591,26 @@
# Make sure we did try to get the contents once
if not hasattr(self, '_contents'):
self._contents = self._urldata.read()
- return self._contents
- return None
+ return self._contents
+
+ def check_regexp(self, reC, text, filename = None):
+ m = reC.search(text)
+ if m:
+ global excl_list, positive_source_seen
+ excl_list += [self._url]
+ positive_source_seen.add(self._url)
+ if filename:
+ write_log("%s (%s)\n" % (self._url, m.group()), filename)
+ return True
+
def check_in_source(self):
"""
Sources may differ from the search engine database and include mentions of
Wikipedia. This function also avoids errors in search results that can occur
with both the Google and Yahoo! services.
"""
- global excl_list, source_seen, positive_source_seen
+ global source_seen
if not hasattr(self, '_urldata'):
return False
@@ -600,7 +621,10 @@
if self._url in source_seen:
return False
- text = self.get()
+ try:
+ text = self.get()
+ except URL_exclusion:
+ return False
# Character encoding conversion if 'Content-Type' field has
# charset attribute set to UTF-8.
@@ -613,14 +637,13 @@
if 'text/html' in self._content_type and (re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>", text) or re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>", text)):
text = text.decode("utf-8", 'replace')
- m = reWikipediaC.search(text)
- if m:
- excl_list += [self._url]
- write_log("%s (%s)\n" % (self._url, m.group()), "copyright/sites_with_'wikipedia'.txt")
- positive_source_seen.add(self._url)
+ if config.copyright_check_in_source_section_names:
+ if self.check_regexp(reSectionNamesC, text, "copyright/sites_with_'[edit]'.txt"):
+ return True
+
+ if self.check_regexp(reWikipediaC, text, "copyright/sites_with_'wikipedia'.txt"):
return True
- else:
- write_log(self._url + '\n', "copyright/sites_without_'wikipedia'.txt")
+
source_seen.add(self._url)
return False
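
In effect, the match-log-and-exclude logic that previously sat inline in check_in_source() is now centralized in check_regexp(): it is tried first with reSectionNamesC (only when config.copyright_check_in_source_section_names is enabled, logging hits to copyright/sites_with_'[edit]'.txt) and then with reWikipediaC (logging hits to copyright/sites_with_'wikipedia'.txt); in both cases the URL is added to excl_list and positive_source_seen, so the page is skipped on later checks.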
@@ -862,7 +885,9 @@
# default to [] which means all namespaces will be processed
namespaces = []
#
- #repeat = False
+ repeat = False
+ #
+ text = None
firstPageTitle = None
# This factory is responsible for processing command line arguments
@@ -873,12 +898,10 @@
config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
- config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
+ config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
# Read commandline parameters.
for arg in wikipedia.handleArgs():
- #if arg.startswith('-repeat'):
- # repeat = True
if arg == '-y':
config.copyright_yahoo = True
elif arg == '-g':
@@ -900,6 +923,9 @@
elif arg.startswith('-skipquery'):
if len(arg) >= 11:
config.copyright_skip_query = int(arg[11:])
+ elif arg.startswith('-text'):
+ if len(arg) >= 6:
+ text = arg[6:]
elif arg.startswith('-xml'):
if len(arg) == 4:
xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
@@ -914,6 +940,13 @@
namespaces.append(int(arg[11:]))
elif arg.startswith('-forceupdate'):
load_pages(force_update = True)
+ elif arg == '-repeat':
+ repeat = True
+ elif arg.startswith('-new'):
+ if len(arg) >=5:
+ gen = pagegenerators.NewpagesPageGenerator(number=int(arg[5:]), repeat = repeat)
+ else:
+ gen = pagegenerators.NewpagesPageGenerator(number=60, repeat = repeat)
else:
generator = genFactory.handleArg(arg)
if generator:
@@ -926,9 +959,15 @@
if ids:
checks_by_ids(ids)
- if not gen and not ids:
+ if not gen and not ids and not text:
# syntax error, show help text from the top of this file
wikipedia.output(__doc__, 'utf-8')
+
+ if text:
+ output = query(lines = text.splitlines())
+ if output:
+ wikipedia.output(output)
+
if not gen:
wikipedia.stopme()
sys.exit()