Revision: 4130
Author: btongminh
Date: 2007-08-29 15:14:32 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Reverting wrong commit.
Modified Paths:
--------------
trunk/pywikipedia/checkusage.py
trunk/pywikipedia/delinker.py
trunk/pywikipedia/image_replacer.py
Modified: trunk/pywikipedia/checkusage.py
===================================================================
--- trunk/pywikipedia/checkusage.py 2007-08-29 15:09:47 UTC (rev 4129)
+++ trunk/pywikipedia/checkusage.py 2007-08-29 15:14:32 UTC (rev 4130)
@@ -3,7 +3,7 @@
"""
This module provides a way for users of the Wikimedia toolserver to check the
use of images from Commons on other Wikimedia wikis. It supports both running
-checkusage against the database and but also against the live wikis. It is very
+checkusage against the database and against the live wikis. It is very
efficient as it only creates one HTTP connection and one MySQL connection
during its life time. It is not suitable for multithreading!
@@ -12,6 +12,23 @@
to connect to the MySQL database. The top wikis in size will be checked. The
class provides multiple methods:
+get_usage(image)
+This method will return a generator object that generates the usage of the
+image, returned as the following tuple: (page_namespace, page_title,
+full_title). page_namespace is the numeric namespace, page_title the page title
+without namespace, full_title the page title including localized namespace.
+
+get_usage_db(dbname, image), get_usage_live(domain, image)
+Those methods allow querying a specific wiki, respectively against the database
+and against the live wiki. They accept respectively the database name and the
+domain name. The return a generator which generates the same results as
+get_usage().
+
+get_usage_multi(images)
+Calls get_usage for each image and returns a dictionary with usages.
+
+get_replag(dbname)
+Returns the time in seconds since the latest known edit of dbname.
"""
#
# (C) Bryan Tong Minh, 2007
Modified: trunk/pywikipedia/delinker.py
===================================================================
--- trunk/pywikipedia/delinker.py 2007-08-29 15:09:47 UTC (rev 4129)
+++ trunk/pywikipedia/delinker.py 2007-08-29 15:14:32 UTC (rev 4130)
@@ -2,11 +2,12 @@
# -*- coding: utf-8 -*-
"""
This script keeps track of image deletions and delinks removed files
-from (any) wiki. Usage on protected pages or pages containing blacklisted
-external links cannot be processed.
+from (any) wiki. Usage
+on protected pages or pages containing blacklisted external links cannot
+be processed.
This script is run by [[commons:User:Siebrand]] on the toolserver. It should
-not be run on Commons by other users without prior contact.
+not be run by other users without prior contact.
Although the classes are called CommonsDelinker and Delinker, it is in fact
a general delinker/replacer, also suitable for local use.
@@ -28,7 +29,6 @@
# TODO:
# * Don't replace within <nowiki /> tags
# * Make as many config settings site dependend
-# * Implement sqlite3 mode
# BUGS:
# * There is a problem with images in the es.wikisource project namespace.
# The exact problem is described somewhere in Bryan's IRC logs, but it is
@@ -69,10 +69,7 @@
kwargs['callback'] = wait_callback
return mysql_autoconnection.connect(**kwargs)
- elif engine == 'sqlite3':
- import sqlite3
- return sqlite3.connect(**kwargs)
-
+ # TODO: Add support for sqlite3
raise RuntimeError('Unsupported database engine %s' % engine)
class Delinker(threadpool.Thread):
@@ -309,10 +306,6 @@
self.CommonsDelinker.config['summary_cache']:
# Return cached result
return self.summaries[type][key][0]
- else:
- self.summaries[type][key] = \
- (self.CommonsDelinker.config['default_settings'].get(type, ''),
- time.time())
output(u'%s Fetching new summary for %s' % (self, site))
@@ -343,10 +336,9 @@
'wikisource', 'wikinews', 'wikiversity'):
newsite = self.CommonsDelinker.get_site(site.lang,
wikipedia.Family('wikipedia'))
- return self.get(newsite, type, key)
+ return self.get(newsite, type, key = key)
+ return self.CommonsDelinker.config['default_settings'].get(type, '')
- return self.get(site, type, key)
-
def check_user_page(self, site):
"Check whether a userpage exists. Only used for CommonsDelinker."
try:
@@ -361,8 +353,7 @@
ftxt = f.read()
f.close()
if not '#' + str(site) in ftxt:
- # BUG: Username does not exist
- username = config.usernames[site.family.name][site.lang]
+ username = config.usernames[site.family.name][site.lang]
userpage = wikipedia.Page(site, 'User:' + username)
# Removed check for page existence. If it is not in our
@@ -409,6 +400,10 @@
""" Check whether this image needs to be delinked. """
# Check whether the image still is deleted on Commons.
+ # BUG: This also returns true for images with a page, but
+ # without the image itself. Can be fixed by querying query.php
+ # instead of api.php. Also should this be made as an exits()
+ # method of checkusage.CheckUsage?
if self.site.shared_image_repository() != (None, None):
shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
try:
@@ -533,9 +528,6 @@
self.exit()
self.CommonsDelinker.thread_died()
- def format_query(self, query):
-
-
class CommonsDelinker(object):
def __init__(self):
self.config = config.CommonsDelinker
Modified: trunk/pywikipedia/image_replacer.py
===================================================================
--- trunk/pywikipedia/image_replacer.py 2007-08-29 15:09:47 UTC (rev 4129)
+++ trunk/pywikipedia/image_replacer.py 2007-08-29 15:14:32 UTC (rev 4130)
@@ -241,6 +241,4 @@
if type(e) not in (SystemExit, KeyboardInterrupt):
output('A critical error has occured! Aborting!')
print >>sys.stderr, cgitb.text(sys.exc_info())
- r.reporters.exit()
- output(u'Stopping ImageReplacer')
wikipedia.stopme()
\ No newline at end of file
Revision: 4129
Author: btongminh
Date: 2007-08-29 15:09:47 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Fix bad title bug.
Modified Paths:
--------------
trunk/pywikipedia/checkusage.py
trunk/pywikipedia/delinker.py
trunk/pywikipedia/image_replacer.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/checkusage.py
===================================================================
--- trunk/pywikipedia/checkusage.py 2007-08-29 13:06:56 UTC (rev 4128)
+++ trunk/pywikipedia/checkusage.py 2007-08-29 15:09:47 UTC (rev 4129)
@@ -3,7 +3,7 @@
"""
This module provides a way for users of the Wikimedia toolserver to check the
use of images from Commons on other Wikimedia wikis. It supports both running
-checkusage against the database and against the live wikis. It is very
+checkusage against the database and but also against the live wikis. It is very
efficient as it only creates one HTTP connection and one MySQL connection
during its life time. It is not suitable for multithreading!
@@ -12,23 +12,6 @@
to connect to the MySQL database. The top wikis in size will be checked. The
class provides multiple methods:
-get_usage(image)
-This method will return a generator object that generates the usage of the
-image, returned as the following tuple: (page_namespace, page_title,
-full_title). page_namespace is the numeric namespace, page_title the page title
-without namespace, full_title the page title including localized namespace.
-
-get_usage_db(dbname, image), get_usage_live(domain, image)
-Those methods allow querying a specific wiki, respectively against the database
-and against the live wiki. They accept respectively the database name and the
-domain name. The return a generator which generates the same results as
-get_usage().
-
-get_usage_multi(images)
-Calls get_usage for each image and returns a dictionary with usages.
-
-get_replag(dbname)
-Returns the time in seconds since the latest known edit of dbname.
"""
#
# (C) Bryan Tong Minh, 2007
Modified: trunk/pywikipedia/delinker.py
===================================================================
--- trunk/pywikipedia/delinker.py 2007-08-29 13:06:56 UTC (rev 4128)
+++ trunk/pywikipedia/delinker.py 2007-08-29 15:09:47 UTC (rev 4129)
@@ -2,12 +2,11 @@
# -*- coding: utf-8 -*-
"""
This script keeps track of image deletions and delinks removed files
-from (any) wiki. Usage
-on protected pages or pages containing blacklisted external links cannot
-be processed.
+from (any) wiki. Usage on protected pages or pages containing blacklisted
+external links cannot be processed.
This script is run by [[commons:User:Siebrand]] on the toolserver. It should
-not be run by other users without prior contact.
+not be run on Commons by other users without prior contact.
Although the classes are called CommonsDelinker and Delinker, it is in fact
a general delinker/replacer, also suitable for local use.
@@ -29,6 +28,7 @@
# TODO:
# * Don't replace within <nowiki /> tags
# * Make as many config settings site dependend
+# * Implement sqlite3 mode
# BUGS:
# * There is a problem with images in the es.wikisource project namespace.
# The exact problem is described somewhere in Bryan's IRC logs, but it is
@@ -69,7 +69,10 @@
kwargs['callback'] = wait_callback
return mysql_autoconnection.connect(**kwargs)
- # TODO: Add support for sqlite3
+ elif engine == 'sqlite3':
+ import sqlite3
+ return sqlite3.connect(**kwargs)
+
raise RuntimeError('Unsupported database engine %s' % engine)
class Delinker(threadpool.Thread):
@@ -306,6 +309,10 @@
self.CommonsDelinker.config['summary_cache']:
# Return cached result
return self.summaries[type][key][0]
+ else:
+ self.summaries[type][key] = \
+ (self.CommonsDelinker.config['default_settings'].get(type, ''),
+ time.time())
output(u'%s Fetching new summary for %s' % (self, site))
@@ -336,9 +343,10 @@
'wikisource', 'wikinews', 'wikiversity'):
newsite = self.CommonsDelinker.get_site(site.lang,
wikipedia.Family('wikipedia'))
- return self.get(newsite, type, key = key)
- return self.CommonsDelinker.config['default_settings'].get(type, '')
+ return self.get(newsite, type, key)
+ return self.get(site, type, key)
+
def check_user_page(self, site):
"Check whether a userpage exists. Only used for CommonsDelinker."
try:
@@ -353,7 +361,8 @@
ftxt = f.read()
f.close()
if not '#' + str(site) in ftxt:
- username = config.usernames[site.family.name][site.lang]
+ # BUG: Username does not exist
+ username = config.usernames[site.family.name][site.lang]
userpage = wikipedia.Page(site, 'User:' + username)
# Removed check for page existence. If it is not in our
@@ -400,10 +409,6 @@
""" Check whether this image needs to be delinked. """
# Check whether the image still is deleted on Commons.
- # BUG: This also returns true for images with a page, but
- # without the image itself. Can be fixed by querying query.php
- # instead of api.php. Also should this be made as an exits()
- # method of checkusage.CheckUsage?
if self.site.shared_image_repository() != (None, None):
shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
try:
@@ -528,6 +533,9 @@
self.exit()
self.CommonsDelinker.thread_died()
+ def format_query(self, query):
+
+
class CommonsDelinker(object):
def __init__(self):
self.config = config.CommonsDelinker
Modified: trunk/pywikipedia/image_replacer.py
===================================================================
--- trunk/pywikipedia/image_replacer.py 2007-08-29 13:06:56 UTC (rev 4128)
+++ trunk/pywikipedia/image_replacer.py 2007-08-29 15:09:47 UTC (rev 4129)
@@ -241,4 +241,6 @@
if type(e) not in (SystemExit, KeyboardInterrupt):
output('A critical error has occured! Aborting!')
print >>sys.stderr, cgitb.text(sys.exc_info())
+ r.reporters.exit()
+ output(u'Stopping ImageReplacer')
wikipedia.stopme()
\ No newline at end of file
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-29 13:06:56 UTC (rev 4128)
+++ trunk/pywikipedia/wikipedia.py 2007-08-29 15:09:47 UTC (rev 4129)
@@ -608,7 +608,7 @@
# older MediaWiki versions don't have the 'nocreatetitle' message.
elif self.site().has_mediawiki_message('nocreatetitle') and text.find(self.site().mediawiki_message('nocreatetitle')) != -1:
raise LockedNoPage(u'%s does not exist, and page creation is forbidden for anonymous users.' % self.aslink())
- elif text.find('var wgPageName = "Special:Badtitle";'):
+ elif text.find('var wgPageName = "Special:Badtitle";') != -1:
raise BadTitle('BadTitle: %s' % self)
else:
output( unicode(text) )
Revision: 4126
Author: cosoleto
Date: 2007-08-29 06:31:34 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Fixed a bug in check_in_source() related to PDF files. Added support for excluding web pages that contain the '[edit]' tag. Restored the -repeat parameter, and added -text so users can input text to check from the command line.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-28 19:56:41 UTC (rev 4125)
+++ trunk/pywikipedia/config.py 2007-08-29 06:31:34 UTC (rev 4126)
@@ -317,6 +317,13 @@
copyright_check_in_source_yahoo = False
copyright_check_in_source_msn = False
+# Web pages may content a Wikipedia text without 'Wikipedia' word but with
+# typical '[edit]' tag result of copy & paste procedure. You can want no
+# report for this kind of URLs, even if they are copyright violation.
+# However, when enabled these URLs are logged in a file.
+
+copyright_check_in_source_section_names = False
+
# Limit number of queries for page.
copyright_max_query_for_page = 25
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-28 19:56:41 UTC (rev 4125)
+++ trunk/pywikipedia/copyright.py 2007-08-29 06:31:34 UTC (rev 4126)
@@ -195,6 +195,14 @@
'zh-yue': u'維基百科',
}
+editsection_names = {
+ 'en': u'\[edit\]',
+ 'fr': u'\[modifier\]',
+ 'de': u'\[Bearbeiten\]',
+ 'es,pt': u'\[editar\]',
+ 'it': u'\[modifica\]',
+}
+
sections_to_skip = {
'en':['References', 'Further reading', 'Citations', 'External links'],
'it':['Bibliografia', 'Riferimenti bibliografici', 'Collegamenti esterni', 'Pubblicazioni principali'],
@@ -364,6 +372,7 @@
reImageC = re.compile('\[\[' + join_family_data('Image', 6) + ':.*?\]\]', re.I)
reWikipediaC = re.compile('(' + '|'.join(wikipedia_names.values()) + ')', re.I)
+reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')')
def cleanwikicode(text):
if not text:
@@ -545,9 +554,11 @@
if err.code >= 400:
raise NoWebPage
return None
- #except urllib2.URLError:
+ except urllib2.URLError, arg:
+ print "URL error: %s / %s" % (url, arg)
+ return None
except Exception, err:
- print "ERROR: %s" % (err)
+ print "ERROR: %s" % (err)
self._lastmodified = self._urldata.info().getdate('Last-Modified')
self._length = self._urldata.info().getheader('Content-Length')
@@ -580,16 +591,26 @@
# Make sure we did try to get the contents once
if not hasattr(self, '_contents'):
self._contents = self._urldata.read()
- return self._contents
- return None
+ return self._contents
+
+ def check_regexp(self, reC, text, filename = None):
+ m = reC.search(text)
+ if m:
+ global excl_list, positive_source_seen
+ excl_list += [self._url]
+ positive_source_seen.add(self._url)
+ if filename:
+ write_log("%s (%s)\n" % (self._url, m.group()), filename)
+ return True
+
def check_in_source(self):
"""
Sources may be different from search engine database and include mentions of
Wikipedia. This function avoid also errors in search results that can occurs
either with Google and Yahoo! service.
"""
- global excl_list, source_seen, positive_source_seen
+ global source_seen
if not hasattr(self, '_urldata'):
return False
@@ -600,7 +621,10 @@
if self._url in source_seen:
return False
- text = self.get()
+ try:
+ text = self.get()
+ except URL_exclusion:
+ return False
# Character encoding conversion if 'Content-Type' field has
# charset attribute set to UTF-8.
@@ -613,14 +637,13 @@
if 'text/html' in self._content_type and (re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>", text) or re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>", text)):
text = text.decode("utf-8", 'replace')
- m = reWikipediaC.search(text)
- if m:
- excl_list += [self._url]
- write_log("%s (%s)\n" % (self._url, m.group()), "copyright/sites_with_'wikipedia'.txt")
- positive_source_seen.add(self._url)
+ if config.copyright_check_in_source_section_names:
+ if self.check_regexp(reSectionNamesC, text, "copyright/sites_with_'[edit]'.txt"):
+ return True
+
+ if self.check_regexp(reWikipediaC, text, "copyright/sites_with_'wikipedia'.txt"):
return True
- else:
- write_log(self._url + '\n', "copyright/sites_without_'wikipedia'.txt")
+
source_seen.add(self._url)
return False
@@ -862,7 +885,9 @@
# default to [] which means all namespaces will be processed
namespaces = []
#
- #repeat = False
+ repeat = False
+ #
+ text = None
firstPageTitle = None
# This factory is responsible for processing command line arguments
@@ -873,12 +898,10 @@
config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
- config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
+ config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
# Read commandline parameters.
for arg in wikipedia.handleArgs():
- #if arg.startswith('-repeat'):
- # repeat = True
if arg == '-y':
config.copyright_yahoo = True
elif arg == '-g':
@@ -900,6 +923,9 @@
elif arg.startswith('-skipquery'):
if len(arg) >= 11:
config.copyright_skip_query = int(arg[11:])
+ elif arg.startswith('-text'):
+ if len(arg) >= 6:
+ text = arg[6:]
elif arg.startswith('-xml'):
if len(arg) == 4:
xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
@@ -914,6 +940,13 @@
namespaces.append(int(arg[11:]))
elif arg.startswith('-forceupdate'):
load_pages(force_update = True)
+ elif arg == '-repeat':
+ repeat = True
+ elif arg.startswith('-new'):
+ if len(arg) >=5:
+ gen = pagegenerators.NewpagesPageGenerator(number=int(arg[5:]), repeat = repeat)
+ else:
+ gen = pagegenerators.NewpagesPageGenerator(number=60, repeat = repeat)
else:
generator = genFactory.handleArg(arg)
if generator:
@@ -926,9 +959,15 @@
if ids:
checks_by_ids(ids)
- if not gen and not ids:
+ if not gen and not ids and not text:
# syntax error, show help text from the top of this file
wikipedia.output(__doc__, 'utf-8')
+
+ if text:
+ output = query(lines = text.splitlines())
+ if output:
+ wikipedia.output(output)
+
if not gen:
wikipedia.stopme()
sys.exit()
Bugs item #1783572, was opened at 2007-08-28 22:11
Message generated for change (Tracker Item Submitted) made by Item Submitter
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1783572&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: other
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Persian Gulf (persian_gulf)
Assigned to: Nobody/Anonymous (nobody)
Summary: Necessary translation for redirect in wikipedia_family.py
Initial Comment:
This translation should exist in wikipedia_family.py to make $redirect * double work.
The file with added translation is included.
added translation:
self.redirect{
'fa' : u'تغییرمسیر',
}
This works well after the following bug is solved:
[ pywikipediabot-Bugs-1783561 ] a regex bug on line
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1783572&group_…
Bugs item #1783491, was opened at 2007-08-28 17:46
Message generated for change (Comment added) made by btongminh
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1783491&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: interwiki
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Multichill (multichill)
Assigned to: Nobody/Anonymous (nobody)
Summary: Interwiki sysop crash
Initial Comment:
Updating links on page [[lv:Kategorija:Krievijas ezeri]].
Changes to be made: pievieno: [[da:Kategori:Søer i Rusland]]
+ [[da:Kategori:Søer i Rusland]]
NOTE: Updating live wiki...
Dump nl (wikipedia) saved
Traceback (most recent call last):
File "interwiki.py", line 1542, in ?
bot.run()
File "interwiki.py", line 1325, in run
self.queryStep()
File "interwiki.py", line 1304, in queryStep
subj.finish(self)
File "interwiki.py", line 924, in finish
if self.replaceLinks(page, new, bot):
File "interwiki.py", line 1051, in replaceLinks
status, reason, data = page.put(newtext, comment = wikipedia.translate(page.site().lang, msg)[0] + mods)
File "/home/bot/pywikipedia/wikipedia.py", line 1050, in put
if not self.botMayEdit():
File "/home/bot/pywikipedia/wikipedia.py", line 769, in botMayEdit
self.site().forceLogin(sysop=True)
File "/home/bot/pywikipedia/wikipedia.py", line 3188, in forceLogin
if not self.loggedInAs(sysop = sysop):
File "/home/bot/pywikipedia/wikipedia.py", line 3202, in loggedInAs
self._loadCookies(sysop = sysop)
File "/home/bot/pywikipedia/wikipedia.py", line 3239, in _loadCookies
raise NoUsername('You tried to perform an action that requires admin privileges, but you haven\'t entered your sysop name in your user-config.py. Please add sysopnames[\'%s\'][\'%s\']=\'name\' to your user-config.py' % (self.family.name, self.lang))
wikipedia.NoUsername: You tried to perform an action that requires admin privileges, but you haven't entered your sysop name in your user-config.py. Please add sysopnames['wikipedia']['lv']='name' to your user-config.py
----------------------------------------------------------------------
Comment By: Bryan (btongminh)
Date: 2007-08-28 20:23
Message:
Logged In: YES
user_id=1806226
Originator: NO
Fixed in r4124.
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1783491&group_…
Bugs item #1783561, was opened at 2007-08-28 21:48
Message generated for change (Tracker Item Submitted) made by Item Submitter
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1783561&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: General
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Persian Gulf (persian_gulf)
Assigned to: Nobody/Anonymous (nobody)
Summary: a regex bug on line 3819 wikipedia.py
Initial Comment:
the following line, on line 3819 of wikipedia.py:
redirKeywords = [u'redirect'] + self.family.redirect[self.lang]
should be changed to :
redirKeywords = [u'redirect'] + [self.family.redirect[self.lang]]
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1783561&group_…