Revision: 4088
Author: cosoleto
Date: 2007-08-22 18:03:28 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Skip disambiguation page, minor fixes.
Modified Paths:
--------------
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 14:40:30 UTC (rev 4087)
+++ trunk/pywikipedia/copyright.py 2007-08-22 18:03:28 UTC (rev 4088)
@@ -20,14 +20,18 @@
-ng - Do not use Google
-y - Use Yahoo! search engine
-ny - Do not use Yahoo!
+-l - Use Windows Live Search engine
+-nl - Do not use Windows Live Search
-maxquery - Stop after a specified number of queries for page (default: 25)
-skipquery - Skip a number specified of queries
--new -
-output - Append results to a specified file (default:
'copyright/output.txt')
+
-file - Work on all pages given in a local text file.
Will read any [[wiki link]] and use these articles.
Argument can also be given as "-file:filename".
+-new - Work on the 60 newest pages. If given as -new:x, will work
+ on the x newest pages.
-cat - Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname".
-subcat - When the pages to work on have been chosen by -cat, pages in
@@ -66,13 +70,17 @@
#
from __future__ import generators
-import sys, re, codecs, os, time
+import sys, re, codecs, os, time, urllib2, httplib
import wikipedia, pagegenerators, catlib, config
__version__='$Id$'
+# Try to skip quoted text
exclude_quote = True
+# No checks if the page is a disambiguation page
+skip_disambig = True
+
appdir = "copyright/"
output_file = appdir + "output.txt"
@@ -265,7 +273,7 @@
for page, path in exclusion_file_list():
if 'exclusion_list.txt' in path:
- result_list += re.sub("</?pre>","", read_file(path, cut_comment = True)).splitlines()
+ result_list += re.sub("</?pre>","", read_file(path, cut_comment = True, cut_newlines = True)).splitlines()
else:
data = read_file(path)
# wikipedia:en:Wikipedia:Mirrors and forks
@@ -291,11 +299,12 @@
else:
result_list += [re.sub(" .*", "", entry)]
- result_list += read_file(appdir + 'exclusion_list.txt', cut_comment = True).splitlines()
+ result_list += read_file(appdir + 'exclusion_list.txt', cut_comment = True, cut_newlines = True).splitlines()
return result_list
-def read_file(filename, cut_comment = False):
+def read_file(filename, cut_comment = False, cut_newlines = False):
text = u""
+
f = codecs.open(filename, 'r','utf-8')
text = f.read()
f.close()
@@ -303,6 +312,9 @@
if cut_comment:
text = re.sub(" ?#.*", "", text)
+ if cut_newlines:
+ text = re.sub("(?m)^\r?\n", "", text)
+
return text
def write_log(text, filename = output_file):
@@ -422,7 +434,6 @@
# Google limit queries to 32 words.
-
output = u""
n_query = 0
previous_group_url = 'none'
@@ -450,13 +461,11 @@
if results:
group_url_list = group_url.splitlines()
group_url_list.sort()
-
group_url = '\n'.join(group_url_list)
if previous_group_url == group_url:
if consecutive:
output += ' ' + search_words
else:
-
output += '\n**' + search_words
else:
output += group_url + '\n**' + search_words
@@ -632,6 +641,11 @@
except wikipedia.IsRedirectPage:
original_text = page.get(get_redirect=True)
+ if skip_disambig:
+ if page.isDisambig():
+ wikipedia.output(u'Page %s is a disambiguation page' % page.title())
+ continue
+
# colors = [13] * len(page.title())
wikipedia.output(page.title())
@@ -648,8 +662,11 @@
return False
return var
+def setSavepath(path):
+ global output_file
+ output_file = path
+
def main():
- global output_file
gen = None
# pages which will be processed when the -page parameter is used
PageTitles = []
@@ -682,13 +699,17 @@
config.copyright_yahoo = True
elif arg == '-g':
config.copyright_google = True
+ elif arg == '-l':
+ config.copyright_msn = True
elif arg == '-ny':
config.copyright_yahoo = False
elif arg == '-ng':
config.copyright_google = False
+ elif arg == '-nl':
+ config.copyright_msn = False
elif arg.startswith('-output'):
if len(arg) >= 8:
- output_file = arg[8:]
+ setSavepath(arg[8:])
elif arg.startswith('-maxquery'):
if len(arg) >= 10:
config.copyright_max_query_for_page = int(arg[10:])
Revision: 4086
Author: btongminh
Date: 2007-08-22 08:50:02 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Dutch translation fixes by Waddr.
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-08-22 08:38:50 UTC (rev 4085)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-22 08:50:02 UTC (rev 4086)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
"""
This bot is used for checking external links found at the wiki. It checks
several pages at once, with a limit set by the config variable
@@ -101,7 +101,7 @@
talk_report_archive = {
'de': u'Die Webseite wurde vom Internet Archive gespeichert. Bitte verlinke gegebenenfalls eine geeignete archivierte Version: [%s]. ',
'en': u'\nThe web page has been saved by the Internet Archive. Please consider linking to an appropriate archived version: [%s]. ',
- 'nl': u'\nDeze website is bewaard in het Internet Archief. Overweeg te linken naar een gearchiveerde pagina: [%s]. ',
+ 'nl': u'\nDeze website is bewaard in het Internet Archive. Overweeg te linken naar een gearchiveerde pagina: [%s]. ',
'no': u'\nDenne nettsiden er lagra i Internet Archive. Vurder om lenka kan endres til å peke til en av de arkiverte versjonene: [%s]. ',
'pt': u'Esta página web foi gravada na Internet Archive. Por favor considere o link para a versão arquivada: [%s]. ',
}
Revision: 4083
Author: a_engels
Date: 2007-08-21 23:44:54 +0000 (Tue, 21 Aug 2007)
Log Message:
-----------
update - nl: now uses localized names for special pages
Modified Paths:
--------------
trunk/pywikipedia/rcsort.py
Modified: trunk/pywikipedia/rcsort.py
===================================================================
--- trunk/pywikipedia/rcsort.py 2007-08-21 22:27:47 UTC (rev 4082)
+++ trunk/pywikipedia/rcsort.py 2007-08-21 23:44:54 UTC (rev 4083)
@@ -1,11 +1,10 @@
-#!/usr/bin/python
+#!/usr/bin/python
# -*- coding: utf-8 -*-
# A tool to see the recentchanges ordered by user instead of by date. This
# is meant to be run as a CGI script.
# Currently only works on Dutch Wikipedia, I do intend to make it more generally
# usable.
# Permission has been asked to run this on the toolserver.
-__version__ = '$Id$'
import cgi
import cgitb
@@ -26,9 +25,8 @@
import wikipedia
print "-->"
mysite = wikipedia.getSite()
-special = mysite.family.special_namespace(mysite.lang)
-post = 'title=%s:Recentchanges' % special
+post = 'title=Speciaal:RecenteWijzigingen'
for element in form:
post += '&%s=%s'%(element,form[element].value)
if not 'limit' in form:
@@ -38,7 +36,7 @@
text = text.split('\n')
rcoptions = False
lines = []
-Ruser = re.compile('title=\"%s\:Contributions\/([^\"]*)\"' % special)
+Ruser = re.compile('title=\"Speciaal\:Bijdragen\/([^\"]*)\"')
Rnumber = re.compile('tabindex=\"(\d*)\"')
count = 0
for line in text:
@@ -50,9 +48,8 @@
user = None
count += 1
lines.append((user,count,line))
- print
elif line.find('rcoptions') > -1:
- print line.replace("/w/index.php?title=%s:Recentchanges&" % special,"rcsort.py?")
+ print line.replace("/w/index.php?title=Speciaal:RecenteWijzigingen&","rcsort.py?")
rcoptions = True
lines.sort()
last = 0
@@ -63,10 +60,10 @@
if line[0] == None:
print "<h2>Gebruiker onbekend</h2>"
else:
- wikipedia.output(u"<h2>%s</h2>"%line[0], toStdout=True)
+ wikipedia.output(u"<h2>%s</h2>"%line[0],showcgi=True)
print "<ul>"
last = line[0]
- wikipedia.output(line[2].replace('href="/w','href="http://nl.wikipedia.org/w'), toStdout = True)
+ wikipedia.output(line[2].replace('href="/w','href="http://nl.wikipedia.org/w'), showcgi = True)
print
print "</ul>"
Feature Requests item #1777751, was opened at 2007-08-20 18:17
Message generated for change (Comment added) made by vargenau
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1777751&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Priority: 5
Private: No
Submitted By: Nobody/Anonymous (nobody)
Assigned to: Nobody/Anonymous (nobody)
Summary: interwiki.py in categories
Initial Comment:
It would be helpful to add functionality to interwiki.py to work with categories and subcategories, not just with articles contained within, i.e. to treat category pages as if they were normal articles. Additional key to existing -cat option or whatever, please.
----------------------------------------------------------------------
Comment By: Marc-Etienne Vargenau (vargenau)
Date: 2007-08-21 10:49
Message:
Logged In: YES
user_id=1118700
Originator: NO
Hello,
I do not know if this was the original question, but there is still
something
I cannot do with subcategories.
Consider:
http://en.wikipedia.org/wiki/Category:France
The Category:France has 21 subcategories and 2 pages in it.
python interwiki.py Category:France
will work on the category itself.
python interwiki.py -cat:France
will work on the two pages France and Portal:France
How do I work on the 21 subcategories (i.e. category:Communications in
France,
etc.)?
Regards,
Marc-Etienne
----------------------------------------------------------------------
Comment By: Nobody/Anonymous (nobody)
Date: 2007-08-20 23:33
Message:
Logged In: NO
You can already work on categories interwiki. For example : python
interwiki.py -start:Category:Spain
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1777751&group_…
Bugs item #1705486, was opened at 2007-04-22 18:39
Message generated for change (Comment added) made by sf-robot
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1705486&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: General
Group: None
>Status: Closed
Resolution: Fixed
Priority: 5
Private: No
Submitted By: Cyde Weys (cydeweys)
Assigned to: Nobody/Anonymous (nobody)
Summary: Fall-through to admin account fails
Initial Comment:
pyWikipediaBot used to have the functionality that if an action was requested that required administrative rights, and if an admin account was specified in user-config.py, then it would fall-through and use the admin account as necessary. For instance, it used to be that when handling the en-wiki "Categories for deletion", all of the edits to move the categories would be done by my bot's account, and then when the category itself needed to be deleted, it would use my admin account. This no longer happens. It simply fails the deletion, citing that it doesn't have the proper rights. It's not using the admin account anymore.
----------------------------------------------------------------------
>Comment By: SourceForge Robot (sf-robot)
Date: 2007-08-20 19:20
Message:
Logged In: YES
user_id=1312539
Originator: NO
This Tracker item was closed automatically by the system. It was
previously set to a Pending status, and the original submitter
did not respond within 14 days (the time period specified by
the administrator of this Tracker).
----------------------------------------------------------------------
Comment By: Daniel Herding (wikipedian)
Date: 2007-06-13 10:20
Message:
Logged In: YES
user_id=880694
Originator: NO
Has your problem been fixed by the correction of this bug:
https://sourceforge.net/tracker/index.php?func=detail&aid=1677643&group_id=…
?
----------------------------------------------------------------------
Comment By: Cyde Weys (cydeweys)
Date: 2007-04-27 09:01
Message:
Logged In: YES
user_id=1506848
Originator: YES
I've tracked it down to line 1086 or thereabouts in wikipedia.py. Here is
the affected code:
try:
self.site().forceLogin(sysop = True)
output(u'Page is locked, and the sysop account can\'t
be used because of some bug. Giving up.')
# This doesn't work for an unknown reason; Instead,
it
# would lead to an infinite loop.
#output(u'Page is locked, retrying using sysop
account.')
#return self.putPage(text = text, comment = comment,
watchArticle = watchArticle, minorEdit = minorEdit, newPage = newPage,
token = None, gettoken = True, sysop = True)
As you can see, it appears that someone has commented out the auto-retry
using sysop account functionality, claiming it caused infinite loops. I
never had any problems with infinite loops, but I do now have problems with
not being able to edit locked pages. Should this be uncommented, or is
there another way to handle this that won't cause an infinite loop?
----------------------------------------------------------------------
Comment By: Cyde Weys (cydeweys)
Date: 2007-04-22 18:41
Message:
Logged In: YES
user_id=1506848
Originator: YES
This appears to have the same underlying issue as bug #1682852.
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1705486&group_…