Revision: 7942
Author: xqt
Date: 2010-02-17 05:53:13 +0000 (Wed, 17 Feb 2010)
Log Message:
-----------
Bugfix for #2952927
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-16 17:07:50 UTC (rev 7941)
+++ trunk/pywikipedia/wikipedia.py 2010-02-17 05:53:13 UTC (rev 7942)
@@ -4395,9 +4395,6 @@
if nothing is changed, it is added at the end
"""
- # Hyperlink regex is defined in weblinkchecker.py
- import weblinkchecker
-
if site is None:
site = getSite()
@@ -4431,7 +4428,7 @@
# depth, we'd need recursion which can't be done in Python's re.
# After all, the language of correct parenthesis words is not regular.
'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
- 'hyperlink': weblinkchecker.compileLinkR(),
+ 'hyperlink': compileLinkR(),
'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and
# images.
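Context for the fix: r7940 (below) moved compileLinkR() into wikipedia.py
itself, so the function-local import of weblinkchecker, which in turn
imports wikipedia, became an unnecessary circular dependency; the exception
table now uses the library's own copy. A minimal sketch of how such an
exception table is typically consulted, using hypothetical helper names
rather than the actual pywikipedia implementation:

import re

exceptionRegexes = {
    'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'),
    'comment': re.compile(r'(?s)<!--.*?-->'),
}

def replace_except(text, old, new, keys):
    # Collect the spans protected by the named exception regexes,
    # then apply the substitution only outside those spans.
    protected = []
    for key in keys:
        protected += [m.span() for m in exceptionRegexes[key].finditer(text)]
    def repl(match):
        if any(s <= match.start() < e for s, e in protected):
            return match.group(0)   # inside a protected span: keep as-is
        return new
    return re.sub(old, repl, text)

print(replace_except("foo <nowiki>foo</nowiki>", "foo", "bar", ['nowiki']))
# -> bar <nowiki>foo</nowiki>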
Revision: 7941
Author: xqt
Date: 2010-02-16 17:07:50 +0000 (Tue, 16 Feb 2010)
Log Message:
-----------
Bugfix for #2946258: test the HTTP status code. I would prefer going back into the loop introduced in r5014, but this is old code.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-16 17:01:39 UTC (rev 7940)
+++ trunk/pywikipedia/wikipedia.py 2010-02-16 17:07:50 UTC (rev 7941)
@@ -2151,7 +2151,7 @@
return self._putPageOld(text, comment, watchArticle, minorEdit, newPage, token, newToken, sysop, captcha=solve)
# We are expecting a 302 to the action=view page. I'm not sure why this was removed in r5019
- if data.strip() != u"":
+ if response.status != 302 and data.strip() != u"":
# Something went wrong, and we don't know what. Show the
# HTML code that hopefully includes some error message.
output(u"ERROR: Unexpected response from wiki server.")
Revision: 7940
Author: xqt
Date: 2010-02-16 17:01:39 +0000 (Tue, 16 Feb 2010)
Log Message:
-----------
Put weblinkchecker.compileLinkR() into the wikipedia library (textlib update from the rewrite branch)
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2010-02-16 16:02:34 UTC (rev 7939)
+++ trunk/pywikipedia/weblinkchecker.py 2010-02-16 17:01:39 UTC (rev 7940)
@@ -202,32 +202,6 @@
re.compile('.*[\./(a)]bodo\.kommune\.no(/.*)?'), # bot can't handle their redirects
]
-def compileLinkR(withoutBracketed = False, onlyBracketed = False):
- # RFC 2396 says that URLs may only contain certain characters.
- # For this regex we also accept non-allowed characters, so that the bot
- # will later show these links as broken ('Non-ASCII Characters in URL').
- # Note: While allowing parenthesis inside URLs, MediaWiki will regard
- # right parenthesis at the end of the URL as not part of that URL.
- # The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
- # So characters inside the URL can be anything except whitespace,
- # closing squared brackets, quotation marks, greater than and less
- # than, and the last character also can't be parenthesis or another
- # character disallowed by MediaWiki.
- notInside = '\]\s<>"'
- # The first half of this regular expression is required because '' is
- # not allowed inside links. For example, in this wiki text:
- # ''Please see http://www.example.org.''
- # .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])'
-
- if withoutBracketed:
- regex = r'(?<!\[)' + regex
- elif onlyBracketed:
- regex = r'\[' + regex
- linkR = re.compile(regex)
- return linkR
-
def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
text = wikipedia.removeDisabledParts(text)
@@ -245,7 +219,7 @@
while templateWithParamsR.search(text):
text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
- linkR = compileLinkR(withoutBracketed, onlyBracketed)
+ linkR = wikipedia.compileLinkR(withoutBracketed, onlyBracketed)
# Remove HTML comments in URLs as well as URLs in HTML comments.
# Also remove text inside nowiki links etc.
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-16 16:02:34 UTC (rev 7939)
+++ trunk/pywikipedia/wikipedia.py 2010-02-16 17:01:39 UTC (rev 7940)
@@ -4963,6 +4963,35 @@
#catLinks.sort()
return sep.join(catLinks) + '\r\n'
+def compileLinkR(withoutBracketed=False, onlyBracketed=False):
+ """Return a regex that matches external links."""
+ # RFC 2396 says that URLs may only contain certain characters.
+ # For this regex we also accept non-allowed characters, so that the bot
+ # will later show these links as broken ('Non-ASCII Characters in URL').
+ # Note: While allowing parenthesis inside URLs, MediaWiki will regard
+ # right parenthesis at the end of the URL as not part of that URL.
+ # The same applies to dot, comma, colon and some other characters.
+ notAtEnd = '\]\s\)\.:;,<>"'
+ # So characters inside the URL can be anything except whitespace,
+ # closing squared brackets, quotation marks, greater than and less
+ # than, and the last character also can't be parenthesis or another
+ # character disallowed by MediaWiki.
+ notInside = '\]\s<>"'
+ # The first half of this regular expression is required because '' is
+ # not allowed inside links. For example, in this wiki text:
+ # ''Please see http://www.example.org.''
+ # .'' shouldn't be considered as part of the link.
+ regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
+ + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
+ + ']*[^' + notAtEnd + '])'
+
+ if withoutBracketed:
+ regex = r'(?<!\[)' + regex
+ elif onlyBracketed:
+ regex = r'\[' + regex
+ linkR = re.compile(regex)
+ return linkR
+
# end of category specific code
def url2link(percentname, insite, site):
"""Convert urlname of a wiki page into interwiki link format.
Revision: 7938
Author: xqt
Date: 2010-02-16 09:54:23 +0000 (Tue, 16 Feb 2010)
Log Message:
-----------
Allow the number of pages returned by UserContributionsGenerator() to be given by an argument (fix for #2930108)
Modified Paths:
--------------
trunk/pywikipedia/pagegenerators.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2010-02-16 07:20:42 UTC (rev 7937)
+++ trunk/pywikipedia/pagegenerators.py 2010-02-16 09:54:23 UTC (rev 7938)
@@ -108,8 +108,13 @@
Argument can be given as "-unwatched:n" where
n is the maximum number of articles to work on.
--usercontribs Work on all articles that were edited by a certain user :
- Example : -usercontribs:DumZiBoT
+-usercontribs Work on articles that were edited by a certain user.
+ Example: -usercontribs:DumZiBoT
+ Normally up to 250 distinct pages are returned. To get
+ another number of pages, append the number to the
+ username, delimited with ";"
+ Example: -usercontribs:DumZiBoT;500
+ returns 500 distinct pages to work on.
-weblink Work on all articles that contain an external link to
a given URL; may be given as "-weblink:url"
@@ -148,6 +153,9 @@
-yahoo Work on all pages that are found in a Yahoo search.
Depends on python module pYsearch. See yahoo_appid in
config.py for instructions.
+
+-page Work on a single page. Argument can also be given as
+ "-page:pagetitle".
"""
docuReplacements = {'¶ms;': parameterHelp}
@@ -485,12 +493,8 @@
Yields number unique pages edited by user:username
namespaces : list of namespace numbers to fetch contribs from
"""
-
if site is None:
site = pywikibot.getSite()
- if number > 500:
- # the api does not allow more than 500 results for anonymous users
- number = 500
user = userlib.User(site, username)
for page in user.contributions(number, namespaces):
yield page[0]
@@ -994,7 +998,13 @@
else:
gen = UnwatchedPagesPageGenerator(number = int(arg[11:]))
elif arg.startswith('-usercontribs'):
- gen = UserContributionsGenerator(arg[14:])
+ args = arg[14:].split(';')
+ number = None
+ try:
+ number = int(args[1])
+ except:
+ number = 250
+ gen = UserContributionsGenerator(args[0], number)
elif arg.startswith('-withoutinterwiki'):
if len(arg) == 17:
gen = WithoutInterwikiPageGenerator()
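One nit in the committed parse: the bare except also hides unrelated
errors, and the initial number = None is immediately overwritten. An
equivalent sketch with explicit exception types (a hypothetical cleanup,
not what was committed):

args = arg[14:].split(';')           # "-usercontribs:Name;500" -> ["Name", "500"]
try:
    number = int(args[1])
except (IndexError, ValueError):     # no ";" given, or the suffix is not a number
    number = 250
gen = UserContributionsGenerator(args[0], number)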
Revision: 7937
Author: xqt
Date: 2010-02-16 07:20:42 +0000 (Tue, 16 Feb 2010)
Log Message:
-----------
Document the new behavior.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-02-15 15:07:12 UTC (rev 7936)
+++ trunk/pywikipedia/interwiki.py 2010-02-16 07:20:42 UTC (rev 7937)
@@ -186,10 +186,12 @@
These arguments specify in which way the bot should follow interwiki links:
- -noredirect do not follow redirects. (note: without ending colon)
+ -noredirect do not follow redirects nor category redirects.
+ (note: without ending colon)
- -initialredirect work on its target if a redirect is entered on the
- command line. (note: without ending colon)
+ -initialredirect work on its target if a redirect or category redirect is
+ entered on the command line or by a generator.
+ (note: without ending colon)
-neverlink: used as -neverlink:xx where xx is a language code:
Disregard any links found to language xx. You can also
Revision: 7936
Author: xqt
Date: 2010-02-15 15:07:12 +0000 (Mon, 15 Feb 2010)
Log Message:
-----------
interwiki: recognize category redirect templates (bugfix for #2949822)
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-02-15 14:49:55 UTC (rev 7935)
+++ trunk/pywikipedia/interwiki.py 2010-02-15 15:07:12 UTC (rev 7936)
@@ -287,7 +287,7 @@
# (C) Rob W.W. Hooft, 2003
# (C) Daniel Herding, 2004
# (C) Yuri Astrakhan, 2005-2006
-# (C) Pywikipedia bot team, 2007-2009
+# (C) Pywikipedia bot team, 2007-2010
#
# Distributed under the terms of the MIT license.
#
@@ -937,7 +937,8 @@
"""
for tree in [self.done, self.pending]:
for page in tree.filter(site):
- if page.exists() and not page.isDisambig() and not page.isRedirectPage():
+ if page.exists() and not page.isDisambig() \
+ and not page.isRedirectPage() and not page.isCategoryRedirect():
return page
return None
@@ -951,7 +952,7 @@
for tree in [self.done, self.pending, self.todo]:
for page in tree.filter(site):
if page.namespace() == self.originPage.namespace():
- if page.exists() and not page.isRedirectPage():
+ if page.exists() and not page.isRedirectPage() and not page.isCategoryRedirect():
return page
return None
@@ -1182,7 +1183,8 @@
if not self.workonme:
# Do not ask hints for pages that we don't work on anyway
return
- if (self.untranslated or globalvar.askhints) and not self.hintsAsked and not self.originPage.isRedirectPage():
+ if (self.untranslated or globalvar.askhints) and not self.hintsAsked \
+ and not self.originPage.isRedirectPage() and not self.originPage.isCategoryRedirect():
# Only once!
self.hintsAsked = True
if globalvar.untranslated:
@@ -1253,23 +1255,33 @@
self.done = PageTree()
continue
- elif page.isRedirectPage():
+ elif page.isRedirectPage() or page.isCategoryRedirect():
+ if page.isRedirectPage():
+ redir = u''
+ else:
+ redir = u'category '
try:
- redirectTargetPage = page.getRedirectTarget()
+ if page.isRedirectPage():
+ redirectTargetPage = page.getRedirectTarget()
+ else:
+ redirectTargetPage = page.getCategoryRedirectTarget()
except pywikibot.InvalidTitle:
# MW considers #redirect [[en:#foo]] as a redirect page,
# but we can't do anything useful with such pages
if not globalvar.quiet or pywikibot.verbose:
- pywikibot.output(u"NOTE: %s redirects to an invalid title" % page.aslink(True))
+ pywikibot.output(u"NOTE: %s redirects to an invalid title"
+ % page.aslink(True))
continue
if not globalvar.quiet or pywikibot.verbose:
- pywikibot.output(u"NOTE: %s is redirect to %s" % (page.aslink(True), redirectTargetPage.aslink(True)))
+ pywikibot.output(u"NOTE: %s is %sredirect to %s"
+ % (page.aslink(True), redir, redirectTargetPage.aslink(True)))
if page == self.originPage:
if globalvar.initialredirect:
if globalvar.contentsondisk:
redirectTargetPage = StoredPage(redirectTargetPage)
#don't follow double redirects; it might be a self loop
- if not redirectTargetPage.isRedirectPage():
+ if not redirectTargetPage.isRedirectPage() \
+ and not redirectTargetPage.isCategoryRedirect():
self.originPage = redirectTargetPage
self.todo.add(redirectTargetPage)
counter.plus(redirectTargetPage.site())
@@ -1282,12 +1294,13 @@
self.todo = PageTree()
elif not globalvar.followredirect:
if not globalvar.quiet or pywikibot.verbose:
- pywikibot.output(u"NOTE: not following redirects.")
+ pywikibot.output(u"NOTE: not following %sredirects." % redir)
elif page.site().family == redirectTargetPage.site().family \
and not self.skipPage(page, redirectTargetPage, counter):
if self.addIfNew(redirectTargetPage, counter, page):
if config.interwiki_shownew or pywikibot.verbose:
- pywikibot.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True)))
+ pywikibot.output(u"%s: %s gives new %sredirect %s"
+ % (self.originPage.aslink(), page.aslink(True), redir, redirectTargetPage.aslink(True)))
continue
# must be behind the page.isRedirectPage() part
@@ -1326,7 +1339,7 @@
duplicate = None
for p in self.done.filter(page.site()):
- if p != page and p.exists() and not p.isRedirectPage():
+ if p != page and p.exists() and not p.isRedirectPage() and not p.isCategoryRedirect():
duplicate = p
break
@@ -1428,7 +1441,7 @@
# Each value will be a list of pages.
new = {}
for page in self.done:
- if page.exists() and not page.isRedirectPage():
+ if page.exists() and not page.isRedirectPage() and not page.isCategoryRedirect():
site = page.site()
if site == self.originPage.site():
if page != self.originPage:
@@ -1535,6 +1548,8 @@
return
if self.originPage.isRedirectPage():
return
+ if self.originPage.isCategoryRedirect():
+ return
if not self.untranslated and globalvar.untranslatedonly:
return
# The following check is not always correct and thus disabled.
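The same three-part test (the page exists, is not a redirect, and is not a
category redirect) recurs throughout this commit; a small helper predicate
could have captured it (a sketch only, no such helper exists in this
revision):

def isPlainPage(page):
    """Return True for pages that exist and are neither ordinary
    redirects nor category redirects."""
    return page.exists() and not page.isRedirectPage() \
           and not page.isCategoryRedirect()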