jenkins-bot has submitted this change and it was merged.
Change subject: Do not require login for pure logging purposes
......................................................................
Do not require login for pure logging purposes
If the user is not logged in, do not display their talk page status, as
doing so would require a login. This is annoying if one only needs
read-only access, or needs to read a wiki with a broken login (e.g. the
toolserver).
Change-Id: Ia46d1938221ef70aaf2a752f1b9a4cc5493d34dc
---
M pywikibot/bot.py
1 file changed, 8 insertions(+), 1 deletion(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/bot.py b/pywikibot/bot.py
index 13aaf41..2d478c5 100644
--- a/pywikibot/bot.py
+++ b/pywikibot/bot.py
@@ -272,7 +272,14 @@
log(u' %s' % ver)
# messages on bot discussion page?
- log(u'MESSAGES: %s' % ('unanswered' if site.messages() else 'none'))
+ if site.logged_in():
+ if site.messages():
+ messagestate = 'unanswered'
+ else:
+ messagestate = 'none'
+ else:
+ messagestate = 'unknown (not logged in)'
+ log(u'MESSAGES: %s' % messagestate)
log(u'=== ' * 14)
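
For reference, a minimal sketch of the resulting check (a sketch only;
site is the pywikibot Site instance already used throughout bot.py):

# Sketch of the logic introduced above; no login is forced for the
# read-only / broken-login case.
if site.logged_in():
    messagestate = 'unanswered' if site.messages() else 'none'
else:
    messagestate = 'unknown (not logged in)'
log(u'MESSAGES: %s' % messagestate)
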
--
To view, visit https://gerrit.wikimedia.org/r/104962
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia46d1938221ef70aaf2a752f1b9a4cc5493d34dc
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: Weblib: add docs, replace string concat with urlencode
......................................................................
Weblib: add docs, replace string concat with urlencode
Change-Id: I18c8b7b4c47aba68cffd3435be7fdf4056e3620d
---
M pywikibot/weblib.py
1 file changed, 39 insertions(+), 19 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/weblib.py b/pywikibot/weblib.py
index d068925..c2ad86e 100644
--- a/pywikibot/weblib.py
+++ b/pywikibot/weblib.py
@@ -11,21 +11,31 @@
#
__version__ = '$Id$'
-import pywikibot
+import urllib
from pywikibot.comms import http
def getInternetArchiveURL(url, timestamp=None):
- """Return archived URL by Internet Archive."""
- # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+ """Return archived URL by Internet Archive.
+
+ Parameters:
+ url - url to search an archived version for
+ timestamp - requested archive date. The version closest to that moment
+ is returned. Format: YYYYMMDDhhmmss or part thereof.
+
+ See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+ for more details.
+ """
import json
- query = u'http://archive.org/wayback/available?'
- query += u'url='
- query += url
- if not timestamp is None:
- query += u'&timestamp='
- query += timestamp
- jsontext = http.request(uri=query, site=None)
+ uri = u'http://archive.org/wayback/available?'
+
+ query = {'url': url}
+
+ if timestamp is not None:
+ query['timestamp'] = timestamp
+
+ uri = uri + urllib.urlencode(query)
+ jsontext = http.request(uri=uri, site=None)
if "closest" in jsontext:
data = json.loads(jsontext)
return data['archived_snapshots']['closest']['url']
@@ -34,17 +44,27 @@
def getWebCitationURL(url, timestamp=None):
- """Return archived URL by Web Citation."""
- # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+ """Return archived URL by Web Citation.
+
+ Parameters:
+ url - url to search an archived version for
+ timestamp - requested archive date. The version closest to that moment
+ is returned. Format: YYYYMMDDhhmmss or part thereof.
+
+ See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+ for more details
+ """
import xml.etree.ElementTree as ET
- query = u'http://www.webcitation.org/query?'
- query += u'returnxml=true'
- query += u'&url='
- query += url
+ uri = u'http://www.webcitation.org/query?'
+
+ query = {'returnxml': 'true',
+ 'url': url}
+
if not timestamp is None:
- query += u'&date='
- query += timestamp
- xmltext = http.request(uri=query, site=None)
+ query['date'] = timestamp
+
+ uri = uri + urllib.urlencode(query)
+ xmltext = http.request(uri=uri, site=None)
if "success" in xmltext:
data = ET.fromstring(xmltext)
return data.find('.//webcite_url').text
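
To illustrate why urlencode is preferable to string concatenation here,
a small sketch (the example URL is made up; key order in the output may
vary, as it comes from a dict):

# Python 2, matching the urllib import used above.
import urllib

params = {'url': 'http://example.org/a page?x=1&y=2',
          'timestamp': '20130101'}
print(urllib.urlencode(params))
# e.g. url=http%3A%2F%2Fexample.org%2Fa+page%3Fx%3D1%26y%3D2&timestamp=20130101
# Manual concatenation would leave '&', '?' and the space unescaped,
# corrupting the query string sent to archive.org / webcitation.org.
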
--
To view, visit https://gerrit.wikimedia.org/r/104804
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I18c8b7b4c47aba68cffd3435be7fdf4056e3620d
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: Prevent Page.change_category from readding newCat.
......................................................................
Prevent Page.change_category from readding newCat.
Currently change_category also adds newCat even if
it is already present on the target page. Callers
such as category.py had to work around this
manually, which caused extra code and complexity.
Change-Id: I95ba291e78c2f187f4d4a881b39fa44096cca9b6
---
M pywikibot/page.py
1 file changed, 4 insertions(+), 0 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py
index f0150cd..2cda41e 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -1479,6 +1479,10 @@
% (self.title(asLink=True), oldCat.title()))
return
+ # This prevents the bot from adding newCat if it is already present.
+ if newCat in cats:
+ newCat = None
+
if inPlace or self.namespace() == 10:
oldtext = self.get(get_redirect=True)
newtext = pywikibot.replaceCategoryInPlace(oldtext, oldCat, newCat)
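
With the guard above, a call like the following (hypothetical page and
category titles) now only removes the old category instead of inserting
a duplicate of the new one:

import pywikibot

site = pywikibot.getSite()
page = pywikibot.Page(site, u'Foo')   # assume it already carries [[Category:New]]
oldCat = pywikibot.Category(site, u'Category:Old')
newCat = pywikibot.Category(site, u'Category:New')

# newCat is dropped inside change_category because it is already
# present on the page, so only [[Category:Old]] is removed.
page.change_category(oldCat, newCat)
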
--
To view, visit https://gerrit.wikimedia.org/r/104812
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I95ba291e78c2f187f4d4a881b39fa44096cca9b6
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Pyfisch <pyfisch(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: [BUGFIX] change Site.lang to Site.code
......................................................................
[BUGFIX] change Site.lang to Site.code
The i18n files use WMF language codes instead of ISO 639 language
codes. This means we also have to use them in our translations.
For example, the site code of the Alemannic Wikipedia is 'als',
whereas its language code is 'gsw'. The i18n files use 'als',
while we currently try to look up 'gsw', which does not exist.
Change-Id: I3bd186c06ef3b0506411f944f36f1b999fb35dfe
---
M pywikibot/i18n.py
1 file changed, 8 insertions(+), 8 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/i18n.py b/pywikibot/i18n.py
index bb2b26d..40835c0 100644
--- a/pywikibot/i18n.py
+++ b/pywikibot/i18n.py
@@ -259,9 +259,9 @@
family = pywikibot.config.family
# If a site is given instead of a code, use its language
- if hasattr(code, 'lang'):
+ if hasattr(code, 'code'):
family = code.family.name
- code = code.lang
+ code = code.code
# Check whether xdict has multiple projects
if type(xdict) == dict:
@@ -336,8 +336,8 @@
code_needed = False
# If a site is given instead of a code, use its language
- if hasattr(code, 'lang'):
- lang = code.lang
+ if hasattr(code, 'code'):
+ lang = code.code
# check whether we need the language code back
elif type(code) == list:
lang = code.pop()
@@ -432,8 +432,8 @@
if type(parameters) == dict:
param = parameters
# If a site is given instead of a code, use its language
- if hasattr(code, 'lang'):
- code = code.lang
+ if hasattr(code, 'code'):
+ code = code.code
# we send the code via list and get the alternate code back
code = [code]
trans = twtranslate(code, twtitle, None)
@@ -484,8 +484,8 @@
package = twtitle.split("-")[0]
transdict = getattr(__import__("i18n", fromlist=[package]), package).msg
# If a site is given instead of a code, use its language
- if hasattr(code, 'lang'):
- code = code.lang
+ if hasattr(code, 'code'):
+ code = code.code
return code in transdict and twtitle in transdict[code]
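
The distinction the fix relies on, using the example from the commit
message (a sketch; the attribute values are as described above):

import pywikibot

site = pywikibot.Site('als', 'wikipedia')  # Alemannic Wikipedia
print(site.code)   # 'als' -- WMF site code, which the i18n files use
print(site.lang)   # 'gsw' -- ISO 639 code, absent from the i18n files
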
--
To view, visit https://gerrit.wikimedia.org/r/104800
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I3bd186c06ef3b0506411f944f36f1b999fb35dfe
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Russell Blau <russblau(a)imapmail.org>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: weblinkchecker.py : XML and archived URL
......................................................................
weblinkchecker.py : XML and archived URL
Same as the following changes for compat:
* I7ba4f460897316ae1f5cbcca0080f8c3262d9abf : read XML dump
* I46c1737aea471691cd90f9ec21e3592ce0c69fde : Internet Archive and Web Citation
Bug: 55039
Bug: 58815
Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43
---
A pywikibot/weblib.py
M scripts/weblinkchecker.py
2 files changed, 123 insertions(+), 29 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/weblib.py b/pywikibot/weblib.py
new file mode 100644
index 0000000..d068925
--- /dev/null
+++ b/pywikibot/weblib.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Functions for manipulating external links
+or querying third-party sites.
+
+"""
+#
+# (C) Pywikibot team, 2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot.comms import http
+
+
+def getInternetArchiveURL(url, timestamp=None):
+ """Return archived URL by Internet Archive."""
+ # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+ import json
+ query = u'http://archive.org/wayback/available?'
+ query += u'url='
+ query += url
+ if not timestamp is None:
+ query += u'&timestamp='
+ query += timestamp
+ jsontext = http.request(uri=query, site=None)
+ if "closest" in jsontext:
+ data = json.loads(jsontext)
+ return data['archived_snapshots']['closest']['url']
+ else:
+ return None
+
+
+def getWebCitationURL(url, timestamp=None):
+ """Return archived URL by Web Citation."""
+ # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+ import xml.etree.ElementTree as ET
+ query = u'http://www.webcitation.org/query?'
+ query += u'returnxml=true'
+ query += u'&url='
+ query += url
+ if not timestamp is None:
+ query += u'&date='
+ query += timestamp
+ xmltext = http.request(uri=query, site=None)
+ if "success" in xmltext:
+ data = ET.fromstring(xmltext)
+ return data.find('.//webcite_url').text
+ else:
+ return None
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index fe138c7..40f283a 100644
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -36,6 +36,11 @@
-namespace Only process templates in the namespace with the given number or
name. This parameter may be used multiple times.
+-xml Should be used instead of a simple page fetching method from
+ pagegenerators.py for performance and load issues
+
+-xmlstart Page to start with when using an XML dump
+
-ignore HTTP return codes to ignore. Can be provided several times :
-ignore:401 -ignore:500
@@ -112,6 +117,8 @@
from pywikibot import i18n
from pywikibot import config
from pywikibot import pagegenerators
+from pywikibot import xmlreader
+from pywikibot import weblib
docuReplacements = {
'&params;': pagegenerators.parameterHelp
@@ -177,29 +184,45 @@
yield m.group('urlb')
-class InternetArchiveConsulter:
- def __init__(self, url):
- self.url = url
+class XmlDumpPageGenerator:
+ """Xml generator that yields pages containing a web link"""
- def getArchiveURL(self):
- pywikibot.output(u'Consulting the Internet Archive for %s' % self.url)
- archiveURL = 'http://web.archive.org/web/*/%s' % self.url
+ def __init__(self, xmlFilename, xmlStart, namespaces):
+ self.xmlStart = xmlStart
+ self.namespaces = namespaces
+ self.skipping = bool(xmlStart)
+ self.site = pywikibot.getSite()
+
+ dump = xmlreader.XmlDump(xmlFilename)
+ self.parser = dump.parse()
+
+ def __iter__(self):
+ return self
+
+ def next(self):
try:
- f = urllib2.urlopen(archiveURL)
- except urllib2.HTTPError:
- # The Internet Archive yields a 403 error when the site was not
- # archived due to robots.txt restrictions.
- return
- except UnicodeEncodeError:
- return
- data = f.read()
- if f.headers.get('content-encoding', None) == 'gzip':
- # Since 2008, the Internet Archive returns pages in GZIPed
- # compression format. Unfortunatelly urllib2 doesn't handle
- # the decompression for us, so we have to do it ourselves.
- data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
- if "Search Results for " in data:
- return archiveURL
+ for entry in self.parser:
+ if self.skipping:
+ if entry.title != self.xmlStart:
+ continue
+ self.skipping = False
+ page = pywikibot.Page(self.site, entry.title)
+ if not self.namespaces == []:
+ if page.namespace() not in self.namespaces:
+ continue
+ found = False
+ for url in weblinksIn(entry.text):
+ found = True
+ if found:
+ return page
+ except KeyboardInterrupt:
+ try:
+ if not self.skipping:
+ pywikibot.output(
+ u'To resume, use "-xmlstart:%s" on the command line.'
+ % entry.title)
+ except NameError:
+ pass
class LinkChecker(object):
@@ -509,10 +532,10 @@
def __init__(self, reportThread):
self.reportThread = reportThread
- site = pywikibot.getSite()
+ self.site = pywikibot.getSite()
self.semaphore = threading.Semaphore()
self.datfilename = pywikibot.config.datafilepath(
- 'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.code))
+ 'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.code))
# Count the number of logged links, so that we can insert captions
# from time to time
self.logCount = 0
@@ -528,7 +551,6 @@
"""
Logs an error report to a text file in the deadlinks subdirectory.
"""
- site = pywikibot.getSite()
if archiveURL:
errorReport = u'* %s ([%s archive])\n' % (url, archiveURL)
else:
@@ -541,8 +563,8 @@
pywikibot.output(u"** Logging link for deletion.")
txtfilename = pywikibot.config.datafilepath('deadlinks',
'results-%s-%s.txt'
- % (site.family.name,
- site.lang))
+ % (self.site.family.name,
+ self.site.lang))
txtfile = codecs.open(txtfilename, 'a', 'utf-8')
self.logCount += 1
if self.logCount % 30 == 0:
@@ -573,8 +595,9 @@
# We'll list it in a file so that it can be removed manually.
if timeSinceFirstFound > 60 * 60 * 24 * day:
# search for archived page
- iac = InternetArchiveConsulter(url)
- archiveURL = iac.getArchiveURL()
+ archiveURL = pywikibot.weblib.getInternetArchiveURL(url)
+ if archiveURL is None:
+ archiveURL = pywikibot.weblib.getWebCitationURL(url)
self.log(url, error, page, archiveURL)
else:
self.historyDict[url] = [(page.title(), now, error)]
@@ -781,6 +804,7 @@
def main():
gen = None
singlePageTitle = []
+ xmlFilename = None
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -807,6 +831,17 @@
HTTPignore.append(int(arg[8:]))
elif arg.startswith('-day:'):
day = int(arg[5:])
+ elif arg.startswith('-xmlstart'):
+ if len(arg) == 9:
+ xmlStart = pywikibot.input(
+ u'Please enter the dumped article to start with:')
+ else:
+ xmlStart = arg[10:]
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = i18n.input('pywikibot-enter-xml-filename')
+ else:
+ xmlFilename = arg[5:]
else:
if not genFactory.handleArg(arg):
singlePageTitle.append(arg)
@@ -816,6 +851,13 @@
page = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
gen = iter([page])
+ if xmlFilename:
+ try:
+ xmlStart
+ except NameError:
+ xmlStart = None
+ gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+
if not gen:
gen = genFactory.getCombinedGenerator()
if gen:
@@ -824,7 +866,7 @@
# fetch at least 240 pages simultaneously from the wiki, but more if
# a high thread number is set.
pageNumber = max(240, config.max_external_links * 2)
- gen = pagegenerators.PreloadingGenerator(gen, pageNumber=pageNumber)
+ gen = pagegenerators.PreloadingGenerator(gen, step=pageNumber)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
bot = WeblinkCheckerRobot(gen, HTTPignore)
try:
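
A minimal usage sketch of the new weblib helpers, mirroring the
Wayback-then-WebCite fallback used above (the URL and timestamp are
arbitrary examples):

from pywikibot import weblib

url = 'http://www.example.org/some/dead/page'
archiveURL = weblib.getInternetArchiveURL(url, timestamp='20130101000000')
if archiveURL is None:
    # Fall back to WebCite when the Wayback Machine has no snapshot.
    archiveURL = weblib.getWebCitationURL(url, timestamp='20130101000000')
print(archiveURL)  # archived URL, or None if neither service has a copy
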
--
To view, visit https://gerrit.wikimedia.org/r/104015
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot