jenkins-bot has submitted this change and it was merged.
Change subject: weblinkchecker.py: add the ability to run the script with XML dump
......................................................................
weblinkchecker.py: add the ability to run the script with XML dump
Like reflinks.py
Bug: 55039
Change-Id: I7ba4f460897316ae1f5cbcca0080f8c3262d9abf
---
M weblinkchecker.py
1 file changed, 66 insertions(+), 0 deletions(-)
Approvals:
Ladsgroup: Looks good to me, approved
jenkins-bot: Verified
diff --git a/weblinkchecker.py b/weblinkchecker.py
index 71a0c1b..eafd7ed 100644
--- a/weblinkchecker.py
+++ b/weblinkchecker.py
@@ -36,6 +36,11 @@
-namespace Only process templates in the namespace with the given number or
name. This parameter may be used multiple times.
+-xml Should be used instead of a simple page fetching method from
+ pagegenerators.py for performance and load issues
+
+-xmlstart Page to start with when using an XML dump
+
-ignore HTTP return codes to ignore. Can be provided several times :
-ignore:401 -ignore:500
@@ -109,6 +114,7 @@
from pywikibot import i18n
import config
import pagegenerators
+import xmlreader
import pywikibot.weblib
docuReplacements = {
@@ -173,6 +179,47 @@
yield m.group('url')
else:
yield m.group('urlb')
+
+
+class XmlDumpPageGenerator:
+ """Xml generator that yiels pages containing a web link"""
+
+ def __init__(self, xmlFilename, xmlStart, namespaces):
+ self.xmlStart = xmlStart
+ self.namespaces = namespaces
+ self.skipping = bool(xmlStart)
+ self.site = pywikibot.getSite()
+
+ dump = xmlreader.XmlDump(xmlFilename)
+ self.parser = dump.parse()
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ try:
+ for entry in self.parser:
+ if self.skipping:
+ if entry.title != self.xmlStart:
+ continue
+ self.skipping = False
+ page = pywikibot.Page(self.site, entry.title)
+ if not self.namespaces == []:
+ if page.namespace() not in self.namespaces:
+ continue
+ found = False
+ for url in weblinksIn(entry.text):
+ found = True
+ if found:
+ return page
+ except KeyboardInterrupt:
+ try:
+ if not self.skipping:
+ pywikibot.output(
+ u'To resume, use "-xmlstart:%s" on the command line.'
+ % entry.title)
+ except NameError:
+ pass
class LinkChecker(object):
@@ -754,6 +801,7 @@
def main():
gen = None
singlePageTitle = []
+ xmlFilename = None
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -780,6 +828,17 @@
HTTPignore.append(int(arg[8:]))
elif arg.startswith('-day:'):
day = int(arg[5:])
+ elif arg.startswith('-xmlstart'):
+ if len(arg) == 9:
+ xmlStart = pywikibot.input(
+ u'Please enter the dumped article to start with:')
+ else:
+ xmlStart = arg[10:]
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = i18n.input('pywikibot-enter-xml-filename')
+ else:
+ xmlFilename = arg[5:]
else:
if not genFactory.handleArg(arg):
singlePageTitle.append(arg)
@@ -789,6 +848,13 @@
page = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
gen = iter([page])
+ if xmlFilename:
+ try:
+ xmlStart
+ except NameError:
+ xmlStart = None
+ gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+
if not gen:
gen = genFactory.getCombinedGenerator()
if gen:
--
To view, visit
https://gerrit.wikimedia.org/r/96272
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7ba4f460897316ae1f5cbcca0080f8c3262d9abf
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>