[Gerrit] weblinkchecker.py: add the ability to run the script with XM... - change (pywikibot/compat) - Pywikibot-commits

5 Apr 2014

jenkins-bot has submitted this change and it was merged.

Change subject: weblinkchecker.py: add the ability to run the script with XML dump
......................................................................


weblinkchecker.py: add the ability to run the script with XML dump

Like reflinks.py
Bug: 55039

Change-Id: I7ba4f460897316ae1f5cbcca0080f8c3262d9abf
---
M weblinkchecker.py
1 file changed, 66 insertions(+), 0 deletions(-)

Approvals:
  Ladsgroup: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/weblinkchecker.py b/weblinkchecker.py
index 71a0c1b..eafd7ed 100644
--- a/weblinkchecker.py
+++ b/weblinkchecker.py
@@ -36,6 +36,11 @@
 -namespace   Only process templates in the namespace with the given number or
              name. This parameter may be used multiple times.
 
+-xml         Should be used instead of a simple page fetching method from
+             pagegenerators.py for performance and load issues
+
+-xmlstart    Page to start with when using an XML dump
+
 -ignore      HTTP return codes to ignore. Can be provided several times :
                 -ignore:401 -ignore:500
 
@@ -109,6 +114,7 @@
 from pywikibot import i18n
 import config
 import pagegenerators
+import xmlreader
 import pywikibot.weblib
 
 docuReplacements = {
@@ -173,6 +179,47 @@
             yield m.group('url')
         else:
             yield m.group('urlb')
+
+
+class XmlDumpPageGenerator:
+    """Xml generator that yiels pages containing a web link"""
+
+    def __init__(self, xmlFilename, xmlStart, namespaces):
+        self.xmlStart = xmlStart
+        self.namespaces = namespaces
+        self.skipping = bool(xmlStart)
+        self.site = pywikibot.getSite()
+
+        dump = xmlreader.XmlDump(xmlFilename)
+        self.parser = dump.parse()
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            for entry in self.parser:
+                if self.skipping:
+                    if entry.title != self.xmlStart:
+                        continue
+                    self.skipping = False
+                page = pywikibot.Page(self.site, entry.title)
+                if not self.namespaces == []:
+                    if page.namespace() not in self.namespaces:
+                        continue
+                found = False
+                for url in weblinksIn(entry.text):
+                    found = True
+                if found:
+                    return page
+        except KeyboardInterrupt:
+            try:
+                if not self.skipping:
+                    pywikibot.output(
+                        u'To resume, use "-xmlstart:%s" on the command line.'
+                        % entry.title)
+            except NameError:
+                pass
 
 
 class LinkChecker(object):
@@ -754,6 +801,7 @@
 def main():
     gen = None
     singlePageTitle = []
+    xmlFilename = None
     # Which namespaces should be processed?
     # default to [] which means all namespaces will be processed
     namespaces = []
@@ -780,6 +828,17 @@
             HTTPignore.append(int(arg[8:]))
         elif arg.startswith('-day:'):
             day = int(arg[5:])
+        elif arg.startswith('-xmlstart'):
+            if len(arg) == 9:
+                xmlStart = pywikibot.input(
+                    u'Please enter the dumped article to start with:')
+            else:
+                xmlStart = arg[10:]
+        elif arg.startswith('-xml'):
+            if len(arg) == 4:
+                xmlFilename = i18n.input('pywikibot-enter-xml-filename')
+            else:
+                xmlFilename = arg[5:]
         else:
             if not genFactory.handleArg(arg):
                 singlePageTitle.append(arg)
@@ -789,6 +848,13 @@
         page = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
         gen = iter([page])
 
+    if xmlFilename:
+        try:
+            xmlStart
+        except NameError:
+            xmlStart = None
+        gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+
     if not gen:
         gen = genFactory.getCombinedGenerator()
     if gen:

-- 
To view, visit https://gerrit.wikimedia.org/r/96272
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7ba4f460897316ae1f5cbcca0080f8c3262d9abf
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>