jenkins-bot has submitted this change and it was merged.
Change subject: Ported unlink.py for Core branch, not using mwparserfromhell anymore Change-Id: Ic533bc9b9e1921b8ff0bcd07235492160b08a6b2 ......................................................................
Ported unlink.py for Core branch, not using mwparserfromhell anymore Change-Id: Ic533bc9b9e1921b8ff0bcd07235492160b08a6b2 --- A scripts/unlink.py 1 file changed, 201 insertions(+), 0 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot unlinks a page on every page that links to it.

This script understands this command-line argument:

    -namespace:n - Number of namespace to process. The parameter can be used
                   multiple times. It works in combination with all other
                   parameters, except for the -start parameter. If you e.g.
                   want to iterate over all user pages starting at User:M, use
                   -start:User:M.

All other parameters will be regarded as part of the title of the page that
should be unlinked.

Example:

python unlink.py Foo bar -namespace:0 -namespace:6

    Removes links to the page [[Foo bar]] in articles and image descriptions.
"""
#
# (C) Pywikibot team, 2007-2014
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
#

import re

import pywikibot
from pywikibot import config
from pywikibot import i18n
from pywikibot import pagegenerators
from pywikibot.editor import TextEditor


class UnlinkBot:

    """Bot that removes wikilinks pointing to one given page."""

    def __init__(self, pageToUnlink, namespaces, always):
        """
        Initializer.

        @param pageToUnlink: the page whose incoming wikilinks are removed
        @param namespaces: namespaces to restrict the run to; an empty list
            means every namespace is processed
        @param always: if True, unlink without prompting the user
        """
        self.pageToUnlink = pageToUnlink
        gen = pagegenerators.ReferringPageGenerator(pageToUnlink)
        if namespaces != []:
            gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
        self.generator = pagegenerators.PreloadingGenerator(gen)
        linktrail = pywikibot.getSite().linktrail()
        # The regular expression which finds links. Results consist of four
        # groups:
        #
        # group title is the target page title, that is, everything
        # before | or ].
        #
        # group section is the page section.
        # It'll include the # to make life easier for us.
        #
        # group label is the alternative link title, that's everything
        # between | and ].
        #
        # group linktrail is the link trail, that's letters after ]] which
        # are part of the word.
        # note that the definition of 'letter' varies from language to
        # language.
        #
        # FIX(review): the [, ], and | metacharacters must be backslash
        # escaped; without the escapes the pattern opens a character class
        # instead of matching the literal [[ ... ]] wikilink delimiters.
        self.linkR = re.compile(
            r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?'
            r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>%s)' % linktrail)
        self.always = always
        self.done = False
        self.comment = i18n.twtranslate(pywikibot.getSite(),
                                        'unlink-unlinking',
                                        self.pageToUnlink.title())

    def handleNextLink(self, text, match, context=100):
        """
        Process one regex match inside text.

        Returns a tuple (text, jumpToBeginning).

        text is the unicode string after the current link has been processed.
        jumpToBeginning is a boolean which specifies if the cursor position
        should be reset to 0. This is required after the user has edited the
        article.
        """
        # ignore interwiki links and links to sections of the same page as
        # well as section links
        if not match.group('title') \
           or self.pageToUnlink.site.isInterwikiLink(match.group('title')) \
           or match.group('section'):
            return text, False
        linkedPage = pywikibot.Page(self.pageToUnlink.site,
                                    match.group('title'))
        # Check whether the link found is to the current page itself.
        if linkedPage != self.pageToUnlink:
            # not a self-link
            return text, False
        else:
            # at the beginning of the link, start red color.
            # at the end of the link, reset the color to default
            if self.always:
                choice = 'a'
            else:
                pywikibot.output(
                    text[max(0, match.start() - context):match.start()]
                    + '\03{lightred}' + text[match.start():match.end()]
                    + '\03{default}' + text[match.end():match.end() + context])
                choice = pywikibot.inputChoice(
                    u'\nWhat shall be done with this link?\n',
                    ['unlink', 'skip', 'edit', 'more context',
                     'unlink all', 'quit'],
                    ['U', 's', 'e', 'm', 'a', 'q'], 'u')
                pywikibot.output(u'')

            if choice == 's':
                # skip this link
                return text, False
            elif choice == 'e':
                editor = TextEditor()
                newText = editor.edit(text, jumpIndex=match.start())
                # if user didn't press Cancel
                if newText:
                    return newText, True
                else:
                    return text, True
            elif choice == 'm':
                # show more context by recursive self-call
                return self.handleNextLink(text, match,
                                           context=context + 100)
            elif choice == 'a':
                self.always = True
            elif choice == 'q':
                self.done = True
                return text, False
            # default ('u' / 'a'): replace the link with its label (or the
            # bare title) plus the link trail characters.
            new = match.group('label') or match.group('title')
            new += match.group('linktrail')
            return text[:match.start()] + new + text[match.end():], False

    def treat(self, page):
        """Unlink all self-links on one page, then save it."""
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        try:
            oldText = page.get()
            text = oldText
            curpos = 0
            while curpos < len(text):
                match = self.linkR.search(text, pos=curpos)
                if not match:
                    break
                # Make sure that next time around we will not find this same
                # hit.
                curpos = match.start() + 1
                text, jumpToBeginning = self.handleNextLink(text, match)
                if jumpToBeginning:
                    curpos = 0
            if oldText == text:
                pywikibot.output(u'No changes necessary.')
            else:
                pywikibot.showDiff(oldText, text)
                page.put(text, self.comment)
        except pywikibot.NoPage:
            pywikibot.output(u"Page %s does not exist?!"
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Page %s is a redirect; skipping."
                             % page.title(asLink=True))
        except pywikibot.LockedPage:
            pywikibot.output(u"Page %s is locked?!" % page.title(asLink=True))

    def run(self):
        """Treat every referring page until done or the user quits."""
        for page in self.generator:
            if self.done:
                break
            self.treat(page)


def main():
    """Parse command-line arguments and run the bot."""
    # This temporary array is used to read the page title if one single
    # page that should be unlinked.
    pageTitle = []
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    always = False

    for arg in pywikibot.handleArgs():
        if arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg == '-always':
            always = True
        else:
            pageTitle.append(arg)

    if pageTitle:
        page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitle))
        bot = UnlinkBot(page, namespaces, always)
        bot.run()
    else:
        pywikibot.showHelp('unlink')

if __name__ == "__main__":
    main()
pywikibot-commits@lists.wikimedia.org