Revision: 7961 Author: xqt Date: 2010-02-25 13:42:02 +0000 (Thu, 25 Feb 2010)
Log Message: ----------- activate wikipedia library
Modified Paths: -------------- trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2010-02-25 10:28:15 UTC (rev 7960) +++ trunk/pywikipedia/wikipedia.py 2010-02-25 13:42:02 UTC (rev 7961) @@ -139,6 +139,8 @@ import xmlreader from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, SoupStrainer import weakref +# Splitting the bot into library parts +from pywikibot import *
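The wildcard import above is what makes this first step of the split transparent to existing bots: names that wikipedia.py used to define itself (for instance the exception classes removed below) are now pulled in from the new pywikibot package. A minimal sketch of the compatibility this is meant to preserve, assuming the moved classes are among pywikibot's exported names (Python 2, as in the codebase):

    import wikipedia

    site = wikipedia.getSite()
    page = wikipedia.Page(site, u'Example')
    try:
        text = page.get()
    except wikipedia.NoPage:
        # NoPage is now defined in pywikibot, but remains reachable
        # through the wikipedia module thanks to the wildcard import.
        text = u''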
# Set the locale to system default. This will ensure correct string # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no @@ -161,78 +163,6 @@ WIDEBUILD = False
-# Local exceptions
-
-class Error(Exception):
-    """Wikipedia error"""
-
-class NoUsername(Error):
-    """Username is not in user-config.py"""
-
-class NoPage(Error):
-    """Page does not exist"""
-
-class NoSuchSite(Error):
-    """Site does not exist"""
-
-class IsRedirectPage(Error):
-    """Page is a redirect page"""
-
-class IsNotRedirectPage(Error):
-    """Page is not a redirect page"""
-
-class InvalidTitle(Error):
-    """Invalid page title"""
-
-class LockedPage(Error):
-    """Page is locked"""
-
-class SectionError(Error):
-    """The section specified by # does not exist"""
-
-class PageNotSaved(Error):
-    """Saving the page has failed"""
-
-class EditConflict(PageNotSaved):
-    """There has been an edit conflict while uploading the page"""
-
-class SpamfilterError(PageNotSaved):
-    """Saving the page has failed because the MediaWiki spam filter detected a blacklisted URL."""
-    def __init__(self, arg):
-        self.url = arg
-        self.args = arg,
-
-class LongPageError(PageNotSaved):
-    """Saving the page has failed because it is too long."""
-    def __init__(self, arg, arg2):
-        self.length = arg
-        self.limit = arg2
-
-class MaxTriesExceededError(PageNotSaved):
-    """Saving the page has failed because the maximum number of attempts has been reached"""
-
-class ServerError(Error):
-    """Got unexpected server response"""
-
-class BadTitle(Error):
-    """Server responded with BadTitle."""
-
-# UserBlocked exceptions should in general not be caught. If the bot has
-# been blocked, the bot operator should address the reason for the block
-# before continuing.
-class UserBlocked(Error):
-    """Your username or IP has been blocked"""
-
-class PageNotFound(Error):
-    """Page not found in list"""
-
-class CaptchaError(Error):
-    """Captcha is asked and config.solve_captcha == False."""
-
-class NoHash(Error):
-    """The API did not return a hash for the searched image; better to
-    raise an error than to guess."""
-
 SaxError = xml.sax._exceptions.SAXParseException
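The hierarchy of the moved exceptions matters to callers: EditConflict, SpamfilterError, LongPageError and MaxTriesExceededError all derive from PageNotSaved, so a bot can catch one specific failure or the whole family. A hypothetical save loop illustrating this (page and newtext are placeholders):

    try:
        page.put(newtext, comment=u'robot: cosmetic changes')
    except SpamfilterError, e:
        # subclass of PageNotSaved; carries the blacklisted URL
        output(u'Blacklisted URL: %s' % e.url)
    except EditConflict:
        output(u'Edit conflict on %s, skipping.' % page.title())
    except PageNotSaved:
        # catches the remaining save failures, e.g. LongPageError
        output(u'Saving failed.')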
# Pre-compile re expressions @@ -4172,17 +4102,6 @@
# Library functions
-def unescape(s):
-    """Replace escaped HTML-special characters by their originals"""
-    if '&' not in s:
-        return s
-    s = s.replace("&lt;", "<")
-    s = s.replace("&gt;", ">")
-    s = s.replace("&apos;", "'")
-    s = s.replace("&quot;", '"')
-    s = s.replace("&amp;", "&") # Must be last
-    return s
-
 def setAction(s):
     """Set a summary to use for changed page submissions"""
     global action
@@ -4367,628 +4286,6 @@
     finally:
         self.lock.release()
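unescape() moves out with the rest of the library code. Its replacement order is deliberate: "&amp;" must be handled last, otherwise an already-escaped entity would be decoded twice. A quick illustration of why:

    s = u'&amp;lt; is the escaped form of &lt;'
    # correct order (&lt; first, &amp; last):
    print s.replace("&lt;", "<").replace("&amp;", "&")
    # -> &lt; is the escaped form of <
    # the reversed order would first turn '&amp;lt;' into '&lt;' and
    # then decode it a second time to '<'.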
-# functions to manipulate wikitext strings (by default, all text arguments
-# should be Unicode)
-# All return the modified text as a unicode object
-
-def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
-                  allowoverlap=False, marker = '', site = None):
-    """
-    Return text with 'old' replaced by 'new', ignoring specified types of text.
-
-    Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
-    HTML comments. If caseInsensitive is true, then use case insensitive
-    regex matching. If allowoverlap is true, overlapping occurrences are all
-    replaced (watch out when using this, it might lead to infinite loops!).
-
-    Parameters:
-        text            - a unicode string
-        old             - a compiled regular expression
-        new             - a unicode string (which can contain regular
-                          expression references), or a function which takes
-                          a match object as parameter. See parameter repl of
-                          re.sub().
-        exceptions      - a list of strings which signal what to leave out,
-                          e.g. ['math', 'table', 'template']
-        caseInsensitive - a boolean
-        marker          - a string that will be added to the last replacement;
-                          if nothing is changed, it is added at the end
-
-    """
-    if site is None:
-        site = getSite()
-
-    exceptionRegexes = {
-        'comment': re.compile(r'(?s)<!--.*?-->'),
-        # section headers
-        'header': re.compile(r'\r\n=+.+=+ *\r\n'),
-        'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
-        'math': re.compile(r'(?is)<math>.*?</math>'),
-        'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'),
-        # wiki tags are ignored inside nowiki tags.
-        'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'),
-        # preformatted text
-        'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
-        'source': re.compile(r'(?is)<source .*?</source>'),
-        # inline references
-        'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
-        'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'),
-        # lines that start with a space are shown in a monospace font and
-        # have whitespace preserved.
-        'startspace': re.compile(r'(?m)^ (.*?)$'),
-        # tables often have whitespace that is used to improve wiki
-        # source code readability.
-        # TODO: handle nested tables.
-        'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
-        # templates with parameters often have whitespace that is used to
-        # improve wiki source code readability.
-        # 'template': re.compile(r'(?s){{.*?}}'),
-        # The regex above fails on nested templates. This regex can handle
-        # templates cascaded up to level 3, but no deeper. For arbitrary
-        # depth, we'd need recursion, which can't be done in Python's re.
-        # After all, the language of correctly nested braces is not regular.
-        'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
-        'hyperlink': compileLinkR(),
-        'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
-        # this matches internal wikilinks, but also interwiki, categories,
-        # and images.
-        'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
-        'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
-                                % '|'.join(site.validLanguageLinks()
-                                           + site.family.obsolete.keys())),
-    }
-
-    # if we got a string, compile it as a regular expression
-    if type(old) in [str, unicode]:
-        if caseInsensitive:
-            old = re.compile(old, re.IGNORECASE | re.UNICODE)
-        else:
-            old = re.compile(old)
-
-    dontTouchRegexes = []
-    for exc in exceptions:
-        if isinstance(exc, str) or isinstance(exc, unicode):
-            # assume it's a reference to the exceptionRegexes dictionary
-            # defined above.
-            if exc not in exceptionRegexes:
-                raise ValueError("Unknown tag type: " + exc)
-            dontTouchRegexes.append(exceptionRegexes[exc])
-            # handle alias
-            if exc == 'source':
-                dontTouchRegexes.append(re.compile(
-                    r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
-        else:
-            # assume it's a regular expression
-            dontTouchRegexes.append(exc)
-    index = 0
-    markerpos = len(text)
-    while True:
-        match = old.search(text, index)
-        if not match:
-            # nothing left to replace
-            break
-
-        # check which exception will occur next.
-        nextExceptionMatch = None
-        for dontTouchR in dontTouchRegexes:
-            excMatch = dontTouchR.search(text, index)
-            if excMatch and (
-                    nextExceptionMatch is None or
-                    excMatch.start() < nextExceptionMatch.start()):
-                nextExceptionMatch = excMatch
-
-        if nextExceptionMatch is not None and \
-           nextExceptionMatch.start() <= match.start():
-            # an HTML comment or text in nowiki tags stands before the next
-            # valid match. Skip.
-            index = nextExceptionMatch.end()
-        else:
-            # We found a valid match. Replace it.
-            if callable(new):
-                # the parameter new can be a function which takes the match
-                # as a parameter.
-                replacement = new(match)
-            else:
-                # it is not a function, but a string.
-
-                # a little hack to make a literal '\n' in the replacement
-                # work; it would be better to fix this earlier, but this is
-                # better than nothing.
-                new = new.replace('\\n', '\n')
-
-                # We cannot just insert the new string, as it may contain
-                # regex group references such as \2 or \g<name>.
-                # On the other hand, this approach does not work because it
-                # can't handle lookahead or lookbehind (see bug #1731008):
-                #replacement = old.sub(new, text[match.start():match.end()])
-                #text = text[:match.start()] + replacement + text[match.end():]
-
-                # So we have to process the group references manually.
-                replacement = new
-
-                groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
-                while True:
-                    groupMatch = groupR.search(replacement)
-                    if not groupMatch:
-                        break
-                    groupID = groupMatch.group('name') or \
-                              int(groupMatch.group('number'))
-                    replacement = replacement[:groupMatch.start()] + \
-                                  match.group(groupID) + \
-                                  replacement[groupMatch.end():]
-            text = text[:match.start()] + replacement + text[match.end():]
-
-            # continue the search on the remaining text
-            if allowoverlap:
-                index = match.start() + 1
-            else:
-                index = match.start() + len(replacement)
-            markerpos = match.start() + len(replacement)
-    text = text[:markerpos] + marker + text[markerpos:]
-    return text
-
-def removeDisabledParts(text, tags = ['*']):
-    """
-    Return text without portions where wiki markup is disabled.
-
-    Parts that can/will be removed are --
-    * HTML comments
-    * nowiki tags
-    * pre tags
-    * includeonly tags
-
-    The exact set of parts which should be removed can be passed as the
-    'tags' parameter, which defaults to all.
-    """
-    regexes = {
-        'comments': r'<!--.*?-->',
-        'includeonly': r'<includeonly>.*?</includeonly>',
-        'nowiki': r'<nowiki>.*?</nowiki>',
-        'pre': r'<pre>.*?</pre>',
-        'source': r'<source .*?</source>',
-    }
-    if '*' in tags:
-        tags = regexes.keys()
-    toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
-                           re.IGNORECASE | re.DOTALL)
-    return toRemoveR.sub('', text)
-
-def isDisabled(text, index, tags = ['*']):
-    """
-    Return True if text[index] is disabled, e.g. by a comment or nowiki tags.
-
-    For the tags parameter, see removeDisabledParts() above.
-    """
-    # Find a marker that is not already in the text.
-    marker = findmarker(text, '@@', '@')
-    text = text[:index] + marker + text[index:]
-    text = removeDisabledParts(text, tags)
-    return (marker not in text)
-
-def findmarker(text, startwith = u'@', append = u'@'):
-    # find a string which is not part of text
-    if len(append) <= 0:
-        append = u'@'
-    mymarker = startwith
-    while mymarker in text:
-        mymarker += append
-    return mymarker
-
-def expandmarker(text, marker = '', separator = ''):
-    # Grow the marker backwards so that it also covers any number of
-    # separator occurrences, plus arbitrary whitespace before, after,
-    # and between them.
-    if separator:
-        firstinmarker = text.find(marker)
-        firstinseparator = firstinmarker
-        lenseparator = len(separator)
-        striploopcontinue = True
-        while firstinseparator > 0 and striploopcontinue:
-            striploopcontinue = False
-            if (firstinseparator >= lenseparator) and \
-               (separator == text[firstinseparator-lenseparator:firstinseparator]):
-                firstinseparator -= lenseparator
-                striploopcontinue = True
-            elif text[firstinseparator-1] < ' ':
-                firstinseparator -= 1
-                striploopcontinue = True
-        marker = text[firstinseparator:firstinmarker] + marker
-    return marker
-
-# Part of library dealing with interwiki language links
-
-# Note - MediaWiki supports two kinds of interwiki links: interlanguage and
-# interproject. These functions only deal with links to a corresponding page
-# on the same project (e.g., Wikipedia, Wiktionary, etc.) in another
-# language. They do not find or change links to a different project, or any
-# that are formatted as in-line interwiki links (e.g., "[[:es:Articulo]]").
-# (CONFIRM)
-
-def getLanguageLinks(text, insite = None, pageLink = "[[]]",
-                     template_subpage = False):
-    """
-    Return a dict of interlanguage links found in text.
-
-    The dict uses language codes as keys and Page objects as values.
-    Do not call this routine directly, use the Page.interwiki() method
-    instead.
-
-    """
-    if insite is None:
-        insite = getSite()
-    result = {}
-    # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
-    # and HTML comments
-    tags = ['comments', 'nowiki', 'pre', 'source']
-    if not template_subpage:
-        tags += ['includeonly']
-    text = removeDisabledParts(text, tags)
-
-    # This regular expression will find every link that is possibly an
-    # interwiki link.
-    # NOTE: language codes are case-insensitive and only consist of basic
-    # latin letters and hyphens.
-    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
-    for lang, pagetitle in interwikiR.findall(text):
-        lang = lang.lower()
-        # Check if it really is in fact an interwiki link to a known
-        # language, or if it's e.g. a category tag or an internal link
-        if lang in insite.family.obsolete:
-            lang = insite.family.obsolete[lang]
-        if lang in insite.validLanguageLinks():
-            if '|' in pagetitle:
-                # ignore text after the pipe
-                pagetitle = pagetitle[:pagetitle.index('|')]
-            # we want the actual page objects rather than the titles
-            site = insite.getSite(code = lang)
-            try:
-                result[site] = Page(site, pagetitle, insite = insite)
-            except InvalidTitle:
-                output(u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
-                       % (lang, pagetitle))
-                continue
-    return result
-
-def removeLanguageLinks(text, site = None, marker = ''):
-    """Return text with all interlanguage links removed.
-
-    If a link to an unknown language is encountered, a warning is printed.
-    If a marker is defined, that string is placed at the location of the
-    last occurrence of an interwiki link (at the end if there are no
-    interwiki links).
-
-    """
-    if site is None:
-        site = getSite()
-    if not site.validLanguageLinks():
-        return text
-    # This regular expression will find every interwiki link, plus trailing
-    # whitespace.
-    languages = '|'.join(site.validLanguageLinks()
-                         + site.family.obsolete.keys())
-    interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*' % languages,
-                            re.IGNORECASE)
-    text = replaceExcept(text, interwikiR, '',
-                         ['nowiki', 'comment', 'math', 'pre', 'source'],
-                         marker=marker)
-    return text.strip()
-
-def removeLanguageLinksAndSeparator(text, site = None, marker = '',
-                                    separator = ''):
-    """Return text with all interlanguage links, plus any preceding
-    whitespace and separator occurrences, removed.
-
-    If a link to an unknown language is encountered, a warning is printed.
-    If a marker is defined, that string is placed at the location of the
-    last occurrence of an interwiki link (at the end if there are no
-    interwiki links).
-
-    """
-    if separator:
-        mymarker = findmarker(text, u'@L@')
-        newtext = removeLanguageLinks(text, site, mymarker)
-        mymarker = expandmarker(newtext, mymarker, separator)
-        return newtext.replace(mymarker, marker)
-    else:
-        return removeLanguageLinks(text, site, marker)
-
-def replaceLanguageLinks(oldtext, new, site = None, addOnly = False,
-                         template = False, template_subpage = False):
-    """Replace interlanguage links in the text with a new set of links.
-
-    'new' should be a dict with Site objects as keys and Page objects as
-    values (i.e., just like the dict returned by the getLanguageLinks
-    function).
-
-    """
-    # Find a marker that is not already in the text.
-    marker = findmarker(oldtext, u'@@')
-    if site is None:
-        site = getSite()
-    separator = site.family.interwiki_text_separator
-    cseparator = site.family.category_text_separator
-    separatorstripped = separator.strip()
-    cseparatorstripped = cseparator.strip()
-    if addOnly:
-        s2 = oldtext
-    else:
-        s2 = removeLanguageLinksAndSeparator(oldtext, site = site,
                                             marker = marker,
-                                             separator = separatorstripped)
-    s = interwikiFormat(new, insite = site)
-    if s:
-        if site.language() in site.family.interwiki_attop:
-            newtext = s + separator + s2.replace(marker, '').strip()
-        else:
-            # calculate what was after the language links on the page
-            firstafter = s2.find(marker)
-            if firstafter < 0:
-                firstafter = len(s2)
-            else:
-                firstafter += len(marker)
-            # Is there any text in the 'after' part that should stay after
-            # the links?
-            if "</noinclude>" in s2[firstafter:]:
-                if separatorstripped:
-                    s = separator + s
-                newtext = s2[:firstafter].replace(marker, '') + s \
-                          + s2[firstafter:]
-            elif site.language() in site.family.categories_last:
-                cats = getCategoryLinks(s2, site = site)
-                s2 = removeCategoryLinksAndSeparator(
-                    s2.replace(marker, '').strip(), site, '',
-                    cseparatorstripped) + separator + s
-                newtext = replaceCategoryLinks(s2, cats, site=site,
-                                               addOnly=True)
-            elif site.family.name == 'wikitravel':
-                # for Wikitravel's language links position.
-                s = separator + s + separator
-                newtext = s2[:firstafter].replace(marker, '') + s \
-                          + s2[firstafter:]
-            else:
-                if template or template_subpage:
-                    if template_subpage:
-                        includeOn = '<includeonly>'
-                        includeOff = '</includeonly>'
-                    else:
-                        includeOn = '<noinclude>'
-                        includeOff = '</noinclude>'
-                        separator = ''
-                    # Do we have a noinclude at the end of the template?
-                    parts = s2.split(includeOff)
-                    lastpart = parts[-1]
-                    if re.match('\s*%s' % marker, lastpart):
-                        # Put the langlinks back into the noinclude's
-                        regexp = re.compile('%s\s*%s' % (includeOff, marker))
-                        newtext = regexp.sub(s + includeOff, s2)
-                    else:
-                        # Put the langlinks at the end, inside noinclude's
-                        newtext = s2.replace(marker, '').strip() + separator \
-                                  + u'%s\n%s%s\n' % (includeOn, s, includeOff)
-                else:
-                    newtext = s2.replace(marker, '').strip() + separator + s
-    else:
-        newtext = s2.replace(marker, '')
-    return newtext
-
-def interwikiFormat(links, insite = None):
-    """Convert an interwiki link dict into a wikitext string.
-
-    'links' should be a dict with Site objects as keys and Page objects
-    as values.
-
-    Return a unicode string that is formatted for inclusion in insite
-    (defaulting to the current site).
-
-    """
-    if insite is None:
-        insite = getSite()
-    if not links:
-        return ''
-
-    ar = interwikiSort(links.keys(), insite)
-    s = []
-    for site in ar:
-        try:
-            link = links[site].aslink(forceInterwiki=True).replace('[[:', '[[')
-            s.append(link)
-        except AttributeError:
-            s.append(getSite(site).linkto(links[site], othersite=insite))
-    if insite.lang in insite.family.interwiki_on_one_line:
-        sep = u' '
-    else:
-        sep = u'\r\n'
-    s = sep.join(s) + u'\r\n'
-    return s
-
-# Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
-    if insite is None:
-        insite = getSite()
-    if not sites:
-        return []
-
-    sites.sort()
-    putfirst = insite.interwiki_putfirst()
-    if putfirst:
-        # In this case I might have to change the order
-        firstsites = []
-        for code in putfirst:
-            # The code may not exist in this family?
-            if code in insite.family.obsolete:
-                code = insite.family.obsolete[code]
-            if code in insite.validLanguageLinks():
-                site = insite.getSite(code = code)
-                if site in sites:
-                    del sites[sites.index(site)]
-                    firstsites = firstsites + [site]
-        sites = firstsites + sites
-    if insite.interwiki_putfirst_doubled(sites):
-        # some implementations return False
-        sites = insite.interwiki_putfirst_doubled(sites) + sites
-
-    return sites
-
-# Wikitext manipulation functions dealing with category links
-
-def getCategoryLinks(text, site):
-    """Return a list of category links found in text.
-
-    The list contains Category objects.
-    Do not call this routine directly, use Page.categories() instead.
-
-    """
-    import catlib
-    result = []
-    # Ignore category links within nowiki tags, pre tags, includeonly tags,
-    # and HTML comments
-    text = removeDisabledParts(text)
-    catNamespace = '|'.join(site.category_namespaces())
-    R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)'
-                   r'(?:\|(?P<sortKey>.+?))?\s*\]\]' % catNamespace, re.I)
-    for match in R.finditer(text):
-        cat = catlib.Category(site, '%s:%s' % (match.group('namespace'),
-                                               match.group('catName')),
-                              sortKey = match.group('sortKey'))
-        result.append(cat)
-    return result
-
-def removeCategoryLinks(text, site, marker = ''):
-    """Return text with all category links removed.
-
-    Put the string marker after the last replacement (at the end of the
-    text if there is no replacement).
-
-    """
-    # This regular expression will find every link that is possibly a
-    # category link, plus trailing whitespace.
-    catNamespace = '|'.join(site.category_namespaces())
-    categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
-    text = replaceExcept(text, categoryR, '',
-                         ['nowiki', 'comment', 'math', 'pre', 'source'],
-                         marker = marker)
-    if marker:
-        # avoid having multiple linefeeds at the end of the text
-        text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker,
-                      text.strip())
-    return text.strip()
-
-def removeCategoryLinksAndSeparator(text, site = None, marker = '',
-                                    separator = ''):
-    """Return text with all category links, plus any preceding whitespace
-    and separator occurrences, removed.
-
-    Put the string marker after the last replacement (at the end of the
-    text if there is no replacement).
-
-    """
-    if separator:
-        mymarker = findmarker(text, u'@C@')
-        newtext = removeCategoryLinks(text, site, mymarker)
-        mymarker = expandmarker(newtext, mymarker, separator)
-        return newtext.replace(mymarker, marker)
-    else:
-        return removeCategoryLinks(text, site, marker)
-
-def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
-    """Replace the category oldcat with the category newcat and return
-    the modified text.
-
-    """
-    if site is None:
-        site = getSite()
-
-    catNamespace = '|'.join(site.category_namespaces())
-    title = oldcat.titleWithoutNamespace()
-    if not title:
-        return
-    # title might contain regex special characters
-    title = re.escape(title)
-    # title might not be capitalized correctly on the wiki
-    if title[0].isalpha() and not site.nocapitalize:
-        title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
-    # spaces and underscores in page titles are interchangeable and
-    # collapsible
-    title = title.replace(r"\ ", "[ _]+").replace(r"_", "[ _]+")
-    categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])'
-                           % (catNamespace, title), re.I)
-    if newcat is None:
-        text = replaceExcept(oldtext, categoryR, '',
-                             ['nowiki', 'comment', 'math', 'pre', 'source'])
-    else:
-        text = replaceExcept(oldtext, categoryR,
-                             '[[%s:%s\\2' % (site.namespace(14),
-                                             newcat.titleWithoutNamespace()),
-                             ['nowiki', 'comment', 'math', 'pre', 'source'])
-    return text
-
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
-    """Replace the category links given in the wikitext given in oldtext
-    by the new links given in new.
-
-    'new' should be a list of Category objects.
-
-    If addOnly is True, the old categories won't be deleted and the
-    categories given will just be added (and so they won't replace
-    anything).
-
-    """
-    # Find a marker that is not already in the text.
-    marker = findmarker(oldtext, u'@@')
-    if site is None:
-        site = getSite()
-    if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
-        raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#...')
-    separator = site.family.category_text_separator
-    iseparator = site.family.interwiki_text_separator
-    separatorstripped = separator.strip()
-    iseparatorstripped = iseparator.strip()
-    if addOnly:
-        s2 = oldtext
-    else:
-        s2 = removeCategoryLinksAndSeparator(oldtext, site = site,
-                                             marker = marker,
-                                             separator = separatorstripped)
-    s = categoryFormat(new, insite = site)
-    if s:
-        if site.language() in site.family.category_attop:
-            newtext = s + separator + s2
-        else:
-            # calculate what was after the category links on the page
-            firstafter = s2.find(marker)
-            if firstafter < 0:
-                firstafter = len(s2)
-            else:
-                firstafter += len(marker)
-            # Is there any text in the 'after' part that should stay after
-            # the links?
-            if "</noinclude>" in s2[firstafter:]:
-                if separatorstripped:
-                    s = separator + s
-                newtext = s2[:firstafter].replace(marker, '') + s \
-                          + s2[firstafter:]
-            elif site.language() in site.family.categories_last:
-                newtext = s2.replace(marker, '').strip() + separator + s
-            else:
-                interwiki = getLanguageLinks(s2)
-                s2 = removeLanguageLinksAndSeparator(
-                    s2.replace(marker, ''), site, '',
-                    iseparatorstripped) + separator + s
-                newtext = replaceLanguageLinks(s2, interwiki, site = site,
-                                               addOnly = True)
-    else:
-        newtext = s2.replace(marker, '')
-    return newtext.strip()
-
-def categoryFormat(categories, insite = None):
-    """Return a string containing links to all categories in a list.
-
-    'categories' should be a list of Category objects.
-
-    The string is formatted for inclusion in insite.
-
-    """
-    if not categories:
-        return ''
-    if insite is None:
-        insite = getSite()
-    catLinks = [category.aslink(noInterwiki = True) for category in categories]
-    if insite.category_on_one_line():
-        sep = ' '
-    else:
-        sep = '\r\n'
-    # Some people don't like the categories sorted
-    #catLinks.sort()
-    return sep.join(catLinks) + '\r\n'
-
-def compileLinkR(withoutBracketed=False, onlyBracketed=False):
-    """Return a regex that matches external links."""
-    # RFC 2396 says that URLs may only contain certain characters.
-    # For this regex we also accept non-allowed characters, so that the bot
-    # will later show these links as broken ('Non-ASCII Characters in URL').
-    # Note: While allowing parentheses inside URLs, MediaWiki will regard
-    # a right parenthesis at the end of the URL as not part of that URL.
-    # The same applies to dot, comma, colon and some other characters.
-    notAtEnd = '\]\s\)\.:;,<>"'
-    # So characters inside the URL can be anything except whitespace,
-    # closing square brackets, quotation marks, greater than and less
-    # than, and the last character also can't be a parenthesis or another
-    # character disallowed by MediaWiki.
-    notInside = '\]\s<>"'
-    # The first half of this regular expression is required because '' is
-    # not allowed inside links. For example, in this wiki text:
-    #       ''Please see http://www.example.org.''
-    # .'' shouldn't be considered as part of the link.
-    regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
-            + '](?=[' + notAtEnd + ']*\'\')|http[s]?://[^' + notInside \
-            + ']*[^' + notAtEnd + '])'
-
-    if withoutBracketed:
-        regex = r'(?<!\[)' + regex
-    elif onlyBracketed:
-        regex = r'\[' + regex
-    linkR = re.compile(regex)
-    return linkR
-
-# end of category specific code
 def url2link(percentname, insite, site):
     """Convert urlname of a wiki page into interwiki link format.
@@ -8130,247 +7427,6 @@
 # Set socket timeout
 socket.setdefaulttimeout(config.socket_timeout)
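All the wikitext helpers deleted in this hunk continue to exist in the library; replaceExcept() in particular is the workhorse that removeLanguageLinks() and removeCategoryLinks() above are built on. A short sketch of its contract, assuming the moved functions keep their signatures:

    import re

    text = u'<nowiki>[[de:Beispiel]]</nowiki>\n[[de:Beispiel]]'
    # The 'nowiki' exception protects the first link; only the bare
    # occurrence is replaced:
    cleaned = replaceExcept(text, re.compile(r'\[\[de:Beispiel\]\]'),
                            u'', ['nowiki'])
    # cleaned == u'<nowiki>[[de:Beispiel]]</nowiki>\n'

    # removeDisabledParts() does the complementary job and strips the
    # protected regions themselves:
    removeDisabledParts(text)  # -> u'\n[[de:Beispiel]]'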
-# Languages to use for comment text after the actual language but before
-# en:. For example, if for language 'xx', you want the preference of
-# languages to be:
-# xx:, then fr:, then ru:, then en:
-# you let altlang return ['fr', 'ru'].
-# This code is used by translate() below.
-
-def altlang(code):
-    # Amharic
-    if code in ['aa', 'om']:
-        return ['am']
-    # Arabic
-    if code in ['arc', 'arz']:
-        return ['ar']
-    if code == 'kab':
-        return ['ar', 'fr']
-    # Bulgarian
-    if code in ['cu', 'mk']:
-        return ['bg', 'sr', 'sh']
-    # Czech
-    if code in ['cs', 'sk']:
-        return ['cs', 'sk']
-    # German
-    if code in ['bar', 'ksh', 'pdc']:
-        return ['de']
-    if code in ['als', 'lb']:
-        return ['de', 'fr']
-    if code == 'nds':
-        return ['nds-nl', 'de']
-    if code in ['dsb', 'hsb']:
-        return ['hsb', 'dsb', 'de']
-    if code == 'rm':
-        return ['de', 'it']
-    if code == 'stq':
-        return ['fy', 'de']
-    # Greek
-    if code == 'pnt':
-        return ['el']
-    # Esperanto
-    if code in ['io', 'nov']:
-        return ['eo']
-    # Spanish
-    if code in ['an', 'ast', 'ay', 'ca', 'ext', 'lad', 'nah', 'nv', 'qu']:
-        return ['es']
-    if code in ['gl', 'gn']:
-        return ['es', 'pt']
-    if code == 'eu':
-        return ['es', 'fr']
-    if code in ['bcl', 'cbk-zam', 'ceb', 'ilo', 'pag', 'pam', 'tl', 'war']:
-        return ['es', 'tl']
-    # Estonian
-    if code == 'fiu-vro':
-        return ['et']
-    # Persian (Farsi)
-    if code in ['glk', 'mzn']:
-        return ['ar']
-    # French
-    if code in ['bm', 'br', 'ht', 'kab', 'kg', 'ln', 'mg', 'nrm', 'oc',
-                'pcd', 'rw', 'sg', 'ty', 'wa']:
-        return ['fr']
-    if code == 'co':
-        return ['fr', 'it']
-    # Hindi
-    if code in ['bh', 'pi', 'sa']:
-        return ['hi']
-    if code in ['ne', 'new']:
-        return ['ne', 'new', 'hi']
-    # Indonesian and Malay
-    if code in ['ace', 'bug', 'id', 'jv', 'ms', 'su']:
-        return ['id', 'ms', 'jv']
-    if code == 'map-bms':
-        return ['jv', 'id', 'ms']
-    # Inuit languages
-    if code in ['ik', 'iu']:
-        return ['iu', 'kl']
-    if code == 'kl':
-        return ['iu', 'da', 'no']
-    # Italian
-    if code in ['eml', 'fur', 'lij', 'lmo', 'nap', 'pms', 'roa-tara', 'sc',
-                'scn', 'vec']:
-        return ['it']
-    if code == 'frp':
-        return ['it', 'fr']
-    # Lithuanian
-    if code in ['bat-smg', 'ltg']:
-        return ['lt']
-    # Dutch
-    if code in ['fy', 'li', 'pap', 'srn', 'vls', 'zea']:
-        return ['nl']
-    if code == 'nds-nl':
-        return ['nds', 'nl']
-    # Polish
-    if code in ['csb', 'szl']:
-        return ['pl']
-    # Portuguese
-    if code in ['fab', 'mwl', 'tet']:
-        return ['pt']
-    # Romanian
-    if code in ['mo', 'roa-rup']:
-        return ['ro']
-    # Russian and Belarusian
-    if code in ['ab', 'av', 'ba', 'bxr', 'ce', 'cv', 'kk', 'ky', 'lbe',
-                'mdf', 'mhr', 'myv', 'os', 'sah', 'tg', 'tt', 'udm', 'uk',
-                'xal']:
-        return ['ru']
-    if code in ['be', 'be-x-old']:
-        return ['be', 'be-x-old', 'ru']
-    if code == 'kaa':
-        return ['uz', 'ru']
-    # Serbo-Croatian
-    if code in ['bs', 'hr', 'sh', 'sr']:
-        return ['sh', 'hr', 'bs', 'sr']
-    # Turkish and Kurdish
-    if code in ['diq', 'ku']:
-        return ['ku', 'tr']
-    if code == 'ckb':
-        return ['ku', 'ar']
-    # Chinese
-    if code in ['minnan', 'zh', 'zh-classical', 'zh-min-nan', 'zh-tw',
-                'zh-hans', 'zh-hant']:
-        return ['zh', 'zh-tw', 'zh-cn', 'zh-classical']
-    if code in ['cdo', 'gan', 'hak', 'ii', 'wuu', 'za', 'zh-cdo',
-                'zh-classical', 'zh-cn', 'zh-yue']:
-        return ['zh', 'zh-cn', 'zh-tw', 'zh-classical']
-    # Scandinavian languages
-    if code in ['da', 'sv']:
-        return ['da', 'no', 'nb', 'sv', 'nn']
-    if code in ['fo', 'is']:
-        return ['da', 'no', 'nb', 'nn', 'sv']
-    if code == 'nn':
-        return ['no', 'nb', 'sv', 'da']
-    if code in ['nb', 'no']:
-        return ['no', 'nb', 'da', 'nn',
-                'sv']
-    if code == 'se':
-        return ['sv', 'no', 'nb', 'nn', 'fi']
-    # Other languages
-    if code in ['bi', 'tpi']:
-        return ['bi', 'tpi']
-    if code == 'yi':
-        return ['he', 'de']
-    if code in ['ia', 'ie']:
-        return ['ia', 'la', 'it', 'fr', 'es']
-    # Default value
-    return []
-
-def translate(code, xdict):
-    """Return the most appropriate translation from a translation dict.
-
-    Given a language code and a dictionary, returns the dictionary's value
-    for key 'code' if this key exists; otherwise tries to return a value
-    for an alternative language that is most applicable to use on the
-    Wikipedia in language 'code'.
-
-    The language itself is always checked first, then languages that have
-    been defined to be alternatives, and finally English. If none of the
-    options gives a result, we just take the first language in the list.
-
-    """
-    # If a site is given instead of a code, use its language
-    if hasattr(code, 'lang'):
-        code = code.lang
-
-    # xdict may hold a separate dict per family; if so, use the current
-    # family's dict, falling back to the 'wikipedia' entry.
-    if 'wikipedia' in xdict:
-        if default_family in xdict:
-            xdict = xdict[default_family]
-        else:
-            xdict = xdict['wikipedia']
-
-    if type(xdict) != dict:
-        return xdict
-
-    if code in xdict:
-        return xdict[code]
-    for alt in altlang(code):
-        if alt in xdict:
-            return xdict[alt]
-    if '_default' in xdict:
-        return xdict['_default']
-    elif 'en' in xdict:
-        return xdict['en']
-    return xdict.values()[0]
-
-def showDiff(oldtext, newtext):
-    """
-    Print a string showing the differences between oldtext and newtext.
-    The differences are highlighted (only on Unix systems) to show which
-    changes were made.
-    """
-    # For information on difflib, see
-    # http://docs.python.org/library/difflib.html
-    color = {
-        '+': 'lightgreen',
-        '-': 'lightred',
-    }
-    diff = u''
-    colors = []
-    # This will store the last line beginning with + or -.
-    lastline = None
-    # For testing purposes only: show original, uncolored diff
-    # for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
-    #     print line
-    for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
-        if line.startswith('?'):
-            # initialize color vector with None, which means default color
-            lastcolors = [None for c in lastline]
-            # colorize the + or - sign
-            lastcolors[0] = color[lastline[0]]
-            # colorize changed parts in red or green
-            for i in range(min(len(line), len(lastline))):
-                if line[i] != ' ':
-                    lastcolors[i] = color[lastline[0]]
-            diff += lastline + '\n'
-            # append one None (default color) for the newline character
-            colors += lastcolors + [None]
-        elif lastline:
-            diff += lastline + '\n'
-            # colorize the + or - sign only
-            lastcolors = [None for c in lastline]
-            lastcolors[0] = color[lastline[0]]
-            colors += lastcolors + [None]
-        lastline = None
-        if line[0] in ('+', '-'):
-            lastline = line
-    # there might be one + or - line left that wasn't followed by a ? line.
-    if lastline:
-        diff += lastline + '\n'
-        # colorize the + or - sign only
-        lastcolors = [None for c in lastline]
-        lastcolors[0] = color[lastline[0]]
-        colors += lastcolors + [None]
-
-    result = u''
-    lastcolor = None
-    for i in range(len(diff)):
-        if colors[i] != lastcolor:
-            if lastcolor is None:
-                result += '\03{%s}' % colors[i]
-            else:
-                result += '\03{default}'
-            lastcolor = colors[i]
-        result += diff[i]
-    output(result)
-
 def writeToCommandLogFile():
     """
     Save the name of the called module along with all parameters to
@@ -8696,7 +7752,7 @@
     output( u'ERROR: %s caused error %s. Dump %s created.' % (name,error,filename) )
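altlang() and translate() cooperate: translate() tries the exact code first, then every fallback code altlang() yields, then '_default' and 'en'. A small worked example (the message dict is made up):

    msg = {
        'en': u'Robot: Adding interwiki links',
        'de': u'Bot: Ergänze Interwikilinks',
    }
    translate('de', msg)   # exact hit -> the German text
    translate('als', msg)  # altlang('als') == ['de', 'fr'], so 'de' wins
    translate('xx', msg)   # no hit, empty fallback list -> 'en' is used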
 get_throttle = Throttle(config.minthrottle,config.maxthrottle)
-put_throttle = Throttle(config.put_throttle,config.put_throttle,False)
+put_throttle = Throttle(config.put_throttle,config.put_throttle,multiplydelay=False)
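Passing multiplydelay by keyword is the one functional tweak beside the new import: now that Throttle is shared library code, the keyword spelling keeps this call site correct even if a parameter is added or reordered in pywikibot, whereas a bare positional False could silently land on the wrong parameter. The constructor shape this call assumes (a sketch, not the actual definition):

    # class Throttle:
    #     def __init__(self, mindelay=config.minthrottle,
    #                  maxdelay=config.maxthrottle, multiplydelay=True):
    #         ...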
def decompress_gzip(data): # Use cStringIO if available