Revision: 7961
Author:   xqt
Date:     2010-02-25 13:42:02 +0000 (Thu, 25 Feb 2010)
Log Message:
-----------
activate wikipedia library
Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py      2010-02-25 10:28:15 UTC (rev 7960)
+++ trunk/pywikipedia/wikipedia.py      2010-02-25 13:42:02 UTC (rev 7961)
@@ -139,6 +139,8 @@
 import xmlreader
 from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, SoupStrainer
 import weakref
+# Splitting the bot into library parts
+from pywikibot import *
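The two added lines are the whole point of the commit: wikipedia.py now pulls the pieces split out into the new pywikibot package (the exception classes and text-manipulation helpers removed below) back into its own namespace, so existing scripts keep working unchanged. A minimal sketch of the calling pattern this preserves (page title and language are illustrative; a configured user-config.py is assumed):

    import wikipedia

    # Names that moved to pywikibot (e.g. the exception classes removed
    # below) are re-exported by the wildcard import, so legacy code that
    # only knows the wikipedia module is unaffected.
    site = wikipedia.getSite('en', 'wikipedia')
    page = wikipedia.Page(site, u'Sandbox')
    print page.title()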
 
 # Set the locale to system default. This will ensure correct string
 # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no

@@ -161,78 +163,6 @@
 WIDEBUILD = False
-# Local exceptions
-
-class Error(Exception):
-    """Wikipedia error"""
-
-class NoUsername(Error):
-    """Username is not in user-config.py"""
-
-class NoPage(Error):
-    """Page does not exist"""
-
-class NoSuchSite(Error):
-    """Site does not exist"""
-
-class IsRedirectPage(Error):
-    """Page is a redirect page"""
-
-class IsNotRedirectPage(Error):
-    """Page is not a redirect page"""
-
-class InvalidTitle(Error):
-    """Invalid page title"""
-
-class LockedPage(Error):
-    """Page is locked"""
-
-class SectionError(Error):
-    """The section specified by # does not exist"""
-
-class PageNotSaved(Error):
-    """Saving the page has failed"""
-
-class EditConflict(PageNotSaved):
-    """There has been an edit conflict while uploading the page"""
-
-class SpamfilterError(PageNotSaved):
-    """Saving the page has failed because the MediaWiki spam filter detected a blacklisted URL."""
-    def __init__(self, arg):
-        self.url = arg
-        self.args = arg,
-
-class LongPageError(PageNotSaved):
-    """Saving the page has failed because it is too long."""
-    def __init__(self, arg, arg2):
-        self.length = arg
-        self.limit = arg2,
-
-class MaxTriesExceededError(PageNotSaved):
-    """Saving the page has failed because the maximum number of attempts has been reached"""
-
-class ServerError(Error):
-    """Got unexpected server response"""
-
-class BadTitle(Error):
-    """Server responded with BadTitle."""
-
-# UserBlocked exceptions should in general not be caught. If the bot has
-# been blocked, the bot operator should address the reason for the block
-# before continuing.
-class UserBlocked(Error):
-    """Your username or IP has been blocked"""
-
-class PageNotFound(Error):
-    """Page not found in list"""
-
-class CaptchaError(Error):
-    """Captcha is asked and config.solve_captcha == False."""
-
-class NoHash(Error):
-    """ The APIs don't return any Hash for the image searched.
-        Really Strange, better to raise an error. """
-
 SaxError = xml.sax._exceptions.SAXParseException
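The exception hierarchy moves verbatim into the pywikibot package; thanks to the wildcard import above, callers can keep addressing it through the wikipedia module. A sketch of the usual save-loop handling of the PageNotSaved subclasses (summary text illustrative; old-style except syntax as used throughout this file):

    try:
        page.put(newtext, comment=u'Robot: cosmetic changes')
    except wikipedia.EditConflict:
        pass  # someone edited in between; skip rather than overwrite
    except wikipedia.SpamfilterError, e:
        wikipedia.output(u'Blacklisted URL %s blocked the save.' % e.url)
    except wikipedia.LockedPage:
        wikipedia.output(u'Page is locked; skipping.')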
 # Pre-compile re expressions

@@ -4172,17 +4102,6 @@
 # Library functions
 
-def unescape(s):
-    """Replace escaped HTML-special characters by their originals"""
-    if '&' not in s:
-        return s
-    s = s.replace("&lt;", "<")
-    s = s.replace("&gt;", ">")
-    s = s.replace("&apos;", "'")
-    s = s.replace("&quot;", '"')
-    s = s.replace("&amp;", "&") # Must be last
-    return s
-
 def setAction(s):
     """Set a summary to use for changed page submissions"""
     global action
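unescape() moves out as well; the snippet below shows the behavior the library version is expected to keep, with &amp; handled last so already-unescaped ampersands are not double-processed (doctest-style; input illustrative):

    >>> wikipedia.unescape(u'&lt;ref&gt;Tom &amp; Jerry&lt;/ref&gt;')
    u'<ref>Tom & Jerry</ref>'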
@@ -4367,628 +4286,6 @@
         finally:
             self.lock.release()
 
-# functions to manipulate wikitext strings (by default, all text arguments
-# should be Unicode)
-# All return the modified text as a unicode object
-
-def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
-                  allowoverlap=False, marker = '', site = None):
-    """
-    Return text with 'old' replaced by 'new', ignoring specified types of text.
-
-    Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
-    HTML comments. If caseInsensitive is true, then use case insensitive
-    regex matching. If allowoverlap is true, overlapping occurrences are all
-    replaced (watch out when using this, it might lead to infinite loops!).
-
-    Parameters:
-        text            - a unicode string
-        old             - a compiled regular expression
-        new             - a unicode string (which can contain regular
-                          expression references), or a function which takes
-                          a match object as parameter. See parameter repl of
-                          re.sub().
-        exceptions      - a list of strings which signal what to leave out,
-                          e.g. ['math', 'table', 'template']
-        caseInsensitive - a boolean
-        marker          - a string that will be added to the last replacement;
-                          if nothing is changed, it is added at the end
-
-    """
-    if site is None:
-        site = getSite()
-
-    exceptionRegexes = {
-        'comment':     re.compile(r'(?s)<!--.*?-->'),
-        # section headers
-        'header':      re.compile(r'\r\n=+.+=+ *\r\n'),
-        'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
-        'math':        re.compile(r'(?is)<math>.*?</math>'),
-        'noinclude':   re.compile(r'(?is)<noinclude>.*?</noinclude>'),
-        # wiki tags are ignored inside nowiki tags.
-        'nowiki':      re.compile(r'(?is)<nowiki>.*?</nowiki>'),
-        # preformatted text
-        'pre':         re.compile(r'(?ism)<pre>.*?</pre>'),
-        'source':      re.compile(r'(?is)<source .*?</source>'),
-        # inline references
-        'ref':         re.compile(r'(?ism)<ref[ >].*?</ref>'),
-        'timeline':    re.compile(r'(?is)<timeline>.*?</timeline>'),
-        # lines that start with a space are shown in a monospace font and
-        # have whitespace preserved.
-        'startspace':  re.compile(r'(?m)^ (.*?)$'),
-        # tables often have whitespace that is used to improve wiki
-        # source code readability.
-        # TODO: handle nested tables.
-        'table':       re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
-        # templates with parameters often have whitespace that is used to
-        # improve wiki source code readability.
-        # 'template':    re.compile(r'(?s){{.*?}}'),
-        # The regex above fails on nested templates. This regex can handle
-        # templates cascaded up to level 3, but no deeper. For arbitrary
-        # depth, we'd need recursion which can't be done in Python's re.
-        # After all, the language of correct parenthesis words is not regular.
-        'template':    re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
-        'hyperlink':   compileLinkR(),
-        'gallery':     re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
-        # this matches internal wikilinks, but also interwiki, categories, and
-        # images.
-        'link':        re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
-        'interwiki':   re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
-                                  % '|'.join(site.validLanguageLinks()
-                                             + site.family.obsolete.keys())),
-    }
-
-    # if we got a string, compile it as a regular expression
-    if type(old) in [str, unicode]:
-        if caseInsensitive:
-            old = re.compile(old, re.IGNORECASE | re.UNICODE)
-        else:
-            old = re.compile(old)
-
-    dontTouchRegexes = []
-    for exc in exceptions:
-        if isinstance(exc, str) or isinstance(exc, unicode):
-            # assume it's a reference to the exceptionRegexes dictionary
-            # defined above.
-            if exc not in exceptionRegexes:
-                raise ValueError("Unknown tag type: " + exc)
-            dontTouchRegexes.append(exceptionRegexes[exc])
-            # handle alias
-            if exc == 'source':
-                dontTouchRegexes.append(re.compile(
-                    r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
-        else:
-            # assume it's a regular expression
-            dontTouchRegexes.append(exc)
-    index = 0
-    markerpos = len(text)
-    while True:
-        match = old.search(text, index)
-        if not match:
-            # nothing left to replace
-            break
-
-        # check which exception will occur next.
-        nextExceptionMatch = None
-        for dontTouchR in dontTouchRegexes:
-            excMatch = dontTouchR.search(text, index)
-            if excMatch and (
-                    nextExceptionMatch is None or
-                    excMatch.start() < nextExceptionMatch.start()):
-                nextExceptionMatch = excMatch
-
-        if nextExceptionMatch is not None \
-                and nextExceptionMatch.start() <= match.start():
-            # an HTML comment or text in nowiki tags stands before the next
-            # valid match. Skip.
-            index = nextExceptionMatch.end()
-        else:
-            # We found a valid match. Replace it.
-            if callable(new):
-                # the parameter new can be a function which takes the match
-                # as a parameter.
-                replacement = new(match)
-            else:
-                # it is not a function, but a string.
-
-                # it is a little hack to make \n work. It would be better
-                # to fix it previously, but better than nothing.
-                new = new.replace('\\n', '\n')
-
-                # We cannot just insert the new string, as it may contain regex
-                # group references such as \2 or \g<name>.
-                # On the other hand, this approach does not work because it
-                # can't handle lookahead or lookbehind (see bug #1731008):
-                #replacement = old.sub(new, text[match.start():match.end()])
-                #text = text[:match.start()] + replacement + text[match.end():]
-
-                # So we have to process the group references manually.
-                replacement = new
-
-                groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
-                while True:
-                    groupMatch = groupR.search(replacement)
-                    if not groupMatch:
-                        break
-                    groupID = groupMatch.group('name') \
-                              or int(groupMatch.group('number'))
-                    replacement = replacement[:groupMatch.start()] \
-                                  + match.group(groupID) \
-                                  + replacement[groupMatch.end():]
-            text = text[:match.start()] + replacement + text[match.end():]
-
-            # continue the search on the remaining text
-            if allowoverlap:
-                index = match.start() + 1
-            else:
-                index = match.start() + len(replacement)
-            markerpos = match.start() + len(replacement)
-    text = text[:markerpos] + marker + text[markerpos:]
-    return text
-
-def removeDisabledParts(text, tags = ['*']):
-    """
-    Return text without portions where wiki markup is disabled.
-
-    Parts that can/will be removed are --
-    * HTML comments
-    * nowiki tags
-    * pre tags
-    * includeonly tags
-
-    The exact set of parts which should be removed can be passed as the
-    'tags' parameter, which defaults to all.
-    """
-    regexes = {
-        'comments':    r'<!--.*?-->',
-        'includeonly': r'<includeonly>.*?</includeonly>',
-        'nowiki':      r'<nowiki>.*?</nowiki>',
-        'pre':         r'<pre>.*?</pre>',
-        'source':      r'<source .*?</source>',
-    }
-    if '*' in tags:
-        tags = regexes.keys()
-    toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
-                           re.IGNORECASE | re.DOTALL)
-    return toRemoveR.sub('', text)
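replaceExcept() is the workhorse of nearly every text-replacing bot, and the exception regexes above are what keep it from editing inside comments, nowiki sections, templates and the like. A doctest-style sketch of it together with removeDisabledParts() (strings illustrative; a configured user-config.py is assumed, since replaceExcept() consults the default site):

    >>> import wikipedia
    >>> wikipedia.replaceExcept(u'foo <nowiki>foo</nowiki> foo',
    ...                         u'foo', u'bar', ['nowiki'])
    u'bar <nowiki>foo</nowiki> bar'
    >>> wikipedia.removeDisabledParts(u'a <!-- hidden --> b')
    u'a  b'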
-def isDisabled(text, index, tags = ['*']):
-    """
-    Return True if text[index] is disabled, e.g. by a comment or by nowiki
-    tags.
-
-    For the tags parameter, see removeDisabledParts() above.
-    """
-    # Find a marker that is not already in the text.
-    marker = findmarker(text, '@@', '@')
-    text = text[:index] + marker + text[index:]
-    text = removeDisabledParts(text, tags)
-    return (marker not in text)
-
-def findmarker(text, startwith = u'@', append = u'@'):
-    # find a string which is not part of text
-    if len(append) <= 0:
-        append = u'@'
-    mymarker = startwith
-    while mymarker in text:
-        mymarker += append
-    return mymarker
-
-def expandmarker(text, marker = '', separator = ''):
-    # set to remove any number of separator occurrences plus arbitrary
-    # whitespace before, after, and between them,
-    # by allowing to include them into marker.
-    if separator:
-        firstinmarker = text.find(marker)
-        firstinseparator = firstinmarker
-        lenseparator = len(separator)
-        striploopcontinue = True
-        while firstinseparator > 0 and striploopcontinue:
-            striploopcontinue = False
-            if (firstinseparator >= lenseparator) and \
-               (separator == text[firstinseparator-lenseparator:firstinseparator]):
-                firstinseparator -= lenseparator
-                striploopcontinue = True
-            elif text[firstinseparator-1] < ' ':
-                firstinseparator -= 1
-                striploopcontinue = True
-        marker = text[firstinseparator:firstinmarker] + marker
-    return marker
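findmarker() simply grows a run of '@' characters until the result no longer occurs in the text, which is what lets isDisabled() probe a position: it drops such a marker at the index, strips the disabled parts, and checks whether the marker survived. Doctest-style (inputs illustrative):

    >>> wikipedia.findmarker(u'one @@ in the text')
    u'@@@'
    >>> wikipedia.isDisabled(u'<!-- [[foo]] -->', 5)
    True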
-# Part of library dealing with interwiki language links
-
-# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
-#        interproject. These functions only deal with links to a
-#        corresponding page in another language on the same project (e.g.,
-#        Wikipedia, Wiktionary, etc.) in another language. They do not find
-#        or change links to a different project, or any that are formatted
-#        as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM)
-
-def getLanguageLinks(text, insite = None, pageLink = "[[]]",
-                     template_subpage = False):
-    """
-    Return a dict of interlanguage links found in text.
-
-    Dict uses language codes as keys and Page objects as values.
-    Do not call this routine directly, use Page.interwiki() method
-    instead.
-
-    """
-    if insite is None:
-        insite = getSite()
-    result = {}
-    # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
-    # and HTML comments
-    tags = ['comments', 'nowiki', 'pre', 'source']
-    if not template_subpage:
-        tags += ['includeonly']
-    text = removeDisabledParts(text, tags)
-
-    # This regular expression will find every link that is possibly an
-    # interwiki link.
-    # NOTE: language codes are case-insensitive and only consist of basic
-    # latin letters and hyphens.
-    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
-    for lang, pagetitle in interwikiR.findall(text):
-        lang = lang.lower()
-        # Check if it really is in fact an interwiki link to a known
-        # language, or if it's e.g. a category tag or an internal link
-        if lang in insite.family.obsolete:
-            lang = insite.family.obsolete[lang]
-        if lang in insite.validLanguageLinks():
-            if '|' in pagetitle:
-                # ignore text after the pipe
-                pagetitle = pagetitle[:pagetitle.index('|')]
-            # we want the actual page objects rather than the titles
-            site = insite.getSite(code = lang)
-            try:
-                result[site] = Page(site, pagetitle, insite = insite)
-            except InvalidTitle:
-                output(u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
-                       % (lang, pagetitle))
-                continue
-    return result
-
-def removeLanguageLinks(text, site = None, marker = ''):
-    """Return text with all interlanguage links removed.
-
-    If a link to an unknown language is encountered, a warning is printed.
-    If a marker is defined, that string is placed at the location of the
-    last occurrence of an interwiki link (at the end if there are no
-    interwiki links).
-
-    """
-    if site is None:
-        site = getSite()
-    if not site.validLanguageLinks():
-        return text
-    # This regular expression will find every interwiki link, plus trailing
-    # whitespace.
-    languages = '|'.join(site.validLanguageLinks()
-                         + site.family.obsolete.keys())
-    interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*' % languages,
-                            re.IGNORECASE)
-    text = replaceExcept(text, interwikiR, '',
-                         ['nowiki', 'comment', 'math', 'pre', 'source'],
-                         marker = marker)
-    return text.strip()
-
-def removeLanguageLinksAndSeparator(text, site = None, marker = '',
-                                    separator = ''):
-    """Return text with all interlanguage links, plus any preceding
-    whitespace and separator occurrences removed.
-
-    If a link to an unknown language is encountered, a warning is printed.
-    If a marker is defined, that string is placed at the location of the
-    last occurrence of an interwiki link (at the end if there are no
-    interwiki links).
-
-    """
-    if separator:
-        mymarker = findmarker(text, u'@L@')
-        newtext = removeLanguageLinks(text, site, mymarker)
-        mymarker = expandmarker(newtext, mymarker, separator)
-        return newtext.replace(mymarker, marker)
-    else:
-        return removeLanguageLinks(text, site, marker)
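getLanguageLinks() and removeLanguageLinks() are the two halves used by the interwiki bot: one parses [[de:...]]-style links into a Site-to-Page dict, the other strips them, optionally leaving a marker where the last link stood. A sketch (titles illustrative; configured framework assumed):

    site = wikipedia.getSite('en', 'wikipedia')
    text = u'Intro text.\n\n[[de:Beispiel]]\n[[fr:Exemple]]'
    links = wikipedia.getLanguageLinks(text, insite=site)
    # -> dict mapping the de: and fr: Site objects to Page objects
    stripped = wikipedia.removeLanguageLinks(text, site=site)
    # -> u'Intro text.'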
-def replaceLanguageLinks(oldtext, new, site = None, addOnly = False,
-                         template = False, template_subpage = False):
-    """Replace interlanguage links in the text with a new set of links.
-
-    'new' should be a dict with the Site objects as keys, and Page objects
-    as values (i.e., just like the dict returned by getLanguageLinks
-    function).
-    """
-    # Find a marker that is not already in the text.
-    marker = findmarker(oldtext, u'@@')
-    if site is None:
-        site = getSite()
-    separator = site.family.interwiki_text_separator
-    cseparator = site.family.category_text_separator
-    separatorstripped = separator.strip()
-    cseparatorstripped = cseparator.strip()
-    if addOnly:
-        s2 = oldtext
-    else:
-        s2 = removeLanguageLinksAndSeparator(oldtext, site = site,
-                                             marker = marker,
-                                             separator = separatorstripped)
-    s = interwikiFormat(new, insite = site)
-    if s:
-        if site.language() in site.family.interwiki_attop:
-            newtext = s + separator + s2.replace(marker, '').strip()
-        else:
-            # calculate what was after the language links on the page
-            firstafter = s2.find(marker)
-            if firstafter < 0:
-                firstafter = len(s2)
-            else:
-                firstafter += len(marker)
-            # Is there any text in the 'after' part that means we should
-            # keep it after?
-            if "</noinclude>" in s2[firstafter:]:
-                if separatorstripped:
-                    s = separator + s
-                newtext = s2[:firstafter].replace(marker, '') + s \
-                          + s2[firstafter:]
-            elif site.language() in site.family.categories_last:
-                cats = getCategoryLinks(s2, site = site)
-                s2 = removeCategoryLinksAndSeparator(
-                         s2.replace(marker, ''), site, '',
-                         cseparatorstripped).strip() + separator + s
-                newtext = replaceCategoryLinks(s2, cats, site = site,
-                                               addOnly = True)
-            elif site.family.name == 'wikitravel':
-                # for Wikitravel's language links position.
-                s = separator + s + separator
-                newtext = s2[:firstafter].replace(marker, '') + s \
-                          + s2[firstafter:]
-            else:
-                if template or template_subpage:
-                    if template_subpage:
-                        includeOn  = '<includeonly>'
-                        includeOff = '</includeonly>'
-                    else:
-                        includeOn  = '<noinclude>'
-                        includeOff = '</noinclude>'
-                        separator = ''
-                    # Do we have a noinclude at the end of the template?
-                    parts = s2.split(includeOff)
-                    lastpart = parts[-1]
-                    if re.match('\s*%s' % marker, lastpart):
-                        # Put the langlinks back into the noinclude's
-                        regexp = re.compile('%s\s*%s' % (includeOff, marker))
-                        newtext = regexp.sub(s + includeOff, s2)
-                    else:
-                        # Put the langlinks at the end, inside noinclude's
-                        newtext = s2.replace(marker, '').strip() + separator \
-                                  + u'%s\n%s%s\n' % (includeOn, s, includeOff)
-                else:
-                    newtext = s2.replace(marker, '').strip() + separator + s
-    else:
-        newtext = s2.replace(marker, '')
-    return newtext
-
-def interwikiFormat(links, insite = None):
-    """Convert interwiki link dict into a wikitext string.
-
-    'links' should be a dict with the Site objects as keys, and Page
-    objects as values.
-
-    Return a unicode string that is formatted for inclusion in insite
-    (defaulting to the current site).
-    """
-    if insite is None:
-        insite = getSite()
-    if not links:
-        return ''
-
-    ar = interwikiSort(links.keys(), insite)
-    s = []
-    for site in ar:
-        try:
-            link = links[site].aslink(forceInterwiki = True).replace('[[:', '[[')
-            s.append(link)
-        except AttributeError:
-            s.append(getSite(site).linkto(links[site], othersite = insite))
-    if insite.lang in insite.family.interwiki_on_one_line:
-        sep = u' '
-    else:
-        sep = u'\r\n'
-    s = sep.join(s) + u'\r\n'
-    return s
-
-# Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
-    if insite is None:
-        insite = getSite()
-    if not sites:
-        return []
-
-    sites.sort()
-    putfirst = insite.interwiki_putfirst()
-    if putfirst:
-        # In this case I might have to change the order
-        firstsites = []
-        for code in putfirst:
-            # The code may not exist in this family?
-            if code in insite.family.obsolete:
-                code = insite.family.obsolete[code]
-            if code in insite.validLanguageLinks():
-                site = insite.getSite(code = code)
-                if site in sites:
-                    del sites[sites.index(site)]
-                    firstsites = firstsites + [site]
-        sites = firstsites + sites
-    if insite.interwiki_putfirst_doubled(sites):
-        # some implementations return False
-        sites = insite.interwiki_putfirst_doubled(sites) + sites
-
-    return sites
-
-# Wikitext manipulation functions dealing with category links
-def getCategoryLinks(text, site):
-    import catlib
-    """Return a list of category links found in text.
-
-    List contains Category objects.
-    Do not call this routine directly, use Page.categories() instead.
-
-    """
-    result = []
-    # Ignore category links within nowiki tags, pre tags, includeonly tags,
-    # and HTML comments
-    text = removeDisabledParts(text)
-    catNamespace = '|'.join(site.category_namespaces())
-    R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)(?:\|(?P<sortKey>.+?))?\s*\]\]' % catNamespace, re.I)
-    for match in R.finditer(text):
-        cat = catlib.Category(site,
-                              '%s:%s' % (match.group('namespace'),
-                                         match.group('catName')),
-                              sortKey = match.group('sortKey'))
-        result.append(cat)
-    return result
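interwikiFormat() is the inverse operation, serializing such a dict back to wikitext in the order given by interwikiSort(), while getCategoryLinks() does for [[Category:...]] links what getLanguageLinks() does for interlanguage ones. A sketch continuing the example above (results paraphrased):

    wikitext = wikipedia.interwikiFormat(links, insite=site)
    # -> u'[[de:Beispiel]]\r\n[[fr:Exemple]]\r\n' on one-link-per-line wikis
    cats = wikipedia.getCategoryLinks(u'Text. [[Category:Examples|key]]', site)
    # -> [catlib.Category for Category:Examples, with sort key u'key']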
-def removeCategoryLinks(text, site, marker = ''):
-    """Return text with all category links removed.
-
-    Put the string marker after the last replacement (at the end of the text
-    if there is no replacement).
-
-    """
-    # This regular expression will find every link that is possibly an
-    # interwiki link, plus trailing whitespace. The language code is grouped.
-    # NOTE: This assumes that language codes only consist of non-capital
-    # ASCII letters and hyphens.
-    catNamespace = '|'.join(site.category_namespaces())
-    categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
-    text = replaceExcept(text, categoryR, '',
-                         ['nowiki', 'comment', 'math', 'pre', 'source'],
-                         marker = marker)
-    if marker:
-        # avoid having multiple linefeeds at the end of the text
-        text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker,
-                      text.strip())
-    return text.strip()
-
-def removeCategoryLinksAndSeparator(text, site = None, marker = '',
-                                    separator = ''):
-    """Return text with all category links, plus any preceding whitespace
-    and separator occurrences removed.
-
-    Put the string marker after the last replacement (at the end of the text
-    if there is no replacement).
-
-    """
-    if separator:
-        mymarker = findmarker(text, u'@C@')
-        newtext = removeCategoryLinks(text, site, mymarker)
-        mymarker = expandmarker(newtext, mymarker, separator)
-        return newtext.replace(mymarker, marker)
-    else:
-        return removeCategoryLinks(text, site, marker)
-
-def replaceCategoryInPlace(oldtext, oldcat, newcat, site = None):
-    """Replace the category oldcat with the category newcat and return
-    the modified text.
-
-    """
-    if site is None:
-        site = getSite()
-
-    catNamespace = '|'.join(site.category_namespaces())
-    title = oldcat.titleWithoutNamespace()
-    if not title:
-        return
-    # title might contain regex special characters
-    title = re.escape(title)
-    # title might not be capitalized correctly on the wiki
-    if title[0].isalpha() and not site.nocapitalize:
-        title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
-    # spaces and underscores in page titles are interchangeable and
-    # collapsible
-    title = title.replace(r"\ ", "[ _]+").replace(r"_", "[ _]+")
-    categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])'
-                           % (catNamespace, title), re.I)
-    if newcat is None:
-        text = replaceExcept(oldtext, categoryR, '',
-                             ['nowiki', 'comment', 'math', 'pre', 'source'])
-    else:
-        text = replaceExcept(oldtext, categoryR,
-                             '[[%s:%s\\2' % (site.namespace(14),
-                                             newcat.titleWithoutNamespace()),
-                             ['nowiki', 'comment', 'math', 'pre', 'source'])
-    return text
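replaceCategoryInPlace() builds its regex to tolerate case variation, space/underscore interchange and an optional sort key, so a category can be renamed without touching the rest of the line. Sketch (category names illustrative; configured framework assumed):

    import catlib
    oldcat = catlib.Category(site, u'Category:Misspeled')
    newcat = catlib.Category(site, u'Category:Misspelled')
    newtext = wikipedia.replaceCategoryInPlace(pagetext, oldcat, newcat,
                                               site=site)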
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
-    """Replace the category links given in the wikitext given
-    in oldtext by the new links given in new.
-
-    'new' should be a list of Category objects.
-
-    If addOnly is True, the old categories won't be deleted and the
-    categories given will be added (and so they won't replace anything).
-    """
-    # Find a marker that is not already in the text.
-    marker = findmarker(oldtext, u'@@')
-    if site is None:
-        site = getSite()
-    if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
-        raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#...')
-    separator = site.family.category_text_separator
-    iseparator = site.family.interwiki_text_separator
-    separatorstripped = separator.strip()
-    iseparatorstripped = iseparator.strip()
-    if addOnly:
-        s2 = oldtext
-    else:
-        s2 = removeCategoryLinksAndSeparator(oldtext, site = site,
-                                             marker = marker,
-                                             separator = separatorstripped)
-    s = categoryFormat(new, insite = site)
-    if s:
-        if site.language() in site.family.category_attop:
-            newtext = s + separator + s2
-        else:
-            # calculate what was after the category links on the page
-            firstafter = s2.find(marker)
-            if firstafter < 0:
-                firstafter = len(s2)
-            else:
-                firstafter += len(marker)
-            # Is there any text in the 'after' part that means we should
-            # keep it after?
-            if "</noinclude>" in s2[firstafter:]:
-                if separatorstripped:
-                    s = separator + s
-                newtext = s2[:firstafter].replace(marker, '') + s \
-                          + s2[firstafter:]
-            elif site.language() in site.family.categories_last:
-                newtext = s2.replace(marker, '').strip() + separator + s
-            else:
-                interwiki = getLanguageLinks(s2)
-                s2 = removeLanguageLinksAndSeparator(s2.replace(marker, ''),
-                                                     site, '',
-                                                     iseparatorstripped) \
-                     + separator + s
-                newtext = replaceLanguageLinks(s2, interwiki, site = site,
-                                               addOnly = True)
-    else:
-        newtext = s2.replace(marker, '')
-    return newtext.strip()
-
-def categoryFormat(categories, insite = None):
-    """Return a string containing links to all categories in a list.
-
-    'categories' should be a list of Category objects.
-
-    The string is formatted for inclusion in insite.
-    """
-    if not categories:
-        return ''
-    if insite is None:
-        insite = getSite()
-    catLinks = [category.aslink(noInterwiki = True) for category in categories]
-    if insite.category_on_one_line():
-        sep = ' '
-    else:
-        sep = '\r\n'
-    # Some people don't like the categories sorted
-    #catLinks.sort()
-    return sep.join(catLinks) + '\r\n'
-
-def compileLinkR(withoutBracketed = False, onlyBracketed = False):
-    """Return a regex that matches external links."""
-    # RFC 2396 says that URLs may only contain certain characters.
-    # For this regex we also accept non-allowed characters, so that the bot
-    # will later show these links as broken ('Non-ASCII Characters in URL').
-    # Note: While allowing parentheses inside URLs, MediaWiki will regard
-    # a right parenthesis at the end of the URL as not part of that URL.
-    # The same applies to dot, comma, colon and some other characters.
-    notAtEnd = '\]\s\)\.:;,<>"'
-    # So characters inside the URL can be anything except whitespace,
-    # closing squared brackets, quotation marks, greater than and less
-    # than, and the last character also can't be parenthesis or another
-    # character disallowed by MediaWiki.
-    notInside = '\]\s<>"'
-    # The first half of this regular expression is required because '' is
-    # not allowed inside links. For example, in this wiki text:
-    #       ''Please see http://www.example.org.''
-    # .'' shouldn't be considered as part of the link.
-    regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
-            + '](?=[' + notAtEnd + ']*\'\')|http[s]?://[^' + notInside \
-            + ']*[^' + notAtEnd + '])'
-
-    if withoutBracketed:
-        regex = r'(?<!\[)' + regex
-    elif onlyBracketed:
-        regex = r'\[' + regex
-    linkR = re.compile(regex)
-    return linkR
-
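Note how compileLinkR() mirrors MediaWiki's own autolinking rules: a dot, comma or closing parenthesis at the very end is treated as punctuation rather than as part of the URL. Doctest-style (URL illustrative):

    >>> linkR = wikipedia.compileLinkR()
    >>> linkR.search(u'See http://www.example.org. for details').group('url')
    u'http://www.example.org'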
 # end of category specific code

 def url2link(percentname, insite, site):
     """Convert urlname of a wiki page into interwiki link format.

@@ -8130,247 +7427,6 @@
 # Set socket timeout
 socket.setdefaulttimeout(config.socket_timeout)
 
-# Languages to use for comment text after the actual language but before
-# en:. For example, if for language 'xx', you want the preference of
-# languages to be:
-# xx:, then fr:, then ru:, then en:
-# you let altlang return ['fr', 'ru'].
-# This code is used by translate() below.
-
-def altlang(code):
-    # Amharic
-    if code in ['aa', 'om']:
-        return ['am']
-    # Arabic
-    if code in ['arc', 'arz']:
-        return ['ar']
-    if code == 'kab':
-        return ['ar', 'fr']
-    # Bulgarian
-    if code in ['cu', 'mk']:
-        return ['bg', 'sr', 'sh']
-    # Czech
-    if code in ['cs', 'sk']:
-        return ['cs', 'sk']
-    # German
-    if code in ['bar', 'ksh', 'pdc']:
-        return ['de']
-    if code in ['als', 'lb']:
-        return ['de', 'fr']
-    if code == 'nds':
-        return ['nds-nl', 'de']
-    if code in ['dsb', 'hsb']:
-        return ['hsb', 'dsb', 'de']
-    if code == 'rm':
-        return ['de', 'it']
-    if code == 'stq':
-        return ['fy', 'de']
-    # Greek
-    if code == 'pnt':
-        return ['el']
-    # Esperanto
-    if code in ['io', 'nov']:
-        return ['eo']
-    # Spanish
-    if code in ['an', 'ast', 'ay', 'ca', 'ext', 'lad', 'nah', 'nv', 'qu']:
-        return ['es']
-    if code in ['gl', 'gn']:
-        return ['es', 'pt']
-    if code == 'eu':
-        return ['es', 'fr']
-    if code in ['bcl', 'cbk-zam', 'ceb', 'ilo', 'pag', 'pam', 'tl', 'war']:
-        return ['es', 'tl']
-    # Estonian
-    if code == 'fiu-vro':
-        return ['et']
-    # Persian (Farsi)
-    if code in ['glk', 'mzn']:
-        return ['ar']
-    # French
-    if code in ['bm', 'br', 'ht', 'kab', 'kg', 'ln', 'mg', 'nrm', 'oc',
-                'pcd', 'rw', 'sg', 'ty', 'wa']:
-        return ['fr']
-    if code == 'co':
-        return ['fr', 'it']
-    # Hindi
-    if code in ['bh', 'pi', 'sa']:
-        return ['hi']
-    if code in ['ne', 'new']:
-        return ['ne', 'new', 'hi']
-    # Indonesian and Malay
-    if code in ['ace', 'bug', 'id', 'jv', 'ms', 'su']:
-        return ['id', 'ms', 'jv']
-    if code == 'map-bms':
-        return ['jv', 'id', 'ms']
-    # Inuit languages
-    if code in ['ik', 'iu']:
-        return ['iu', 'kl']
-    if code == 'kl':
-        return ['iu', 'da', 'no']
-    # Italian
-    if code in ['eml', 'fur', 'lij', 'lmo', 'nap', 'pms', 'roa-tara', 'sc',
-                'scn', 'vec']:
-        return ['it']
-    if code == 'frp':
-        return ['it', 'fr']
-    # Lithuanian
-    if code in ['bat-smg', 'ltg']:
-        return ['lt']
-    # Dutch
-    if code in ['fy', 'li', 'pap', 'srn', 'vls', 'zea']:
-        return ['nl']
-    if code == 'nds-nl':
-        return ['nds', 'nl']
-    # Polish
-    if code in ['csb', 'szl']:
-        return ['pl']
-    # Portuguese
-    if code in ['fab', 'mwl', 'tet']:
-        return ['pt']
-    # Romanian
-    if code in ['mo', 'roa-rup']:
-        return ['ro']
-    # Russian and Belarusian
-    if code in ['ab', 'av', 'ba', 'bxr', 'ce', 'cv', 'kk', 'ky', 'lbe',
-                'mdf', 'mhr', 'myv', 'os', 'sah', 'tg', 'tt', 'udm', 'uk',
-                'xal']:
-        return ['ru']
-    if code in ['be', 'be-x-old']:
-        return ['be', 'be-x-old', 'ru']
-    if code == 'kaa':
-        return ['uz', 'ru']
-    # Serbocroatian
-    if code in ['bs', 'hr', 'sh', 'sr']:
-        return ['sh', 'hr', 'bs', 'sr']
-    # Turkish and Kurdish
-    if code in ['diq', 'ku']:
-        return ['ku', 'tr']
-    if code == 'ckb':
-        return ['ku', 'ar']
-    # Chinese
-    if code in ['minnan', 'zh', 'zh-classical', 'zh-min-nan', 'zh-tw',
-                'zh-hans', 'zh-hant']:
-        return ['zh', 'zh-tw', 'zh-cn', 'zh-classical']
-    if code in ['cdo', 'gan', 'hak', 'ii', 'wuu', 'za', 'zh-cdo',
-                'zh-classical', 'zh-cn', 'zh-yue']:
-        return ['zh', 'zh-cn', 'zh-tw', 'zh-classical']
-    # Scandinavian languages
-    if code in ['da', 'sv']:
-        return ['da', 'no', 'nb', 'sv', 'nn']
-    if code in ['fo', 'is']:
-        return ['da', 'no', 'nb', 'nn', 'sv']
-    if code == 'nn':
-        return ['no', 'nb', 'sv', 'da']
-    if code in ['nb', 'no']:
-        return ['no', 'nb', 'da', 'nn', 'sv']
-    if code == 'se':
-        return ['sv', 'no', 'nb', 'nn', 'fi']
-    # Other languages
-    if code in ['bi', 'tpi']:
-        return ['bi', 'tpi']
-    if code == 'yi':
-        return ['he', 'de']
-    if code in ['ia', 'ie']:
-        return ['ia', 'la', 'it', 'fr', 'es']
-    # Default value
-    return []
-
-def translate(code, xdict):
-    """Return the most appropriate translation from a translation dict.
-
-    Given a language code and a dictionary, returns the dictionary's value
-    for key 'code' if this key exists; otherwise tries to return a value for
-    an alternative language that is most applicable to use on the Wikipedia
-    in language 'code'.
-
-    The language itself is always checked first, then languages that have
-    been defined to be alternatives, and finally English. If none of the
-    options gives a result, we just take the first language in the list.
-
-    """
-    # If a site is given instead of a code, use its language
-    if hasattr(code, 'lang'):
-        code = code.lang
-
-    # If xdict contains translations for multiple projects, pick the
-    # current family, falling back to wikipedia.
-    if 'wikipedia' in xdict:
-        if default_family in xdict:
-            xdict = xdict[default_family]
-        else:
-            xdict = xdict['wikipedia']
-
-    if type(xdict) != dict:
-        return xdict
-
-    if code in xdict:
-        return xdict[code]
-    for alt in altlang(code):
-        if alt in xdict:
-            return xdict[alt]
-    if '_default' in xdict:
-        return xdict['_default']
-    elif 'en' in xdict:
-        return xdict['en']
-    return xdict.values()[0]
-
-def showDiff(oldtext, newtext):
-    """
-    Print a string showing the differences between oldtext and newtext.
-    The differences are highlighted (only on Unix systems) to show which
-    changes were made.
-    """
-    # For information on difflib, see http://docs.python.org/library/difflib.html
-    color = {
-        '+': 'lightgreen',
-        '-': 'lightred',
-    }
-    diff = u''
-    colors = []
-    # This will store the last line beginning with + or -.
-    lastline = None
-    # For testing purposes only: show original, uncolored diff
-    # for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
-    #     print line
-    for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
-        if line.startswith('?'):
-            # initialize color vector with None, which means default color
-            lastcolors = [None for c in lastline]
-            # colorize the + or - sign
-            lastcolors[0] = color[lastline[0]]
-            # colorize changed parts in red or green
-            for i in range(min(len(line), len(lastline))):
-                if line[i] != ' ':
-                    lastcolors[i] = color[lastline[0]]
-            diff += lastline + '\n'
-            # append one None (default color) for the newline character
-            colors += lastcolors + [None]
-        elif lastline:
-            diff += lastline + '\n'
-            # colorize the + or - sign only
-            lastcolors = [None for c in lastline]
-            lastcolors[0] = color[lastline[0]]
-            colors += lastcolors + [None]
-        lastline = None
-        if line[0] in ('+', '-'):
-            lastline = line
-    # there might be one + or - line left that wasn't followed by a ? line.
-    if lastline:
-        diff += lastline + '\n'
-        # colorize the + or - sign only
-        lastcolors = [None for c in lastline]
-        lastcolors[0] = color[lastline[0]]
-        colors += lastcolors + [None]
-
-    result = u''
-    lastcolor = None
-    for i in range(len(diff)):
-        if colors[i] != lastcolor:
-            if lastcolor is None:
-                result += '\03{%s}' % colors[i]
-            else:
-                result += '\03{default}'
-            lastcolor = colors[i]
-        result += diff[i]
-    output(result)
-
 def writeToCommandLogFile():
     """
     Save the name of the called module along with all parameters to
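altlang() and translate() together implement the message-fallback chain used for localized edit summaries: exact language first, then the altlang() candidates, then '_default'/'en', then whatever the dict contains. Sketch (message dict illustrative):

    msg = {
        'de': u'Bot: Korrigiere doppelten Redirect',
        'en': u'Robot: Fixing double redirect',
    }
    summary = wikipedia.translate('als', msg)
    # 'als' is not in msg; altlang('als') yields ['de', 'fr'], so the
    # German summary is chosen. Unknown codes fall back to 'en'.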
@@ -8696,7 +7752,7 @@
             output( u'ERROR: %s caused error %s. Dump %s created.' % (name,error,filename) )
 
 get_throttle = Throttle(config.minthrottle,config.maxthrottle)
-put_throttle = Throttle(config.put_throttle,config.put_throttle,False)
+put_throttle = Throttle(config.put_throttle,config.put_throttle,multiplydelay=False)
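The only functional tweak outside the split: multiplydelay is now passed by keyword. Assuming the Throttle initializer is roughly __init__(self, mindelay, maxdelay, multiplydelay=True), the bare positional False was easy to misread as a third delay value; the keyword form states what is being switched off.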
 def decompress_gzip(data):
     # Use cStringIO if available