Revision: 7962
Author: xqt
Date: 2010-02-25 14:00:14 +0000 (Thu, 25 Feb 2010)
Log Message:
-----------
handle <source /> alias from trunk
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2010-02-25 13:42:02 UTC (rev 7961)
+++ branches/rewrite/pywikibot/textlib.py 2010-02-25 14:00:14 UTC (rev 7962)
@@ -201,14 +201,19 @@
'parts' parameter, which defaults to all.
"""
regexes = {
- 'comments' : r'<!--.*?-->',
- 'includeonly': r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
+ 'comments' : r'<!--.*?-->',
+ 'includeonly': r'<includeonly>.*?</includeonly>',
+ 'nowiki': r'<nowiki>.*?</nowiki>',
+ 'pre': r'<pre>.*?</pre>',
+ 'source': r'<source .*?</source>',
+ 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
}
if '*' in tags:
tags = regexes.keys()
+ # add alias
+ tags = set(tags)
+ if 'source' in tags:
+ tags.add('syntaxhighlight')
toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
re.IGNORECASE | re.DOTALL)
return toRemoveR.sub('', text)
@@ -259,9 +264,9 @@
marker = text[firstinseparator:firstinmarker] + marker
return marker
-
+#-------------------------------------------------
# Functions dealing with interwiki language links
-
+#-------------------------------------------------
# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
# interproject. These functions only deal with links to a
# corresponding page in another language on the same project (e.g.,
@@ -475,8 +480,9 @@
sites = insite.interwiki_putfirst_doubled(sites) + sites
return sites
-
+#---------------------------------------
# Functions dealing with category links
+#---------------------------------------
def getCategoryLinks(text, site):
"""Return a list of category links found in text.
@@ -660,6 +666,9 @@
#catLinks.sort()
return sep.join(catLinks) + '\r\n'
+#---------------------------------------
+# Functions dealing with external links
+#---------------------------------------
def compileLinkR(withoutBracketed=False, onlyBracketed=False):
"""Return a regex that matches external links."""
@@ -690,6 +699,9 @@
linkR = re.compile(regex)
return linkR
+#----------------------------------
+# Functions dealing with templates
+#----------------------------------
def extract_templates_and_params(text, get_redirect=False):
"""Return list of template calls found in text.
@@ -800,7 +812,9 @@
result.append((name, params))
return result
+#----------------
# I18N functions
+#----------------
# Languages to use for comment text after the actual language but before
# en:. For example, if for language 'xx', you want the preference of
Revision: 7961
Author: xqt
Date: 2010-02-25 13:42:02 +0000 (Thu, 25 Feb 2010)
Log Message:
-----------
activate wikipedia library
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-25 10:28:15 UTC (rev 7960)
+++ trunk/pywikipedia/wikipedia.py 2010-02-25 13:42:02 UTC (rev 7961)
@@ -139,6 +139,8 @@
import xmlreader
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, SoupStrainer
import weakref
+# Splitting the bot into library parts
+from pywikibot import *
# Set the locale to system default. This will ensure correct string
# handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no
@@ -161,78 +163,6 @@
WIDEBUILD = False
-# Local exceptions
-
-class Error(Exception):
- """Wikipedia error"""
-
-class NoUsername(Error):
- """Username is not in user-config.py"""
-
-class NoPage(Error):
- """Page does not exist"""
-
-class NoSuchSite(Error):
- """Site does not exist"""
-
-class IsRedirectPage(Error):
- """Page is a redirect page"""
-
-class IsNotRedirectPage(Error):
- """Page is not a redirect page"""
-
-class InvalidTitle(Error):
- """Invalid page title"""
-
-class LockedPage(Error):
- """Page is locked"""
-
-class SectionError(Error):
- """The section specified by # does not exist"""
-
-class PageNotSaved(Error):
- """Saving the page has failed"""
-
-class EditConflict(PageNotSaved):
- """There has been an edit conflict while uploading the page"""
-
-class SpamfilterError(PageNotSaved):
- """Saving the page has failed because the MediaWiki spam filter detected a blacklisted URL."""
- def __init__(self, arg):
- self.url = arg
- self.args = arg,
-
-class LongPageError(PageNotSaved):
- """Saving the page has failed because it is too long."""
- def __init__(self, arg, arg2):
- self.length = arg
- self.limit = arg2,
-
-class MaxTriesExceededError(PageNotSaved):
- """Saving the page has failed because the maximum number of attempts has been reached"""
-
-class ServerError(Error):
- """Got unexpected server response"""
-
-class BadTitle(Error):
- """Server responded with BadTitle."""
-
-# UserBlocked exceptions should in general not be caught. If the bot has
-# been blocked, the bot operator should address the reason for the block
-# before continuing.
-class UserBlocked(Error):
- """Your username or IP has been blocked"""
-
-class PageNotFound(Error):
- """Page not found in list"""
-
-class CaptchaError(Error):
- """Captcha is asked and config.solve_captcha == False."""
-
-class NoHash(Error):
- """ The APIs don't return any Hash for the image searched.
- Really Strange, better to raise an error. """
-
SaxError = xml.sax._exceptions.SAXParseException
# Pre-compile re expressions
@@ -4172,17 +4102,6 @@
# Library functions
-def unescape(s):
- """Replace escaped HTML-special characters by their originals"""
- if '&' not in s:
- return s
- s = s.replace("&amp;lt;", "<")
- s = s.replace("&amp;gt;", ">")
- s = s.replace("&amp;apos;", "'")
- s = s.replace("&amp;quot;", '"')
- s = s.replace("&amp;amp;", "&") # Must be last
- return s
-
def setAction(s):
"""Set a summary to use for changed page submissions"""
global action
@@ -4367,628 +4286,6 @@
finally:
self.lock.release()
-# functions to manipulate wikitext strings (by default, all text arguments
-# should be Unicode)
-# All return the modified text as a unicode object
-
-def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
- allowoverlap=False, marker = '', site = None):
- """
- Return text with 'old' replaced by 'new', ignoring specified types of text.
-
- Skips occurences of 'old' within exceptions; e.g., within nowiki tags or
- HTML comments. If caseInsensitive is true, then use case insensitive
- regex matching. If allowoverlap is true, overlapping occurences are all
- replaced (watch out when using this, it might lead to infinite loops!).
-
- Parameters:
- text - a unicode string
- old - a compiled regular expression
- new - a unicode string (which can contain regular
- expression references), or a function which takes
- a match object as parameter. See parameter repl of
- re.sub().
- exceptions - a list of strings which signal what to leave out,
- e.g. ['math', 'table', 'template']
- caseInsensitive - a boolean
- marker - a string that will be added to the last replacement;
- if nothing is changed, it is added at the end
-
- """
- if site is None:
- site = getSite()
-
- exceptionRegexes = {
- 'comment': re.compile(r'(?s)<!--.*?-->'),
- # section headers
- 'header': re.compile(r'\r\n=+.+=+ *\r\n'),
- 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
- 'math': re.compile(r'(?is)<math>.*?</math>'),
- 'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'),
- # wiki tags are ignored inside nowiki tags.
- 'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'),
- # preformatted text
- 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
- 'source': re.compile(r'(?is)<source .*?</source>'),
- # inline references
- 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
- 'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'),
- # lines that start with a space are shown in a monospace font and
- # have whitespace preserved.
- 'startspace': re.compile(r'(?m)^ (.*?)$'),
- # tables often have whitespace that is used to improve wiki
- # source code readability.
- # TODO: handle nested tables.
- 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
- # templates with parameters often have whitespace that is used to
- # improve wiki source code readability.
- # 'template': re.compile(r'(?s){{.*?}}'),
- # The regex above fails on nested templates. This regex can handle
- # templates cascaded up to level 3, but no deeper. For arbitrary
- # depth, we'd need recursion which can't be done in Python's re.
- # After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
- 'hyperlink': compileLinkR(),
- 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
- # this matches internal wikilinks, but also interwiki, categories, and
- # images.
- 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
- 'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
- % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())),
-
- }
-
- # if we got a string, compile it as a regular expression
- if type(old) in [str, unicode]:
- if caseInsensitive:
- old = re.compile(old, re.IGNORECASE | re.UNICODE)
- else:
- old = re.compile(old)
-
- dontTouchRegexes = []
- for exc in exceptions:
- if isinstance(exc, str) or isinstance(exc, unicode):
- # assume it's a reference to the exceptionRegexes dictionary
- # defined above.
- if exc not in exceptionRegexes:
- raise ValueError("Unknown tag type: " + exc)
- dontTouchRegexes.append(exceptionRegexes[exc])
- # handle alias
- if exc == 'source':
- dontTouchRegexes.append(re.compile(r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
- else:
- # assume it's a regular expression
- dontTouchRegexes.append(exc)
- index = 0
- markerpos = len(text)
- while True:
- match = old.search(text, index)
- if not match:
- # nothing left to replace
- break
-
- # check which exception will occur next.
- nextExceptionMatch = None
- for dontTouchR in dontTouchRegexes:
- excMatch = dontTouchR.search(text, index)
- if excMatch and (
- nextExceptionMatch is None or
- excMatch.start() < nextExceptionMatch.start()):
- nextExceptionMatch = excMatch
-
- if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start():
- # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
- index = nextExceptionMatch.end()
- else:
- # We found a valid match. Replace it.
- if callable(new):
- # the parameter new can be a function which takes the match as a parameter.
- replacement = new(match)
- else:
- # it is not a function, but a string.
-
- # it is a little hack to make \n work. It would be better to fix it
- # previously, but better than nothing.
- new = new.replace('\\n', '\n')
-
- # We cannot just insert the new string, as it may contain regex
- # group references such as \2 or \g<name>.
- # On the other hand, this approach does not work because it can't
- # handle lookahead or lookbehind (see bug #1731008):
- #replacement = old.sub(new, text[match.start():match.end()])
- #text = text[:match.start()] + replacement + text[match.end():]
-
- # So we have to process the group references manually.
- replacement = new
-
- groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
- while True:
- groupMatch = groupR.search(replacement)
- if not groupMatch:
- break
- groupID = groupMatch.group('name') or int(groupMatch.group('number'))
- replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():]
- text = text[:match.start()] + replacement + text[match.end():]
-
- # continue the search on the remaining text
- if allowoverlap:
- index = match.start() + 1
- else:
- index = match.start() + len(replacement)
- markerpos = match.start() + len(replacement)
- text = text[:markerpos] + marker + text[markerpos:]
- return text
-
-def removeDisabledParts(text, tags = ['*']):
- """
- Return text without portions where wiki markup is disabled
-
- Parts that can/will be removed are --
- * HTML comments
- * nowiki tags
- * pre tags
- * includeonly tags
-
- The exact set of parts which should be removed can be passed as the
- 'parts' parameter, which defaults to all.
- """
- regexes = {
- 'comments' : r'<!--.*?-->',
- 'includeonly': r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
- }
- if '*' in tags:
- tags = regexes.keys()
- toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
- re.IGNORECASE | re.DOTALL)
- return toRemoveR.sub('', text)
-
-def isDisabled(text, index, tags = ['*']):
- """
- Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
-
- For the tags parameter, see removeDisabledParts() above.
- """
- # Find a marker that is not already in the text.
- marker = findmarker(text, '@@', '@')
- text = text[:index] + marker + text[index:]
- text = removeDisabledParts(text, tags)
- return (marker not in text)
-
-def findmarker(text, startwith = u'@', append = u'@'):
- # find a string which is not part of text
- if len(append) <= 0:
- append = u'@'
- mymarker = startwith
- while mymarker in text:
- mymarker += append
- return mymarker
-
-def expandmarker(text, marker = '', separator = ''):
- # set to remove any number of separator occurrences plus arbitrary
- # whitespace before, after, and between them,
- # by allowing to include them into marker.
- if separator:
- firstinmarker = text.find(marker)
- firstinseparator = firstinmarker
- lenseparator = len(separator)
- striploopcontinue = True
- while firstinseparator > 0 and striploopcontinue:
- striploopcontinue = False
- if (firstinseparator >= lenseparator) and (separator == text[firstinseparator-lenseparator:firstinseparator]):
- firstinseparator -= lenseparator
- striploopcontinue = True
- elif text[firstinseparator-1] < ' ':
- firstinseparator -= 1
- striploopcontinue = True
- marker = text[firstinseparator:firstinmarker] + marker
- return marker
-
-# Part of library dealing with interwiki language links
-
-# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
-# interproject. These functions only deal with links to a
-# corresponding page in another language on the same project (e.g.,
-# Wikipedia, Wiktionary, etc.) in another language. They do not find
-# or change links to a different project, or any that are formatted
-# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM)
-
-def getLanguageLinks(text, insite = None, pageLink="[[]]", template_subpage=False):
- """
- Return a dict of interlanguage links found in text.
-
- Dict uses language codes as keys and Page objects as values.
- Do not call this routine directly, use Page.interwiki() method
- instead.
-
- """
- if insite is None:
- insite = getSite()
- result = {}
- # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
- # and HTML comments
- tags = ['comments', 'nowiki', 'pre', 'source']
- if not template_subpage:
- tags += ['includeonly']
- text = removeDisabledParts(text, tags)
-
- # This regular expression will find every link that is possibly an
- # interwiki link.
- # NOTE: language codes are case-insensitive and only consist of basic latin
- # letters and hyphens.
- interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
- for lang, pagetitle in interwikiR.findall(text):
- lang = lang.lower()
- # Check if it really is in fact an interwiki link to a known
- # language, or if it's e.g. a category tag or an internal link
- if lang in insite.family.obsolete:
- lang = insite.family.obsolete[lang]
- if lang in insite.validLanguageLinks():
- if '|' in pagetitle:
- # ignore text after the pipe
- pagetitle = pagetitle[:pagetitle.index('|')]
- # we want the actual page objects rather than the titles
- site = insite.getSite(code = lang)
- try:
- result[site] = Page(site, pagetitle, insite = insite)
- except InvalidTitle:
- output(
- u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
- % (lang, pagetitle))
- continue
- return result
-
-def removeLanguageLinks(text, site = None, marker = ''):
- """Return text with all interlanguage links removed.
-
- If a link to an unknown language is encountered, a warning is printed.
- If a marker is defined, that string is placed at the location of the
- last occurence of an interwiki link (at the end if there are no
- interwiki links).
-
- """
- if site is None:
- site = getSite()
- if not site.validLanguageLinks():
- return text
- # This regular expression will find every interwiki link, plus trailing
- # whitespace.
- languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
- interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*'
- % languages, re.IGNORECASE)
- text = replaceExcept(text, interwikiR, '',
- ['nowiki', 'comment', 'math', 'pre', 'source'], marker=marker)
- return text.strip()
-
-def removeLanguageLinksAndSeparator(text, site = None, marker = '', separator = ''):
- """Return text with all interlanguage links, plus any preceeding whitespace
- and separateor occurrences removed.
-
- If a link to an unknown language is encountered, a warning is printed.
- If a marker is defined, that string is placed at the location of the
- last occurence of an interwiki link (at the end if there are no
- interwiki links).
-
- """
- if separator:
- mymarker = findmarker(text, u'@L@')
- newtext = removeLanguageLinks(text, site, mymarker)
- mymarker = expandmarker(newtext, mymarker, separator)
- return newtext.replace(mymarker, marker)
- else:
- return removeLanguageLinks(text, site, marker)
-
-def replaceLanguageLinks(oldtext, new, site = None, addOnly = False, template = False, template_subpage = False):
- """Replace interlanguage links in the text with a new set of links.
-
- 'new' should be a dict with the Site objects as keys, and Page objects
- as values (i.e., just like the dict returned by getLanguageLinks
- function).
- """
- # Find a marker that is not already in the text.
- marker = findmarker( oldtext, u'@@')
- if site is None:
- site = getSite()
- separator = site.family.interwiki_text_separator
- cseparator = site.family.category_text_separator
- separatorstripped = separator.strip()
- cseparatorstripped = cseparator.strip()
- if addOnly:
- s2 = oldtext
- else:
- s2 = removeLanguageLinksAndSeparator(oldtext, site = site, marker = marker, separator = separatorstripped)
- s = interwikiFormat(new, insite = site)
- if s:
- if site.language() in site.family.interwiki_attop:
- newtext = s + separator + s2.replace(marker,'').strip()
- else:
- # calculate what was after the language links on the page
- firstafter = s2.find(marker)
- if firstafter < 0:
- firstafter = len(s2)
- else:
- firstafter += len(marker)
- # Is there any text in the 'after' part that means we should keep it after?
- if "</noinclude>" in s2[firstafter:]:
- if separatorstripped:
- s = separator + s
- newtext = s2[:firstafter].replace(marker,'') + s + s2[firstafter:]
- elif site.language() in site.family.categories_last:
- cats = getCategoryLinks(s2, site = site)
- s2 = removeCategoryLinksAndSeparator(s2.replace(marker,'',cseparatorstripped).strip(), site) + separator + s
- newtext = replaceCategoryLinks(s2, cats, site=site, addOnly=True)
- elif site.family.name == 'wikitravel': # for Wikitravel's language links position.
- s = separator + s + separator
- newtext = s2[:firstafter].replace(marker,'') + s + s2[firstafter:]
- else:
- if template or template_subpage:
- if template_subpage:
- includeOn = '<includeonly>'
- includeOff = '</includeonly>'
- else:
- includeOn = '<noinclude>'
- includeOff = '</noinclude>'
- separator = ''
- # Do we have a noinclude at the end of the template?
- parts = s2.split(includeOff)
- lastpart = parts[-1]
- if re.match('\s*%s' % marker, lastpart):
- # Put the langlinks back into the noinclude's
- regexp = re.compile('%s\s*%s' % (includeOff, marker))
- newtext = regexp.sub(s + includeOff, s2)
- else:
- # Put the langlinks at the end, inside noinclude's
- newtext = s2.replace(marker,'').strip() + separator + u'%s\n%s%s\n' % (includeOn, s, includeOff)
- else:
- newtext = s2.replace(marker,'').strip() + separator + s
- else:
- newtext = s2.replace(marker,'')
- return newtext
-
-def interwikiFormat(links, insite = None):
- """Convert interwiki link dict into a wikitext string.
-
- 'links' should be a dict with the Site objects as keys, and Page
- objects as values.
-
- Return a unicode string that is formatted for inclusion in insite
- (defaulting to the current site).
- """
- if insite is None:
- insite = getSite()
- if not links:
- return ''
-
- ar = interwikiSort(links.keys(), insite)
- s = []
- for site in ar:
- try:
- link = links[site].aslink(forceInterwiki=True).replace('[[:', '[[')
- s.append(link)
- except AttributeError:
- s.append(getSite(site).linkto(links[site], othersite=insite))
- if insite.lang in insite.family.interwiki_on_one_line:
- sep = u' '
- else:
- sep = u'\r\n'
- s=sep.join(s) + u'\r\n'
- return s
-
-# Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
- if insite is None:
- insite = getSite()
- if not sites:
- return []
-
- sites.sort()
- putfirst = insite.interwiki_putfirst()
- if putfirst:
- #In this case I might have to change the order
- firstsites = []
- for code in putfirst:
- # The code may not exist in this family?
- if code in insite.family.obsolete:
- code = insite.family.obsolete[code]
- if code in insite.validLanguageLinks():
- site = insite.getSite(code = code)
- if site in sites:
- del sites[sites.index(site)]
- firstsites = firstsites + [site]
- sites = firstsites + sites
- if insite.interwiki_putfirst_doubled(sites): #some implementations return False
- sites = insite.interwiki_putfirst_doubled(sites) + sites
-
- return sites
-
-# Wikitext manipulation functions dealing with category links
-def getCategoryLinks(text, site):
- import catlib
- """Return a list of category links found in text.
-
- List contains Category objects.
- Do not call this routine directly, use Page.categories() instead.
-
- """
- result = []
- # Ignore category links within nowiki tags, pre tags, includeonly tags,
- # and HTML comments
- text = removeDisabledParts(text)
- catNamespace = '|'.join(site.category_namespaces())
- R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)(?:\|(?P<sortKey>.+?))?\s*\]\]' % catNamespace, re.I)
- for match in R.finditer(text):
- cat = catlib.Category(site, '%s:%s' % (match.group('namespace'), match.group('catName')), sortKey = match.group('sortKey'))
- result.append(cat)
- return result
-
-def removeCategoryLinks(text, site, marker = ''):
- """Return text with all category links removed.
-
- Put the string marker after the last replacement (at the end of the text
- if there is no replacement).
-
- """
- # This regular expression will find every link that is possibly an
- # interwiki link, plus trailing whitespace. The language code is grouped.
- # NOTE: This assumes that language codes only consist of non-capital
- # ASCII letters and hyphens.
- catNamespace = '|'.join(site.category_namespaces())
- categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
- text = replaceExcept(text, categoryR, '', ['nowiki', 'comment', 'math', 'pre', 'source'], marker = marker)
- if marker:
- #avoid having multiple linefeeds at the end of the text
- text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, text.strip())
- return text.strip()
-
-def removeCategoryLinksAndSeparator(text, site = None, marker = '', separator = ''):
- """Return text with all category links, plus any preceeding whitespace
- and separateor occurrences removed.
-
- Put the string marker after the last replacement (at the end of the text
- if there is no replacement).
-
- """
- if separator:
- mymarker = findmarker(text, u'@C@')
- newtext = removeCategoryLinks(text, site, mymarker)
- mymarker = expandmarker(newtext, mymarker, separator)
- return newtext.replace(mymarker, marker)
- else:
- return removeCategoryLinks(text, site, marker)
-
-def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
- """Replace the category oldcat with the category newcat and return
- the modified text.
-
- """
- if site is None:
- site = getSite()
-
- catNamespace = '|'.join(site.category_namespaces())
- title = oldcat.titleWithoutNamespace()
- if not title:
- return
- # title might contain regex special characters
- title = re.escape(title)
- # title might not be capitalized correctly on the wiki
- if title[0].isalpha() and not site.nocapitalize:
- title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
- # spaces and underscores in page titles are interchangeable, and collapsible
- title = title.replace(r"\ ", "[ _]+").replace(r"\_", "[ _]+")
- categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
- % (catNamespace, title), re.I)
- if newcat is None:
- text = replaceExcept(oldtext, categoryR, '',
- ['nowiki', 'comment', 'math', 'pre', 'source'])
- else:
- text = replaceExcept(oldtext, categoryR,
- '[[%s:%s\\2' % (site.namespace(14),
- newcat.titleWithoutNamespace()),
- ['nowiki', 'comment', 'math', 'pre', 'source'])
- return text
-
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
- """Replace the category links given in the wikitext given
- in oldtext by the new links given in new.
-
- 'new' should be a list of Category objects.
-
- If addOnly is True, the old category won't be deleted and
- the category(s) given will be added
- (and so they won't replace anything).
- """
-
- # Find a marker that is not already in the text.
- marker = findmarker( oldtext, u'@@')
- if site is None:
- site = getSite()
- if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
- raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…')
- separator = site.family.category_text_separator
- iseparator = site.family.interwiki_text_separator
- separatorstripped = separator.strip()
- iseparatorstripped = iseparator.strip()
- if addOnly:
- s2 = oldtext
- else:
- s2 = removeCategoryLinksAndSeparator(oldtext, site = site, marker = marker, separator = separatorstripped)
- s = categoryFormat(new, insite = site)
- if s:
- if site.language() in site.family.category_attop:
- newtext = s + separator + s2
- else:
- # calculate what was after the categories links on the page
- firstafter = s2.find(marker)
- if firstafter < 0:
- firstafter = len(s2)
- else:
- firstafter += len(marker)
- # Is there any text in the 'after' part that means we should keep it after?
- if "</noinclude>" in s2[firstafter:]:
- if separatorstripped:
- s = separator + s
- newtext = s2[:firstafter].replace(marker,'') + s + s2[firstafter:]
- elif site.language() in site.family.categories_last:
- newtext = s2.replace(marker,'').strip() + separator + s
- else:
- interwiki = getLanguageLinks(s2)
- s2 = removeLanguageLinksAndSeparator(s2.replace(marker,''), site, '', iseparatorstripped) + separator + s
- newtext = replaceLanguageLinks(s2, interwiki, site = site, addOnly = True)
- else:
- newtext = s2.replace(marker,'')
- return newtext.strip()
-
-def categoryFormat(categories, insite = None):
- """Return a string containing links to all categories in a list.
-
- 'categories' should be a list of Category objects.
-
- The string is formatted for inclusion in insite.
- """
- if not categories:
- return ''
- if insite is None:
- insite = getSite()
- catLinks = [category.aslink(noInterwiki = True) for category in categories]
- if insite.category_on_one_line():
- sep = ' '
- else:
- sep = '\r\n'
- # Some people don't like the categories sorted
- #catLinks.sort()
- return sep.join(catLinks) + '\r\n'
-
-def compileLinkR(withoutBracketed=False, onlyBracketed=False):
- """Return a regex that matches external links."""
- # RFC 2396 says that URLs may only contain certain characters.
- # For this regex we also accept non-allowed characters, so that the bot
- # will later show these links as broken ('Non-ASCII Characters in URL').
- # Note: While allowing parenthesis inside URLs, MediaWiki will regard
- # right parenthesis at the end of the URL as not part of that URL.
- # The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
- # So characters inside the URL can be anything except whitespace,
- # closing squared brackets, quotation marks, greater than and less
- # than, and the last character also can't be parenthesis or another
- # character disallowed by MediaWiki.
- notInside = '\]\s<>"'
- # The first half of this regular expression is required because '' is
- # not allowed inside links. For example, in this wiki text:
- # ''Please see http://www.example.org.''
- # .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
- + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
- + ']*[^' + notAtEnd + '])'
-
- if withoutBracketed:
- regex = r'(?<!\[)' + regex
- elif onlyBracketed:
- regex = r'\[' + regex
- linkR = re.compile(regex)
- return linkR
-
# end of category specific code
def url2link(percentname, insite, site):
"""Convert urlname of a wiki page into interwiki link format.
@@ -8130,247 +7427,6 @@
# Set socket timeout
socket.setdefaulttimeout(config.socket_timeout)
-# Languages to use for comment text after the actual language but before
-# en:. For example, if for language 'xx', you want the preference of
-# languages to be:
-# xx:, then fr:, then ru:, then en:
-# you let altlang return ['fr','ru'].
-# This code is used by translate() below.
-
def altlang(code):
    """Return a list of fallback language codes for the given code.

    The returned languages are tried, in order, after the language itself
    but before 'en:' when looking up a translation (used by translate()
    below). Returns an empty list when no alternatives are defined.

    @param code: a language code
    """
    #Amharic
    if code in ['aa', 'om']:
        return ['am']
    #Arab
    if code in ['arc', 'arz']:
        return ['ar']
    if code == 'kab':
        return ['ar', 'fr']
    #Bulgarian
    if code in ['cu', 'mk']:
        return ['bg', 'sr', 'sh']
    #Czech
    if code in ['cs', 'sk']:
        return ['cs', 'sk']
    #German
    if code in ['bar', 'ksh', 'pdc']:
        return ['de']
    if code in ['als', 'lb']:
        return ['de', 'fr']
    if code == 'nds':
        return ['nds-nl', 'de']
    if code in ['dsb', 'hsb']:
        return ['hsb', 'dsb', 'de']
    if code == 'rm':
        return ['de', 'it']
    if code == 'stq':
        return ['fy', 'de']
    #Greek
    if code == 'pnt':
        return ['el']
    #Esperanto
    if code in ['io', 'nov']:
        return ['eo']
    #Spanish
    if code in ['an', 'ast', 'ay', 'ca', 'ext', 'lad', 'nah', 'nv', 'qu']:
        return ['es']
    if code in ['gl', 'gn']:
        return ['es', 'pt']
    # BUGFIX: was "code == ['eu']" (string compared to a list, never true)
    if code == 'eu':
        return ['es', 'fr']
    if code in ['bcl', 'cbk-zam', 'ceb', 'ilo', 'pag', 'pam', 'tl', 'war']:
        return ['es', 'tl']
    #Estonian
    if code == 'fiu-vro':
        return ['et']
    #Persian (Farsi)
    if code in ['glk', 'mzn']:
        return ['ar']
    #French
    # NOTE: 'kab' also appears here but is already handled above; the
    # earlier check wins.
    if code in ['bm', 'br', 'ht', 'kab', 'kg', 'ln', 'mg', 'nrm', 'oc',
                'pcd', 'rw', 'sg', 'ty', 'wa']:
        return ['fr']
    if code == 'co':
        return ['fr', 'it']
    #Hindi
    if code in ['bh', 'pi', 'sa']:
        return ['hi']
    if code in ['ne', 'new']:
        return ['ne', 'new', 'hi']
    #Indonesian and Malay
    if code in ['ace', 'bug', 'id', 'jv', 'ms', 'su']:
        return ['id', 'ms', 'jv']
    if code == 'map-bms':
        return ['jv', 'id', 'ms']
    #Inuit languages
    if code in ['ik', 'iu']:
        return ['iu', 'kl']
    if code == 'kl':
        return ['iu', 'da', 'no']
    #Italian
    if code in ['eml', 'fur', 'lij', 'lmo', 'nap', 'pms', 'roa-tara', 'sc',
                'scn', 'vec']:
        return ['it']
    if code == 'frp':
        return ['it', 'fr']
    #Lithuanian
    if code in ['bat-smg', 'ltg']:
        return ['lt']
    #Dutch
    if code in ['fy', 'li', 'pap', 'srn', 'vls', 'zea']:
        return ['nl']
    # BUGFIX: was "code == ['nds-nl']" (string compared to a list, never true)
    if code == 'nds-nl':
        return ['nds', 'nl']
    #Polish
    if code in ['csb', 'szl']:
        return ['pl']
    #Portuguese
    if code in ['fab', 'mwl', 'tet']:
        return ['pt']
    #Romanian
    if code in ['mo', 'roa-rup']:
        return ['ro']
    #Russian and Belarusian
    if code in ['ab', 'av', 'ba', 'bxr', 'ce', 'cv', 'kk', 'ky', 'lbe', 'mdf',
                'mhr', 'myv', 'os', 'sah', 'tg', 'tt', 'udm', 'uk', 'xal']:
        return ['ru']
    if code in ['be', 'be-x-old']:
        return ['be', 'be-x-old', 'ru']
    if code == 'kaa':
        return ['uz', 'ru']
    #Serbocroatian
    if code in ['bs', 'hr', 'sh', 'sr']:
        return ['sh', 'hr', 'bs', 'sr']
    #Turkish and Kurdish
    if code in ['diq', 'ku']:
        return ['ku', 'tr']
    if code == 'ckb':
        return ['ku', 'ar']
    #Chinese
    # NOTE: 'zh-classical' appears in both lists; the first check wins.
    if code in ['minnan', 'zh', 'zh-classical', 'zh-min-nan', 'zh-tw',
                'zh-hans', 'zh-hant']:
        return ['zh', 'zh-tw', 'zh-cn', 'zh-classical']
    if code in ['cdo', 'gan', 'hak', 'ii', 'wuu', 'za', 'zh-cdo',
                'zh-classical', 'zh-cn', 'zh-yue']:
        return ['zh', 'zh-cn', 'zh-tw', 'zh-classical']
    #Scandinavian languages
    if code in ['da', 'sv']:
        return ['da', 'no', 'nb', 'sv', 'nn']
    if code in ['fo', 'is']:
        return ['da', 'no', 'nb', 'nn', 'sv']
    if code == 'nn':
        return ['no', 'nb', 'sv', 'da']
    if code in ['nb', 'no']:
        return ['no', 'nb', 'da', 'nn', 'sv']
    if code == 'se':
        return ['sv', 'no', 'nb', 'nn', 'fi']
    #Other languages
    if code in ['bi', 'tpi']:
        return ['bi', 'tpi']
    if code == 'yi':
        return ['he', 'de']
    if code in ['ia', 'ie']:
        return ['ia', 'la', 'it', 'fr', 'es']
    #Default value
    return []
-
def translate(code, xdict):
    """Return the most appropriate translation from a translation dict.

    Given a language code and a dictionary, returns the dictionary's value for
    key 'code' if this key exists; otherwise tries to return a value for an
    alternative language that is most applicable to use on the Wikipedia in
    language 'code'.

    The language itself is always checked first, then languages that
    have been defined to be alternatives, and finally English. If none of
    the options gives result, we just take the first language in the
    list.

    @param code: a language code, or a Site object (its .lang is used)
    @param xdict: a dict of translations keyed by language code, possibly
        nested one level deeper by family name; any non-dict value is
        returned unchanged.
    """
    # If a site is given instead of a code, use its language
    if hasattr(code, 'lang'):
        code = code.lang

    # A 'wikipedia' key means xdict is keyed by family name: pick the
    # current family's sub-dict, falling back to the 'wikipedia' entry.
    # NOTE: default_family is a module-level global defined elsewhere
    # in this file.
    if 'wikipedia' in xdict:
        if default_family in xdict:
            xdict = xdict[default_family]
        else:
            xdict = xdict['wikipedia']

    # A non-dict value is language-independent; return it unchanged.
    if not isinstance(xdict, dict):
        return xdict

    if code in xdict:
        return xdict[code]
    for alt in altlang(code):
        if alt in xdict:
            return xdict[alt]
    if '_default' in xdict:
        return xdict['_default']
    elif 'en' in xdict:
        return xdict['en']
    # Last resort: an arbitrary entry. list() is required because dict
    # views are not subscriptable on Python 3.
    return list(xdict.values())[0]
-
def showDiff(oldtext, newtext):
    """
    Prints a string showing the differences between oldtext and newtext.
    The differences are highlighted (only on Unix systems) to show which
    changes were made.

    The result is printed via the module-level output() function using
    \03{colorname} ... \03{default} markup (presumably interpreted by
    output() — confirm against the terminal layer).
    """
    # For information on difflib, see http://docs.python.org/library/difflib.html
    color = {
        '+': 'lightgreen',
        '-': 'lightred',
    }
    # 'diff' accumulates the visible diff text; 'colors' is a parallel
    # list with one entry per character of 'diff' (a color name, or None
    # for the default color).
    diff = u''
    colors = []
    # This will store the last line beginning with + or -.
    lastline = None
    # For testing purposes only: show original, uncolored diff
    # for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
    # print line
    for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
        if line.startswith('?'):
            # A '?' line from ndiff marks the changed positions of the
            # preceding '+'/'-' line, so colorize those positions.
            # initialize color vector with None, which means default color
            lastcolors = [None for c in lastline]
            # colorize the + or - sign
            lastcolors[0] = color[lastline[0]]
            # colorize changed parts in red or green
            for i in range(min(len(line), len(lastline))):
                if line[i] != ' ':
                    lastcolors[i] = color[lastline[0]]
            diff += lastline + '\n'
            # append one None (default color) for the newline character
            colors += lastcolors + [None]
        elif lastline:
            # Previous '+'/'-' line had no '?' detail line after it:
            # emit it with only its sign colorized.
            diff += lastline + '\n'
            # colorize the + or - sign only
            lastcolors = [None for c in lastline]
            lastcolors[0] = color[lastline[0]]
            colors += lastcolors + [None]
        lastline = None
        if line[0] in ('+', '-'):
            lastline = line
    # there might be one + or - line left that wasn't followed by a ? line.
    if lastline:
        diff += lastline + '\n'
        # colorize the + or - sign only
        lastcolors = [None for c in lastline]
        lastcolors[0] = color[lastline[0]]
        colors += lastcolors + [None]

    # Walk diff/colors in lockstep, inserting \03{...} markers whenever
    # the color changes, then hand the marked-up string to output().
    result = u''
    lastcolor = None
    for i in range(len(diff)):
        if colors[i] != lastcolor:
            if lastcolor is None:
                result += '\03{%s}' % colors[i]
            else:
                result += '\03{default}'
            lastcolor = colors[i]
        result += diff[i]
    output(result)
-
def writeToCommandLogFile():
"""
Save the name of the called module along with all parameters to
@@ -8696,7 +7752,7 @@
output( u'ERROR: %s caused error %s. Dump %s created.' % (name,error,filename) )
get_throttle = Throttle(config.minthrottle,config.maxthrottle)
-put_throttle = Throttle(config.put_throttle,config.put_throttle,False)
+put_throttle = Throttle(config.put_throttle,config.put_throttle,multiplydelay=False)
def decompress_gzip(data):
# Use cStringIO if available
Revision: 7956
Author: xqt
Date: 2010-02-24 13:24:37 +0000 (Wed, 24 Feb 2010)
Log Message:
-----------
query: Do not dump page text in verbose option
Modified Paths:
--------------
trunk/pywikipedia/query.py
Modified: trunk/pywikipedia/query.py
===================================================================
--- trunk/pywikipedia/query.py 2010-02-24 13:13:24 UTC (rev 7955)
+++ trunk/pywikipedia/query.py 2010-02-24 13:24:37 UTC (rev 7956)
@@ -77,7 +77,7 @@
wikipedia.output(u"%s: (%d items)" % (data.keys()[0], titlecount ) )
for k, v in params.iteritems():
- if k not in ['action', 'format', 'file', 'xml']:
+ if k not in ['action', 'format', 'file', 'xml', 'text']:
if k == 'lgpassword' and wikipedia.verbose == 1:
v = u'XXXXX'
elif not isinstance(v, unicode):