Revision: 6184
Author: russblau
Date: 2008-12-22 15:42:59 +0000 (Mon, 22 Dec 2008)
Log Message:
-----------
Merge purodha's recent changes from trunk, fix long lines and spacing
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Property Changed:
----------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2008-12-22 15:42:33 UTC (rev 6183)
+++ branches/rewrite/pywikibot/textlib.py 2008-12-22 15:42:59 UTC (rev 6184)
@@ -93,7 +93,9 @@
# images.
'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
- % '|'.join(site.validLanguageLinks() +
site.family.obsolete.keys())),
+ % '|'.join(site.validLanguageLinks()
+ + site.family.obsolete.keys())
+ ),
}
@@ -132,25 +134,28 @@
excMatch.start() < nextExceptionMatch.start()):
nextExceptionMatch = excMatch
- if nextExceptionMatch is not None and nextExceptionMatch.start() <=
match.start():
- # an HTML comment or text in nowiki tags stands before the next valid match.
Skip.
+ if nextExceptionMatch is not None \
+ and nextExceptionMatch.start() <= match.start():
+ # an HTML comment or text in nowiki tags stands before the next
+ # valid match. Skip.
index = nextExceptionMatch.end()
else:
# We found a valid match. Replace it.
if callable(new):
- # the parameter new can be a function which takes the match as a
parameter.
+ # the parameter new can be a function which takes the match
+ # as a parameter.
replacement = new(match)
else:
# it is not a function, but a string.
- # it is a little hack to make \n work. It would be better to fix it
- # previously, but better than nothing.
+ # it is a little hack to make \n work. It would be better
+ # to fix it previously, but better than nothing.
new = new.replace('\\n', '\n')
# We cannot just insert the new string, as it may contain regex
# group references such as \2 or \g<name>.
- # On the other hand, this approach does not work because it can't
- # handle lookahead or lookbehind (see bug #1731008):
+ # On the other hand, this approach does not work because it
+ # can't handle lookahead or lookbehind (see bug #1731008):
#replacement = old.sub(new, text[match.start():match.end()])
#text = text[:match.start()] + replacement + text[match.end():]
@@ -162,8 +167,11 @@
groupMatch = groupR.search(replacement)
if not groupMatch:
break
- groupID = groupMatch.group('name') or
int(groupMatch.group('number'))
- replacement = replacement[:groupMatch.start()] + match.group(groupID)
+ replacement[groupMatch.end():]
+ groupID = (groupMatch.group('name')
+ or int(groupMatch.group('number')))
+ replacement = (replacement[:groupMatch.start()]
+ + match.group(groupID)
+ + replacement[groupMatch.end():])
text = text[:match.start()] + replacement + text[match.end():]
# continue the search on the remaining text
@@ -210,14 +218,45 @@
For the tags parameter, see removeDisabledParts() above.
"""
# Find a marker that is not already in the text.
- marker = '@@'
- while marker in text:
- marker += '@'
+ marker = findmarker(text, '@@', '@')
text = text[:index] + marker + text[index:]
text = removeDisabledParts(text, tags)
return (marker not in text)
+def findmarker(text, startwith = u'@', append = u'@'):
+ # find a string which is not part of text
+ if len(append) <= 0:
+ append = u'@'
+ mymarker = startwith
+ while mymarker in text:
+ mymarker += append
+ return mymarker
+
+
+def expandmarker(text, marker = '', separator = ''):
+ # set to remove any number of separator occurrences plus arbitrary
+ # whitespace before, after, and between them,
+ # by allowing to include them into marker.
+ if separator:
+ firstinmarker = text.find(marker)
+ firstinseparator = firstinmarker
+ lenseparator = len(separator)
+ striploopcontinue = True
+ while firstinseparator > 0 and striploopcontinue:
+ striploopcontinue = False
+ if ( (firstinseparator >= lenseparator) and
+ (separator ==
+ text[firstinseparator-lenseparator:firstinseparator])):
+ firstinseparator -= lenseparator
+ striploopcontinue = True
+ elif text[firstinseparator-1] < ' ':
+ firstinseparator -= 1
+ striploopcontinue = True
+ marker = text[firstinseparator:firstinmarker] + marker
+ return marker
+
+
# Functions dealing with interwiki language links
# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
@@ -289,11 +328,32 @@
interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*'
% languages, re.IGNORECASE)
text = replaceExcept(text, interwikiR, '',
- ['nowiki', 'comment', 'math',
'pre', 'source'], marker=marker)
+ ['nowiki', 'comment', 'math',
'pre', 'source'],
+ marker=marker)
return text.strip()
-def replaceLanguageLinks(oldtext, new, site = None):
+def removeLanguageLinksAndSeparator(text, site = None, marker = '', separator =
''):
+ """
+    Return text with all interlanguage links, plus any preceding whitespace
+    and separator occurrences removed.
+
+ If a link to an unknown language is encountered, a warning is printed.
+ If a marker is defined, that string is placed at the location of the
+ last occurence of an interwiki link (at the end if there are no
+ interwiki links).
+
+ """
+ if separator:
+ mymarker = findmarker(text, u'@L@')
+ newtext = removeLanguageLinks(text, site, mymarker)
+ mymarker = expandmarker(newtext, mymarker, separator)
+ return newtext.replace(mymarker, marker)
+ else:
+ return removeLanguageLinks(text, site, marker)
+
+
+def replaceLanguageLinks(oldtext, new, site = None, addOnly = False):
"""Replace interlanguage links in the text with a new set of links.
'new' should be a dict with the Site objects as keys, and Page objects
@@ -302,31 +362,45 @@
"""
# Find a marker that is not already in the text.
- marker = '@@'
- while marker in oldtext:
- marker += '@'
+ marker = findmarker( oldtext, u'@@')
if site == None:
site = pywikibot.getSite()
+ separator = site.family.interwiki_text_separator
+ cseparator = site.family.category_text_separator
+ separatorstripped = separator.strip()
+ cseparatorstripped = cseparator.strip()
+ if addOnly:
+ s2 = oldtext
+ else:
+ s2 = removeLanguageLinksAndSeparator(oldtext, site=site, marker=marker,
+ separator=separatorstripped)
s = interwikiFormat(new, insite = site)
- s2 = removeLanguageLinks(oldtext, site = site, marker = marker)
if s:
separator = site.family.interwiki_text_separator
if site.language() in site.family.interwiki_attop:
newtext = s + separator + s2.replace(marker,'').strip()
else:
# calculate what was after the language links on the page
- firstafter = s2.find(marker) + len(marker)
+ firstafter = s2.find(marker)
+ if firstafter < 0:
+ firstafter = len(s2)
+ else:
+ firstafter += len(marker)
# Any text in 'after' part that means we should keep it after?
if "</noinclude>" in s2[firstafter:]:
- newtext = s2[:firstafter] + s + s2[firstafter:]
+ if separatorstripped:
+ s = separator + s
+ newtext = s2[:firstafter].replace(marker,'') + s \
+ + s2[firstafter:]
elif site.language() in site.family.categories_last:
cats = getCategoryLinks(s2, site = site)
- s2 = removeCategoryLinks(s2.replace(marker,'').strip(),
- site) + separator + s
- newtext = replaceCategoryLinks(s2, cats, site=site)
+ s2 = removeCategoryLinksAndSeparator(
+ s2.replace(marker, '', cseparatorstripped).strip(),
+ site) + separator + s
+ newtext = replaceCategoryLinks(s2, cats, site=site,
+ addOnly=True)
else:
newtext = s2.replace(marker,'').strip() + separator + s
- newtext = newtext.replace(marker,'')
else:
newtext = s2.replace(marker,'')
return newtext
@@ -385,7 +459,8 @@
del sites[sites.index(site)]
firstsites = firstsites + [site]
sites = firstsites + sites
- if insite.interwiki_putfirst_doubled(sites): #some implementations return False
+ if insite.interwiki_putfirst_doubled(sites):
+ #some (all?) implementations return False
sites = insite.interwiki_putfirst_doubled(sites) + sites
return sites
@@ -420,7 +495,7 @@
"""Return text with all category links removed.
Put the string marker after the last replacement (at the end of the text
- if there is no replacement).
+ if there is no replacement).
"""
# This regular expression will find every link that is possibly an
@@ -429,13 +504,34 @@
# ASCII letters and hyphens.
catNamespace = '|'.join(site.category_namespaces())
categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
- text = replaceExcept(text, categoryR, '', ['nowiki',
'comment', 'math', 'pre', 'source'], marker = marker)
+ text = replaceExcept(text, categoryR, '',
+ ['nowiki', 'comment', 'math',
'pre', 'source'],
+ marker=marker)
if marker:
#avoid having multiple linefeeds at the end of the text
- text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker,
text.strip())
+ text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker,
+ text.strip())
return text.strip()
+def removeCategoryLinksAndSeparator(text, site=None, marker='',
separator=''):
+ """
+    Return text with all category links, plus any preceding whitespace
+    and separator occurrences removed.
+
+ Put the string marker after the last replacement (at the end of the text
+ if there is no replacement).
+
+ """
+ if separator:
+ mymarker = findmarker(text, u'@C@')
+ newtext = removeCategoryLinks(text, site, mymarker)
+ mymarker = expandmarker(newtext, mymarker, separator)
+ return newtext.replace(mymarker, marker)
+ else:
+ return removeCategoryLinks(text, site, marker)
+
+
def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
"""Replace the category oldcat with the category newcat and return
the modified text.
@@ -453,7 +549,7 @@
# title might not be capitalized correctly on the wiki
if title[0].isalpha() and not site.nocapitalize:
title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
- # spaces and underscores in page titles are interchangeable, and collapsible
+ # spaces and underscores in page titles are interchangeable and collapsible
title = title.replace(r"\ ", "[ _]+").replace(r"\_",
"[ _]+")
categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
% (catNamespace, title), re.I)
@@ -470,31 +566,34 @@
def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
"""Replace the category links given in the wikitext given
- in oldtext by the new links given in new.
+ in oldtext by the new links given in new.
- 'new' should be a list of Category objects.
+ 'new' should be a list of Category objects.
- If addOnly is True, the old category won't be deleted and
- the category(s) given will be added
- (and so they won't replace anything).
+    If addOnly is True, the old category won't be deleted and the
+ category(s) given will be added (and so they won't replace anything).
+
"""
-
# Find a marker that is not already in the text.
- marker = '@@'
- while marker in oldtext:
- marker += '@'
-
+ marker = findmarker( oldtext, u'@@')
if site is None:
site = pywikibot.getSite()
if site.sitename() == 'wikipedia:de' and "{{Personendaten" in
oldtext:
- raise Error('The PyWikipediaBot is no longer allowed to touch categories on
the German Wikipedia on pages that contain the person data template because of the
non-standard placement of that template. See
http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…)
-
- s = categoryFormat(new, insite = site)
+ raise Error("""\
+The PyWikipediaBot is no longer allowed to touch categories on the German
+Wikipedia on pages that contain the Personendaten template because of the
+non-standard placement of that template.
+See
http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…)
+ separator = site.family.category_text_separator
+ iseparator = site.family.interwiki_text_separator
+ separatorstripped = separator.strip()
+ iseparatorstripped = iseparator.strip()
if addOnly:
s2 = oldtext
else:
- s2 = removeCategoryLinks(oldtext, site = site, marker = marker)
-
+ s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker,
+ separator=separatorstripped)
+ s = categoryFormat(new, insite = site)
if s:
separator = site.family.category_text_separator
if site.language() in site.family.category_attop:
@@ -502,20 +601,28 @@
else:
# calculate what was after the categories links on the page
firstafter = s2.find(marker)
- # Any text in 'after' part that means we should keep it after?
+ if firstafter < 0:
+ firstafter = len(s2)
+ else:
+ firstafter += len(marker)
+ # Is there text in the 'after' part that means we should keep it
+ # after?
if "</noinclude>" in s2[firstafter:]:
- newtext = s2[:firstafter] + s + s2[firstafter:]
+ if separatorstripped:
+ s = separator + s
+ newtext = (s2[:firstafter].replace(marker,'') + s
+ + s2[firstafter:])
elif site.language() in site.family.categories_last:
newtext = s2.replace(marker,'').strip() + separator + s
else:
interwiki = getLanguageLinks(s2)
- s2 = removeLanguageLinks(s2.replace(marker,''), site
- ) + separator + s
- newtext = replaceLanguageLinks(s2, interwiki, site)
- newtext = newtext.replace(marker,'')
+ s2 = removeLanguageLinksAndSeparator(
+ s2.replace(marker,''), site, '',
iseparatorstripped
+ ) + separator + s
+ newtext = replaceLanguageLinks(s2, interwiki, site=site,
+ addOnly=True)
else:
- s2 = s2.replace(marker,'')
- return s2
+ newtext = s2.replace(marker,'')
return newtext.strip()
@@ -559,7 +666,9 @@
# not allowed inside links. For example, in this wiki text:
# ''Please see
http://www.example.org.''
# .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' +
notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside
+ ']*[^' + notAtEnd + '])'
+ regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' +
notAtEnd \
+ + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' +
notInside \
+ + ']*[^' + notAtEnd + '])'
if withoutBracketed:
regex = r'(?<!\[)' + regex
@@ -568,6 +677,7 @@
linkR = re.compile(regex)
return linkR
+
def extract_templates_and_params(text, get_redirect=False):
"""Return list of template calls found in text.
Property changes on: branches/rewrite/pywikibot/textlib.py
___________________________________________________________________
Added: svn:mergeinfo
+ /trunk/pywikipedia/wikipedia.py:6154-6179