[Pywikipedia-l] SVN: [6257] branches/rewrite/pywikibot/page.py
russblau at svn.wikimedia.org
russblau at svn.wikimedia.org
Wed Jan 14 18:21:42 UTC 2009
Revision: 6257
Author: russblau
Date: 2009-01-14 18:21:42 +0000 (Wed, 14 Jan 2009)
Log Message:
-----------
Improve Link parsing, and fix minor category bugs
Modified Paths:
--------------
branches/rewrite/pywikibot/page.py
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2009-01-14 15:17:47 UTC (rev 6256)
+++ branches/rewrite/pywikibot/page.py 2009-01-14 18:21:42 UTC (rev 6257)
@@ -1049,7 +1049,6 @@
If newCat is None, the category will be removed.
""" # TODO: document remaining arguments
- cats = self.categories(get_redirect=True)
site = self.site()
changesMade = False
@@ -1092,6 +1091,7 @@
# and remove duplicates.
newCatList = []
newCatSet = set()
+ cats = list(self.categories(get_redirect=True))
for i in range(len(cats)):
cat = cats[i]
if cat == oldCat:
@@ -1295,8 +1295,7 @@
class Category(Page):
"""A page in the Category: namespace"""
- @deprecate_arg("sortKey", None)
- def __init__(self, source, title=u"", insite=None):
+ def __init__(self, source, title=u"", insite=None, sortKey=None):
"""All parameters are the same as for Page() constructor.
"""
@@ -1304,6 +1303,7 @@
if self.namespace() != 14:
raise ValueError(u"'%s' is not in the category namespace!"
% title)
+ self.sortKey = sortKey
@deprecate_arg("forceInterwiki", None)
@deprecate_arg("textlink", None)
@@ -1556,28 +1556,22 @@
"""
self._text = text
- self._source = source
+ self._source = source or pywikibot.Site()
self._defaultns = defaultNamespace
- def parse(self):
- """Parse text; called internally when accessing attributes"""
-
- # First remove the anchor, which is stored unchanged, if there is one
+ # preprocess text (these changes aren't site-dependent)
+ # First remove anchor, which is stored unchanged, if there is one
if u"|" in self._text:
self._text, self._anchor = self._text.split(u"|", 1)
else:
self._anchor = None
- if self._source is None:
- self._source = pywikibot.Site()
- self._site = self._source
-
# Clean up the name, it can come from anywhere.
# Convert HTML entities to unicode
t = html2unicode(self._text)
# Convert URL-encoded characters to unicode
- t = url2unicode(t, site=self._site)
+ t = url2unicode(t, site=self._source)
# Normalize unicode string to a NFC (composed) format to allow proper
# string comparisons. According to
@@ -1590,7 +1584,6 @@
#
if u'\ufffd' in t:
raise pywikibot.Error("Title contains illegal char (\\uFFFD)")
- self._namespace = self._defaultns
# Replace underscores by spaces
t = t.replace(u"_", u" ")
@@ -1600,9 +1593,63 @@
t = t.strip(" ")
# Remove left-to-right and right-to-left markers.
t = t.replace(u"\u200e", u"").replace(u"\u200f", u"")
+ self._text = t
+ def parse_site(self):
+ """Parse only enough text to determine the host site."""
+
+ t = self._text
+ self._site = self._source
firstPass = True
while u":" in t:
+ # Initial colon
+ if t.startswith(u":"):
+ # remove the colon but continue processing
+ # remove any subsequent whitespace
+ t = t.lstrip(u":").lstrip(u" ")
+ continue
+ fam = self._site.family
+ prefix = t[ :t.index(u":")].lower() # part of text before :
+ ns = self._site.ns_index(prefix)
+ if ns:
+ # Ordinary namespace
+ return
+ if prefix in fam.langs.keys()\
+ or prefix in fam.get_known_families(site=self._site):
+ # looks like an interwiki link
+ if not firstPass:
+ return
+ t = t[t.index(u":"): ].lstrip(u": ") # part of text after :
+ if prefix in fam.langs.keys():
+ newsite = pywikibot.Site(prefix, fam)
+ else:
+ otherlang = self._site.code
+ familyName = fam.get_known_families(site=self._site)[prefix]
+ if familyName in ['commons', 'meta']:
+ otherlang = familyName
+ try:
+ newsite = pywikibot.Site(otherlang, familyName)
+ except ValueError:
+ return
+ # Redundant interwiki prefix to the local wiki
+ if newsite == self._site:
+ firstPass = False
+ continue
+ self._site = newsite
+ else:
+ return # text before : doesn't match any known prefix
+
+ def parse(self):
+ """Parse text; called internally when accessing attributes"""
+
+ self._site = self._source
+ self._namespace = self._defaultns
+ t = self._text
+
+ # This code was adapted from Title.php : secureAndSplit()
+ #
+ firstPass = True
+ while u":" in t:
# Initial colon indicates main namespace rather than default
if t.startswith(u":"):
self._namespace = 0
@@ -1707,7 +1754,7 @@
@property
def site(self):
if not hasattr(self, "_site"):
- self.parse()
+ self.parse_site()
return self._site
@property
@@ -1734,6 +1781,14 @@
self.parse()
return self._anchor
+ def canonical_title(self):
+ """Return full page title, including localized namespace."""
+ if self.namespace:
+ return "%s:%s" % (self.site.namespace(self.namespace),
+ self.title)
+ else:
+ return self.title
+
def astext(self, onsite=None):
"""Return a text representation of the link.
@@ -1763,7 +1818,7 @@
title)
def __str__(self):
- return self.astext()
+ return self.astext().encode("ascii", "backslashreplace")
def __cmp__(self, other):
"""Test for equality and inequality of Link objects.
More information about the Pywikipedia-l
mailing list