[Pywikipedia-l] SVN: [6257] branches/rewrite/pywikibot/page.py

Wed Jan 14 18:21:42 UTC 2009

Revision: 6257
Author:   russblau
Date:     2009-01-14 18:21:42 +0000 (Wed, 14 Jan 2009)

Log Message:
-----------
Improve Link parsing, and fix minor category bugs

Modified Paths:
--------------
    branches/rewrite/pywikibot/page.py

Modified: branches/rewrite/pywikibot/page.py
===================================================================

--- branches/rewrite/pywikibot/page.py	2009-01-14 15:17:47 UTC (rev 6256)
+++ branches/rewrite/pywikibot/page.py	2009-01-14 18:21:42 UTC (rev 6257)
@@ -1049,7 +1049,6 @@
         If newCat is None, the category will be removed.
         
         """ # TODO: document remaining arguments
-        cats = self.categories(get_redirect=True)
         site = self.site()
         changesMade = False
 
@@ -1092,6 +1091,7 @@
         # and remove duplicates.
         newCatList = []
         newCatSet = set()
+        cats = list(self.categories(get_redirect=True))
         for i in range(len(cats)):
             cat = cats[i]
             if cat == oldCat:
@@ -1295,8 +1295,7 @@
 class Category(Page):
     """A page in the Category: namespace"""
 
-    @deprecate_arg("sortKey", None)
-    def __init__(self, source, title=u"", insite=None):
+    def __init__(self, source, title=u"", insite=None, sortKey=None):
         """All parameters are the same as for Page() constructor.
 
         """
@@ -1304,6 +1303,7 @@
         if self.namespace() != 14:
             raise ValueError(u"'%s' is not in the category namespace!"
                              % title)
+        self.sortKey = sortKey
 
     @deprecate_arg("forceInterwiki", None)
     @deprecate_arg("textlink", None)
@@ -1556,28 +1556,22 @@
 
         """
         self._text = text
-        self._source = source
+        self._source = source or pywikibot.Site()
         self._defaultns = defaultNamespace
 
-    def parse(self):
-        """Parse text; called internally when accessing attributes"""
-        
-        # First remove the anchor, which is stored unchanged, if there is one
+        # preprocess text (these changes aren't site-dependent)
+        # First remove anchor, which is stored unchanged, if there is one
         if u"|" in self._text:
             self._text, self._anchor = self._text.split(u"|", 1)
         else:
             self._anchor = None
 
-        if self._source is None:
-            self._source = pywikibot.Site()
-        self._site = self._source
-
         # Clean up the name, it can come from anywhere.
         # Convert HTML entities to unicode
         t = html2unicode(self._text)
 
         # Convert URL-encoded characters to unicode
-        t = url2unicode(t, site=self._site)
+        t = url2unicode(t, site=self._source)
 
         # Normalize unicode string to a NFC (composed) format to allow proper
         # string comparisons. According to
@@ -1590,7 +1584,6 @@
         #
         if u'\ufffd' in t:
             raise pywikibot.Error("Title contains illegal char (\\uFFFD)")
-        self._namespace = self._defaultns
 
         # Replace underscores by spaces
         t = t.replace(u"_", u" ")
@@ -1600,9 +1593,63 @@
         t = t.strip(" ")
         # Remove left-to-right and right-to-left markers.
         t = t.replace(u"\u200e", u"").replace(u"\u200f", u"")
+        self._text = t
 
+    def parse_site(self):
+        """Parse only enough text to determine the host site."""
+
+        t = self._text
+        self._site = self._source
         firstPass = True
         while u":" in t:
+            # Initial colon
+            if t.startswith(u":"):
+                # remove the colon but continue processing
+                # remove any subsequent whitespace
+                t = t.lstrip(u":").lstrip(u" ")
+                continue
+            fam = self._site.family
+            prefix = t[ :t.index(u":")].lower() # part of text before :
+            ns = self._site.ns_index(prefix)
+            if ns:
+                # Ordinary namespace
+                return
+            if prefix in fam.langs.keys()\
+                   or prefix in fam.get_known_families(site=self._site):
+                # looks like an interwiki link
+                if not firstPass:
+                    return
+                t = t[t.index(u":"): ].lstrip(u": ") # part of text after :
+                if prefix in fam.langs.keys():
+                    newsite = pywikibot.Site(prefix, fam)
+                else:
+                    otherlang = self._site.code
+                    familyName = fam.get_known_families(site=self._site)[prefix]
+                    if familyName in ['commons', 'meta']:
+                        otherlang = familyName
+                    try:
+                        newsite = pywikibot.Site(otherlang, familyName)
+                    except ValueError:
+                        return
+                # Redundant interwiki prefix to the local wiki
+                if newsite == self._site:
+                    firstPass = False
+                    continue
+                self._site = newsite
+            else:
+                return   # text before : doesn't match any known prefix
+
+    def parse(self):
+        """Parse text; called internally when accessing attributes"""
+        
+        self._site = self._source
+        self._namespace = self._defaultns
+        t = self._text
+
+        # This code was adapted from Title.php : secureAndSplit()
+        #
+        firstPass = True
+        while u":" in t:
             # Initial colon indicates main namespace rather than default
             if t.startswith(u":"):
                 self._namespace = 0
@@ -1707,7 +1754,7 @@
     @property
     def site(self):
         if not hasattr(self, "_site"):
-            self.parse()
+            self.parse_site()
         return self._site
 
     @property
@@ -1734,6 +1781,14 @@
             self.parse()
         return self._anchor
 
+    def canonical_title(self):
+        """Return full page title, including localized namespace."""
+        if self.namespace:
+            return "%s:%s" % (self.site.namespace(self.namespace),
+                              self.title)
+        else:
+            return self.title
+
     def astext(self, onsite=None):
         """Return a text representation of the link.
 
@@ -1763,7 +1818,7 @@
                                   title)
 
     def __str__(self):
-        return self.astext()
+        return self.astext().encode("ascii", "backslashreplace")
 
     def __cmp__(self, other):
         """Test for equality and inequality of Link objects.