Revision: 5020
Author:   russblau
Date:     2008-02-14 19:21:58 +0000 (Thu, 14 Feb 2008)
Log Message:
-----------
String methods are more efficient than regexes; also minor cleanup of
docstrings, long lines, typos.
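As a rough illustration of the log message (this benchmark is not part of the
commit; the sample title and helper names are made up), the regex previously
used for title clean-up can be timed against the plain string methods that
replace it:

    # Hypothetical micro-benchmark: regex vs. string methods for the
    # underscore/whitespace clean-up changed in this revision.
    import re
    import timeit

    title = u"Foo__bar   baz_:_quux  " * 10

    def with_regex(t=title):
        return re.sub('[ _]+', ' ', t).strip()

    def with_string_methods(t=title):
        t = t.replace(u"_", u" ")
        while u"  " in t:
            t = t.replace(u"  ", u" ")
        return t.strip()

    assert with_regex() == with_string_methods()
    print(timeit.timeit(with_regex, number=10000))
    print(timeit.timeit(with_string_methods, number=10000))

On typical titles the string-method version avoids invoking the regex engine
at all, which is the efficiency the log message refers to.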
Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2008-02-14 17:29:46 UTC (rev 5019)
+++ trunk/pywikipedia/wikipedia.py	2008-02-14 19:21:58 UTC (rev 5020)
@@ -216,7 +216,9 @@
     """Page: A MediaWiki page
     Constructor has two required parameters:
-      1) The wikimedia Site on which the page resides
+      1) The wikimedia Site on which the page resides [note that, if the
+         title is in the form of an interwiki link, the Page object may
+         have a different Site than this]
       2) The title of the page as a unicode string
     Optional parameters:

@@ -313,20 +315,26 @@
         t = html2unicode(title)
         # Convert URL-encoded characters to unicode
-        # Sometimes users copy the link to a site from one to another. Try both the source site and the destination site to decode.
+        # Sometimes users copy the link to a site from one to another.
+        # Try both the source site and the destination site to decode.
         t = url2unicode(t, site = insite, site2 = site)
-        # Normalize unicode string to a NFC (composed) format to allow proper string comparisons
-        # According to http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/no...
-        # the mediawiki code normalizes everything to NFC, not NFKC (which might result in information loss).
+        # Normalize unicode string to a NFC (composed) format to allow
+        # proper string comparisons. According to
+        # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/no...
+        # the mediawiki code normalizes everything to NFC, not NFKC
+        # (which might result in information loss).
         t = unicodedata.normalize('NFC', t)
         # Clean up the name, it can come from anywhere.
         # Replace underscores by spaces, also multiple spaces and underscores with a single space
+        t = t.replace(u"_", u" ")
+        while u"  " in t:
+            t = t.replace(u"  ", u" ")
         # Strip spaces at both ends
-        t = re.sub('[ _]+', ' ', t).strip()
+        t = t.strip()
         # Remove left-to-right and right-to-left markers.
-        t = re.sub(u'\u200e|\u200f', '', t)
+        t = t.replace(u'\u200e', '').replace(u'\u200f', '')
         # leading colon implies main namespace instead of the default
         if t.startswith(':'):
             t = t[1:]
@@ -334,6 +342,10 @@
         else:
             self._namespace = defaultNamespace
+        if not t:
+            raise Error(u"Invalid title '%s'" % title )
+
+        self._namespace = defaultNamespace
         #
         # This code was adapted from Title.php : secureAndSplit()
         #
@@ -392,22 +404,23 @@
         sectionStart = t.find(u'#')
         if sectionStart >= 0:
-            self._section = t[sectionStart+1:].strip()
-            self._section = sectionencode(self._section, self.site().encoding())
-            if self._section == u'': self._section = None
-            t = t[:sectionStart].strip()
+            self._section = t[sectionStart+1 : ].strip()
+            self._section = sectionencode(self._section,
+                                          self.site().encoding())
+            if not self._section:
+                self._section = None
+            t = t[ : sectionStart].strip()
         else:
             self._section = None
-        if len(t) > 0:
+        if t:
             if not self.site().nocapitalize:
                 t = t[0].upper() + t[1:]
-        # else:
-        #     output(u"DBG>>> Strange title: %s:%s" % (site.lang, title) )
+        # reassemble the title from its parts
+
         if self._namespace != 0:
             t = self.site().namespace(self._namespace) + u':' + t
-
         if self._section:
             t += u'#' + self._section
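Taken together, the hunks above boil down to roughly this clean-up pipeline (a
simplified standalone sketch with an assumed helper name; sectionencode(),
namespace resolution and the new Error check are left out):

    # Simplified sketch of the title normalization now done with string
    # methods instead of regexes; not the actual Page.__init__ code.
    import unicodedata

    def normalize_title(t):
        t = unicodedata.normalize('NFC', t)       # NFC, as MediaWiki does
        t = t.replace(u"_", u" ")                 # underscores -> spaces
        while u"  " in t:                         # collapse repeated spaces
            t = t.replace(u"  ", u" ")
        t = t.strip()                             # strip spaces at both ends
        t = t.replace(u'\u200e', '').replace(u'\u200f', '')  # drop LRM/RLM
        if t.startswith(u':'):                    # leading colon -> main namespace
            t = t[1:].strip()
        return t

    print(normalize_title(u":Foo__bar\u200e  baz "))   # -> Foo bar baz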
@@ -1354,7 +1367,7 @@
                               token = None, gettoken = True, sysop = sysop)
         if data.find("<title>Wikimedia Error</title>") > -1:
             output(
-              u"Wikimedia has technical problems; will retry in %i minute%s."
+              u"Wikimedia has technical problems; will retry in %i minute%s."
               % (retry_delay, retry_delay != 1 and "s" or ""))
             time.sleep(60 * retry_delay)
             retry_delay *= 2
@@ -1365,6 +1378,7 @@
             # Something went wrong, and we don't know what. Show the
             # HTML code that hopefully includes some error message.
             output(u"ERROR: Unexpected response from wiki server.")
+            output(u" %s (%s) " % (response.status, response.reason))
             output(data)
             return response.status, response.reason, data
     return response.status, response.reason, data
@@ -3397,7 +3411,7 @@
     html = []
     for c in s:
         cord = ord(c)
-        if cord < 128:
+        if 31 < cord < 128:
             html.append(c)
         else:
             html.append('&#%d;'%cord)
@@ -3413,7 +3427,7 @@
     # create a list of all possible encodings for both hint sites
     encList = [site.encoding()] + list(site.encodings())
     if site2 and site2 <> site:
-        encList.append(site.encoding())
+        encList.append(site2.encoding())
         encList += list(site2.encodings())
     firstException = None
     # try to handle all encodings (will probably retry utf-8)
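The 31 < cord < 128 change above means ASCII control characters are now turned
into numeric character references instead of being copied through verbatim; a
minimal standalone sketch of that check (assumed helper name, not the real
unicode2html signature):

    # Characters outside printable ASCII, including control characters,
    # are emitted as &#NNN; references.
    def encode_non_ascii(s):
        html = []
        for c in s:
            cord = ord(c)
            if 31 < cord < 128:
                html.append(c)
            else:
                html.append('&#%d;' % cord)
        return ''.join(html)

    print(encode_non_ascii(u"a\tb\u00e9"))   # -> a&#9;b&#233;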
@@ -5177,14 +5191,15 @@
 _sites = {}
 _namespaceCache = {}

-def getSite(code = None, fam = None, user=None, persistent_http=None):
+def getSite(code=None, fam=None, user=None, persistent_http=None):
     if code == None:
         code = default_code
     if fam == None:
         fam = default_family
-    key = '%s:%s:%s:%s'%(fam,code,user,persistent_http)
+    key = '%s:%s:%s:%s' % (fam, code, user, persistent_http)
     if not _sites.has_key(key):
-        _sites[key] = Site(code=code, fam=fam, user=user, persistent_http=persistent_http)
+        _sites[key] = Site(code=code, fam=fam, user=user,
+                           persistent_http=persistent_http)
     return _sites[key]
 def setSite(site):

@@ -5224,7 +5239,6 @@
     # I don't know how non-Western Windows versions behave.
     return unicode(arg, config.console_encoding)
-
 def handleArgs():
     """Handle standard command line arguments, return the rest as a list.
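The getSite() hunk above is purely a formatting clean-up; the function still
returns one cached Site object per (family, code, user, persistent_http) key.
A hypothetical usage sketch (assumes a configured pywikipedia checkout on the
path):

    import wikipedia

    site = wikipedia.getSite('en', 'wikipedia')
    same = wikipedia.getSite('en', 'wikipedia')
    assert site is same   # second call comes from the _sites cache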
pywikipedia-l@lists.wikimedia.org