Revision: 6858
Author: nicdumz
Date: 2009-05-08 15:23:29 +0000 (Fri, 08 May 2009)
Log Message:
-----------
Upgrade the bundled BeautifulSoup to 3.1.0.1, plus the corresponding wikipedia.py fix
Modified Paths:
--------------
trunk/pywikipedia/BeautifulSoup.py
trunk/pywikipedia/wikipedia.py
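
For callers of the bundled module, the headline change in 3.1.0.1 is that rendering moves from the old __str__(encoding) entry point to explicit decode()/encode() methods, with renderContents() kept as a pre-3.1 compatibility shim (see the BeautifulSoup.py diff below). A minimal sketch of the resulting call pattern; the sample markup is invented, and the import assumes the bundled module is importable as BeautifulSoup from trunk/pywikipedia:

    # Sketch only: exercises the renamed rendering API from the diff below.
    # The markup is invented; the method names (decode, encode,
    # renderContents, prettify) are taken from the 3.1.0.1 code in this commit.
    from BeautifulSoup import BeautifulSoup

    markup = '<html><body><p>Hello &amp; goodbye</p></body></html>'
    soup = BeautifulSoup(markup)

    as_unicode = soup.decode()       # Unicode rendering (replaces __str__(None))
    as_bytes = soup.encode('utf-8')  # byte string in an explicit encoding
    legacy = soup.renderContents()   # pre-3.1 name, kept as a compatibility shim
    print soup.prettify()
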
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py 2009-05-08 07:52:49 UTC (rev 6857)
+++ trunk/pywikipedia/BeautifulSoup.py 2009-05-08 15:23:29 UTC (rev 6858)
@@ -42,7 +42,7 @@
Here, have some legalese:
-Copyright (c) 2004-2007, Leonard Richardson
+Copyright (c) 2004-2009, Leonard Richardson
All rights reserved.
@@ -79,27 +79,38 @@
from __future__ import generators
__author__ = "Leonard Richardson (leonardr(a)segfault.org)"
-__version__ = "3.0.6"
-__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
+__version__ = "3.1.0.1"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
__license__ = "New-style BSD"
-from sgmllib import SGMLParser, SGMLParseError
import codecs
+import markupbase
import types
import re
-import sgmllib
+from HTMLParser import HTMLParser, HTMLParseError
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
-#This hack makes Beautiful Soup able to parse XML with namespaces
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+#These hacks make Beautiful Soup able to parse XML with namespaces
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
# First, the classes that represent markup elements.
+def sob(unicode, encoding):
+ """Returns either the given Unicode string or its encoding."""
+ if encoding is None:
+ return unicode
+ else:
+ return unicode.encode(encoding)
+
class PageElement:
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -391,8 +402,20 @@
class NavigableString(unicode, PageElement):
+ def __new__(cls, value):
+ """Create a new NavigableString.
+
+ When unpickling a NavigableString, this method is called with
+ the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+ passed in to the superclass's __new__ or the superclass won't know
+ how to handle non-ASCII characters.
+ """
+ if isinstance(value, unicode):
+ return unicode.__new__(cls, value)
+ return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
def __getnewargs__(self):
- return (NavigableString.__str__(self),)
+ return (unicode(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -403,34 +426,32 @@
else:
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
- def __unicode__(self):
- return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.decode().encode(encoding)
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- if encoding:
- return self.encode(encoding)
- else:
- return self
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return self
class CData(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<![CDATA[' + self + u']]>'
class ProcessingInstruction(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+
+ def decodeGivenEventualEncoding(self, eventualEncoding):
output = self
- if "%SOUP-ENCODING%" in output:
- output = self.substituteEncoding(output, encoding)
- return "<?%s?>" % self.toEncoding(output, encoding)
+ if u'%SOUP-ENCODING%' in output:
+ output = self.substituteEncoding(output, eventualEncoding)
+ return u'<?' + output + u'?>'
class Comment(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<!--%s-->" % NavigableString.__str__(self, encoding)
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!--' + self + u'-->'
class Declaration(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<!%s>" % NavigableString.__str__(self, encoding)
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!' + self + u'>'
class Tag(PageElement):
@@ -496,11 +517,13 @@
self.convertXMLEntities = parser.convertXMLEntities
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
- # Convert any HTML, XML, or numeric entities in the attribute values.
- convert = lambda(k, val): (k,
- re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
- self._convertEntities,
- val))
+ def convert(kval):
+ "Converts HTML, XML and numeric entities in the attribute value."
+ k, val = kval
+ if val is None:
+ return kval
+ return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities, val))
self.attrs = map(convert, self.attrs)
def get(self, key, default=None):
@@ -591,11 +614,8 @@
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
"""Renders this tag as a string."""
- return self.__str__(encoding)
+ return self.decode(eventualEncoding=encoding)
- def __unicode__(self):
- return self.__str__(None)
-
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ ")")
@@ -605,24 +625,30 @@
appropriate XML entity for an XML special character."""
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
- prettyPrint=False, indentLevel=0):
- """Returns a string or Unicode representation of this tag and
- its contents. To get Unicode, pass None for encoding.
+ def __unicode__(self):
+ return self.decode()
- NOTE: since Python's HTML parser consumes whitespace, this
- method is not certain to reproduce the whitespace present in
- the original string."""
+ def __str__(self):
+ return self.encode()
- encodedName = self.toEncoding(self.name, encoding)
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+ def decode(self, prettyPrint=False, indentLevel=0,
+ eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+ """Returns a string or Unicode representation of this tag and
+ its contents. To get Unicode, pass None for encoding."""
+
attrs = []
if self.attrs:
for key, val in self.attrs:
fmt = '%s="%s"'
if isString(val):
- if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
- val = self.substituteEncoding(val, encoding)
+ if (self.containsSubstitutions
+ and eventualEncoding is not None
+ and '%SOUP-ENCODING%' in val):
+ val = self.substituteEncoding(val, eventualEncoding)
# The attribute value either:
#
@@ -651,22 +677,26 @@
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
-
- attrs.append(fmt % (self.toEncoding(key, encoding),
- self.toEncoding(val, encoding)))
+ if val is None:
+ # Handle boolean attributes.
+ decoded = key
+ else:
+ decoded = fmt % (key, val)
+ attrs.append(decoded)
close = ''
closeTag = ''
if self.isSelfClosing:
close = ' /'
else:
- closeTag = '</%s>' % encodedName
+ closeTag = '</%s>' % self.name
indentTag, indentContents = 0, 0
if prettyPrint:
indentTag = indentLevel
space = (' ' * (indentTag-1))
indentContents = indentTag + 1
- contents = self.renderContents(encoding, prettyPrint, indentContents)
+ contents = self.decodeContents(prettyPrint, indentContents,
+ eventualEncoding)
if self.hidden:
s = contents
else:
@@ -676,7 +706,7 @@
attributeString = ' ' + ' '.join(attrs)
if prettyPrint:
s.append(space)
- s.append('<%s%s%s>' % (encodedName, attributeString, close))
+ s.append('<%s%s%s>' % (self.name, attributeString, close))
if prettyPrint:
s.append("\n")
s.append(contents)
@@ -701,19 +731,23 @@
self.extract()
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.__str__(encoding, True)
+ return self.encode(encoding, True)
- def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
+ return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
+
+ def decodeContents(self, prettyPrint=False, indentLevel=0,
+ eventualEncoding=DEFAULT_OUTPUT_ENCODING):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string."""
s=[]
for c in self:
text = None
if isinstance(c, NavigableString):
- text = c.__str__(encoding)
+ text = c.decodeGivenEventualEncoding(eventualEncoding)
elif isinstance(c, Tag):
- s.append(c.__str__(encoding, prettyPrint, indentLevel))
+ s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
if text and prettyPrint:
text = text.strip()
if text:
@@ -754,7 +788,7 @@
return self._findAll(name, attrs, text, limit, generator, **kwargs)
findChildren = findAll
- # Pre-3.x compatibility methods
+ # Pre-3.x compatibility methods. Will go away in 4.0.
first = find
fetch = findAll
@@ -764,6 +798,15 @@
def firstText(self, text=None, recursive=True):
return self.find(text=text, recursive=recursive)
+ # 3.x compatibility methods. Will go away in 4.0.
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ if encoding is None:
+ return self.decodeContents(prettyPrint, indentLevel, encoding)
+ else:
+ return self.encodeContents(encoding, prettyPrint, indentLevel)
+
+
#Private methods
def _getAttrMap(self):
@@ -776,26 +819,24 @@
return self.attrMap
#Generator methods
+ def recursiveChildGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ stopNode = self._lastRecursiveChild().next
+ current = self.contents[0]
+ while current is not stopNode:
+ yield current
+ current = current.next
+
def childGenerator(self):
- for i in range(0, len(self.contents)):
- yield self.contents[i]
+ if not len(self.contents):
+ raise StopIteration
+ current = self.contents[0]
+ while current:
+ yield current
+ current = current.nextSibling
raise StopIteration
- def recursiveChildGenerator(self):
- stack = [(self, 0)]
- while stack:
- tag, start = stack.pop()
- if isinstance(tag, Tag):
- for i in range(start, len(tag.contents)):
- a = tag.contents[i]
- yield a
- if isinstance(a, Tag) and tag.contents:
- if i < len(tag.contents) - 1:
- stack.append((tag, i+1))
- stack.append((a, 0))
- break
- raise StopIteration
-
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
"""Encapsulates a number of ways of matching a markup element (tag or
@@ -896,13 +937,14 @@
#other ways of matching match the tag name as a string.
if isinstance(markup, Tag):
markup = markup.name
- if markup and not isString(markup):
+ if markup is not None and not isString(markup):
markup = unicode(markup)
#Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif isList(matchAgainst):
+ elif (isList(matchAgainst)
+ and (markup is not None or not isString(matchAgainst))):
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst)
@@ -928,8 +970,8 @@
def isList(l):
"""Convenience method that works with all 2.x versions of Python
to determine whether or not something is listlike."""
- return hasattr(l, '__iter__') \
- or (type(l) in (types.ListType, types.TupleType))
+ return ((hasattr(l, '__iter__') and not isString(l))
+ or (type(l) in (types.ListType, types.TupleType)))
def isString(s):
"""Convenience method that works with all 2.x versions of Python
@@ -949,7 +991,7 @@
#It's a map. Merge it.
for k,v in portion.items():
built[k] = v
- elif isList(portion):
+ elif isList(portion) and not isString(portion):
#It's a list. Map each item to the default.
for k in portion:
built[k] = default
@@ -960,8 +1002,123 @@
# Now, the parser classes.
-class BeautifulStoneSoup(Tag, SGMLParser):
+class HTMLParserBuilder(HTMLParser):
+ def __init__(self, soup):
+ HTMLParser.__init__(self)
+ self.soup = soup
+
+ # We inherit feed() and reset().
+
+ def handle_starttag(self, name, attrs):
+ if name == 'meta':
+ self.soup.extractCharsetFromMeta(attrs)
+ else:
+ self.soup.unknown_starttag(name, attrs)
+
+ def handle_endtag(self, name):
+ self.soup.unknown_endtag(name)
+
+ def handle_data(self, content):
+ self.soup.handle_data(content)
+
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.soup.endData()
+ self.handle_data(text)
+ self.soup.endData(subclass)
+
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.soup.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '&#%s;' % ref
+ self.handle_data(data)
+
+ def handle_entityref(self, ref):
+ """Handle entity references as data, possibly converting known
+ HTML and/or XML entity references to the corresponding Unicode
+ characters."""
+ data = None
+ if self.soup.convertHTMLEntities:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ pass
+
+ if not data and self.soup.convertXMLEntities:
+ data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.soup.convertHTMLEntities and \
+ not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&%s" % ref
+ if not data:
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
+ self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as a CData object."""
+ j = None
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
+ else:
+ try:
+ j = HTMLParser.parse_declaration(self, i)
+ except HTMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
+
+class BeautifulStoneSoup(Tag):
+
"""This class contains the basic parser and search code. It defines
a parser that knows nothing about tag behavior except for the
following:
@@ -982,6 +1139,7 @@
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}
+ PRESERVE_WHITESPACE_TAGS = []
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
lambda x: x.group(1) + ' />'),
@@ -1005,14 +1163,15 @@
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
- convertEntities=None, selfClosingTags=None):
+ convertEntities=None, selfClosingTags=None, isHTML=False,
+ builder=HTMLParserBuilder):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser.
- sgmllib will process most bad HTML, and the BeautifulSoup
+ HTMLParser will process most bad HTML, and the BeautifulSoup
class has some tricks for dealing with some HTML that kills
- sgmllib, but Beautiful Soup can nonetheless choke or lose data
+ HTMLParser, but Beautiful Soup can nonetheless choke or lose data
if your data uses self-closing tags or declarations
incorrectly.
@@ -1022,7 +1181,7 @@
you'll get better performance.
The default parser massage techniques fix the two most common
- instances of invalid HTML that choke sgmllib:
+ instances of invalid HTML that choke HTMLParser:
<br/> (No space between name of closing tag and tag close)
<! --Comment--> (Extraneous whitespace in declaration)
@@ -1060,29 +1219,21 @@
self.escapeUnrecognizedEntities = False
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
- SGMLParser.__init__(self)
+ self.builder = builder(self)
+ self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
self.markup = markup
self.markupMassage = markupMassage
try:
- self._feed()
+ self._feed(isHTML=isHTML)
except StopParsing:
pass
- self.markup = None # The markup can now be GCed
+ self.markup = None # The markup can now be GCed.
+ self.builder = None # So can the builder.
- def convert_charref(self, name):
- """This method fixes a bug in Python's SGMLParser."""
- try:
- n = int(name)
- except ValueError:
- return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
- return
- return self.convert_codepoint(n)
-
- def _feed(self, inDocumentEncoding=None):
+ def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
if isinstance(markup, unicode):
@@ -1091,9 +1242,10 @@
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo)
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
+ self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
if markup:
if self.markupMassage:
if not isList(self.markupMassage):
@@ -1106,27 +1258,14 @@
# was relying on the existence of markupMassage, this
# might cause problems.
del(self.markupMassage)
- self.reset()
+ self.builder.reset()
- SGMLParser.feed(self, markup)
+ self.builder.feed(markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
- def __getattr__(self, methodName):
- """This method routes method call requests to either the SGMLParser
- superclass or the Tag superclass, depending on the method name."""
- #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
-
- if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
- or methodName.find('do_') == 0:
- return SGMLParser.__getattr__(self, methodName)
- elif methodName.find('__') != 0:
- return Tag.__getattr__(self, methodName)
- else:
- raise AttributeError
-
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
@@ -1136,7 +1275,7 @@
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
self.hidden = 1
- SGMLParser.reset(self)
+ self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
@@ -1166,8 +1305,10 @@
def endData(self, containerClass=NavigableString):
if self.currentData:
- currentData = ''.join(self.currentData)
- if not currentData.translate(self.STRIP_ASCII_SPACES):
+ currentData = u''.join(self.currentData)
+ if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+ not set([tag.name for tag in self.tagStack]).intersection(
+ self.PRESERVE_WHITESPACE_TAGS)):
if '\n' in currentData:
currentData = '\n'
else:
@@ -1299,100 +1440,10 @@
def handle_data(self, data):
self.currentData.append(data)
- def _toStringSubclass(self, text, subclass):
- """Adds a certain piece of text to the tree as a NavigableString
- subclass."""
- self.endData()
- self.handle_data(text)
- self.endData(subclass)
+ def extractCharsetFromMeta(self, attrs):
+ self.unknown_starttag('meta', attrs)
- def handle_pi(self, text):
- """Handle a processing instruction as a ProcessingInstruction
- object, possibly one with a %SOUP-ENCODING% slot into which an
- encoding will be plugged later."""
- if text[:3] == "xml":
- text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
- self._toStringSubclass(text, ProcessingInstruction)
- def handle_comment(self, text):
- "Handle comments as Comment objects."
- self._toStringSubclass(text, Comment)
-
- def handle_charref(self, ref):
- "Handle character references as data."
- if self.convertEntities:
- data = unichr(int(ref))
- else:
- data = '&#%s;' % ref
- self.handle_data(data)
-
- def handle_entityref(self, ref):
- """Handle entity references as data, possibly converting known
- HTML and/or XML entity references to the corresponding Unicode
- characters."""
- data = None
- if self.convertHTMLEntities:
- try:
- data = unichr(name2codepoint[ref])
- except KeyError:
- pass
-
- if not data and self.convertXMLEntities:
- data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
-
- if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
- # TODO: We've got a problem here. We're told this is
- # an entity reference, but it's not an XML entity
- # reference or an HTML entity reference. Nonetheless,
- # the logical thing to do is to pass it through as an
- # unrecognized entity reference.
- #
- # Except: when the input is "&carol;" this function
- # will be called with input "carol". When the input is
- # "AT&T", this function will be called with input
- # "T". We have no way of knowing whether a semicolon
- # was present originally, so we don't know whether
- # this is an unknown entity or just a misplaced
- # ampersand.
- #
- # The more common case is a misplaced ampersand, so I
- # escape the ampersand and omit the trailing semicolon.
- data = "&%s" % ref
- if not data:
- # This case is different from the one above, because we
- # haven't already gone through a supposedly comprehensive
- # mapping of entities to Unicode characters. We might not
- # have gone through any mapping at all. So the chances are
- # very high that this is a real entity, and not a
- # misplaced ampersand.
- data = "&%s;" % ref
- self.handle_data(data)
-
- def handle_decl(self, data):
- "Handle DOCTYPEs and the like as Declaration objects."
- self._toStringSubclass(data, Declaration)
-
- def parse_declaration(self, i):
- """Treat a bogus SGML declaration as raw data. Treat a CDATA
- declaration as a CData object."""
- j = None
- if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
- else:
- try:
- j = SGMLParser.parse_declaration(self, i)
- except SGMLParseError:
- toHandle = self.rawdata[i:]
- self.handle_data(toHandle)
- j = i + len(toHandle)
- return j
-
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1444,12 +1495,15 @@
def __init__(self, *args, **kwargs):
if not kwargs.has_key('smartQuotesTo'):
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+ kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
QUOTE_TAGS = {'script' : None, 'textarea' : None}
#According to the HTML standard, each of these inline tags can
@@ -1494,9 +1548,9 @@
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
# Used to detect the charset in a META tag; see start_meta
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
- def start_meta(self, attrs):
+ def extractCharsetFromMeta(self, attrs):
"""Beautiful Soup can detect a charset included in a META tag,
try to convert the document to that charset, and re-parse the
document from the beginning."""
@@ -1517,29 +1571,33 @@
if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
- if getattr(self, 'declaredHTMLEncoding') or \
- (self.originalEncoding == self.fromEncoding):
- # This is our second pass through the document, or
- # else an encoding was specified explicitly and it
- # worked. Rewrite the meta tag.
- newAttr = self.CHARSET_RE.sub\
- (lambda(match):match.group(1) +
- "%SOUP-ENCODING%", contentType)
+ if (self.declaredHTMLEncoding is not None or
+ self.originalEncoding == self.fromEncoding):
+ # An HTML encoding was sniffed while converting
+ # the document to Unicode, or an HTML encoding was
+ # sniffed during a previous pass through the
+ # document, or an encoding was specified
+ # explicitly and it worked. Rewrite the meta tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ newAttr = self.CHARSET_RE.sub(rewrite, contentType)
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
newAttr)
tagNeedsEncodingSubstitution = True
else:
# This is our first pass through the document.
- # Go through it again with the new information.
+ # Go through it again with the encoding information.
newCharset = match.group(3)
if newCharset and newCharset != self.originalEncoding:
self.declaredHTMLEncoding = newCharset
self._feed(self.declaredHTMLEncoding)
raise StopParsing
+ pass
tag = self.unknown_starttag("meta", attrs)
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
+
class StopParsing(Exception):
pass
@@ -1687,9 +1745,10 @@
"x-sjis" : "shift-jis" }
def __init__(self, markup, overrideEncodings=[],
- smartQuotesTo='xml'):
+ smartQuotesTo='xml', isHTML=False):
+ self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1715,18 +1774,22 @@
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convertFrom(proposed_encoding)
if u: break
+
self.unicode = u
if not u: self.originalEncoding = None
- def _subMSChar(self, orig):
+ def _subMSChar(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity."""
+ orig = match.group(1)
sub = self.MS_CHARS.get(orig)
if type(sub) == types.TupleType:
if self.smartQuotesTo == 'xml':
- sub = '&#x%s;' % sub[1]
+ sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
- sub = '&%s;' % sub[0]
+ sub = '&'.encode() + sub[0].encode() + ';'.encode()
+ else:
+ sub = sub.encode()
return sub
def _convertFrom(self, proposed):
@@ -1741,9 +1804,9 @@
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- markup = re.compile("([\x80-\x9f])").sub \
- (lambda(x): self._subMSChar(x.group(1)),
- markup)
+ smart_quotes_re = "([\x80-\x9f])"
+ smart_quotes_compiled = re.compile(smart_quotes_re)
+ markup = smart_quotes_compiled.sub(self._subMSChar, markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1782,7 +1845,7 @@
newdata = unicode(data, encoding)
return newdata
- def _detectEncoding(self, xml_data):
+ def _detectEncoding(self, xml_data, isHTML=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
@@ -1830,13 +1893,19 @@
else:
sniffed_xml_encoding = 'ascii'
pass
- xml_encoding_match = re.compile \
- ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
- .match(xml_data)
except:
xml_encoding_match = None
- if xml_encoding_match:
- xml_encoding = xml_encoding_match.groups()[0].lower()
+ xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
+ xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+ if not xml_encoding_match and isHTML:
+ meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+ regexp = re.compile(meta_re, re.I)
+ xml_encoding_match = regexp.search(xml_data)
+ if xml_encoding_match is not None:
+ xml_encoding = xml_encoding_match.groups()[0].decode(
+ 'ascii').lower()
+ if isHTML:
+ self.declaredHTMLEncoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
@@ -1927,5 +1996,5 @@
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
- soup = BeautifulSoup(sys.stdin.read())
+ soup = BeautifulSoup(sys.stdin)
print soup.prettify()
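
One more behavioural detail from the hunks above: UnicodeDammit now accepts an isHTML flag, and a charset sniffed from a META tag is exposed on the soup as declaredHTMLEncoding. A minimal sketch, assuming an invented Latin-1 document; the attribute names come from the diff, and the values noted in the comments are only what the new code should typically report:

    # Sketch only: the isHTML / declaredHTMLEncoding plumbing from the hunks above.
    # The markup and its encoding are invented for illustration.
    from BeautifulSoup import BeautifulSoup

    markup = ('<html><head><meta http-equiv="Content-Type" '
              'content="text/html; charset=iso-8859-1"></head>'
              '<body>caf\xe9</body></html>')
    soup = BeautifulSoup(markup)      # BeautifulSoup passes isHTML=True itself

    print soup.originalEncoding       # typically 'iso-8859-1': what the bytes were decoded from
    print soup.declaredHTMLEncoding   # typically 'iso-8859-1': sniffed from the META tag
    print repr(soup.body.decode())    # the body rendered back as a Unicode string
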
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-05-08 07:52:49 UTC (rev 6857)
+++ trunk/pywikipedia/wikipedia.py 2009-05-08 15:23:29 UTC (rev 6858)
@@ -4995,7 +4995,7 @@
else:
tree = BeautifulStoneSoup(xml)
self._mediawiki_messages = _dict([(tag.get('name').lower(), html2unicode(tag.string))
- for tag in tree.findAll('message')])
+ for tag in tree.findAll('message') if tag.string])
if not self._mediawiki_messages:
# No messages could be added.
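
The guard added above skips message tags whose .string is None (an empty <message/> element, for instance), so only real message text reaches html2unicode. A self-contained sketch of the guarded comprehension; the XML sample is invented, and plain dict()/unicode() stand in for wikipedia.py's _dict and html2unicode helpers:

    # Sketch only: the guarded comprehension added to wikipedia.py above.
    # The XML sample is invented; dict() and unicode() stand in for the
    # real _dict() and html2unicode() helpers used in wikipedia.py.
    from BeautifulSoup import BeautifulStoneSoup

    xml = ('<messages>'
           '<message name="aboutsite">About {{SITENAME}}</message>'
           '<message name="emptymsg"></message>'
           '</messages>')
    tree = BeautifulStoneSoup(xml)
    messages = dict([(tag.get('name').lower(), unicode(tag.string))
                     for tag in tree.findAll('message') if tag.string])
    print messages    # the empty message is skipped; only 'aboutsite' remains
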