Revision: 5233
Author: russblau
Date: 2008-04-18 18:53:17 +0000 (Fri, 18 Apr 2008)

Log Message:
-----------
use site.encoding instead of hard-coded utf-8, and try to encode before calling api.py whenever possible. Also add ThreadedGenerator class for later use.

Modified Paths:
--------------
    branches/rewrite/pywikibot/__init__.py
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/site.py

Added Paths:
-----------
    branches/rewrite/pywikibot/tools.py
Modified: branches/rewrite/pywikibot/__init__.py =================================================================== --- branches/rewrite/pywikibot/__init__.py 2008-04-18 14:47:28 UTC (rev 5232) +++ branches/rewrite/pywikibot/__init__.py 2008-04-18 18:53:17 UTC (rev 5233) @@ -62,18 +62,6 @@
from page import Page, ImagePage, Category
-##def Page(*args, **kwargs): -## from page import Page as _Page -## return _Page(*args, **kwargs) -## -##def ImagePage(*args, **kwargs): -## from page import ImagePage as _ImagePage -## return _ImagePage(*args, **kwargs) -## -##def Category(*args, **kwargs): -## from page import Category as _Category -## return _Category(*args, **kwargs) - # DEBUG
def output(text):
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-04-18 14:47:28 UTC (rev 5232) +++ branches/rewrite/pywikibot/data/api.py 2008-04-18 18:53:17 UTC (rev 5233) @@ -142,8 +142,10 @@ raise TypeError("Query format '%s' cannot be parsed." % self.params['format']) uri = self.site.scriptpath() + "/api.php" - params = urllib.urlencode([(k, v.encode("utf8")) - for (k, v) in self.params.items()]) + for key in self.params: + if isinstance(self.params[key], unicode): + self.params[key] = self.params[key].encode(self.site.encoding()) + params = urllib.urlencode(self.params) while True: # TODO catch http errors try: @@ -330,6 +332,14 @@
""" p = pywikibot.Page(self.site, pagedata['title'], pagedata['ns']) + if "pageid" in pagedata: + self._pageid = int(pagedata['pageid']) + elif "missing" in pagedata: + self._pageid = 0 # Non-existent page + else: + raise AssertionError( + "Page %s has neither 'pageid' nor 'missing' attribute" + % pagedata['title']) if 'lastrevid' in pagedata: p._revid = pagedata['lastrevid'] if 'touched' in pagedata:
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-04-18 14:47:28 UTC (rev 5232) +++ branches/rewrite/pywikibot/site.py 2008-04-18 18:53:17 UTC (rev 5233) @@ -484,6 +484,17 @@ return self._namespaces[num] return self._namespaces[num][0]
+ def page_exists(self, page): + """Return True if and only if page is an existing page on site.""" + if not hasattr(page, "_pageid"): + query = api.PropertyGenerator( + "info", inprop="protection|talkid|subjectid", + titles=page.title(withSection=False + ).encode(self.encoding())) + for item in query(): + pass #FIXME + return page._pageid > 0 + # following group of methods map more-or-less directly to API queries
def getbacklinks(self, page, followRedirects=False, filterRedirects=None, @@ -500,7 +511,7 @@ in this list.
""" - bltitle = page.title(withSection=False) + bltitle = page.title(withSection=False).encode(self.encoding()) blgen = api.PageGenerator("backlinks", gbltitle=bltitle) if namespaces is not None: blgen.request["gblnamespace"] = u"|".join(unicode(ns) @@ -523,7 +534,7 @@ in this list.
""" - eititle = page.title(withSection=False) + eititle = page.title(withSection=False).encode(self.encoding()) eigen = api.PageGenerator("embeddedin", geititle=eititle) if namespaces is not None: eigen.request["geinamespace"] = u"|".join(unicode(ns) @@ -548,8 +559,12 @@
def getlinks(self, page, namespaces=None): """Iterate internal wikilinks contained (or transcluded) on page.""" - pltitle = page.title(withSection=False) - plgen = api.PageGenerator("links", titles=pltitle) + plgen = api.PageGenerator("links") + if hasattr(page, "_pageid"): + plgen.request['pageids'] = str(page._pageid) + else: + pltitle = page.title(withSection=False).encode(self.encoding()) + plgen.request['titles'] = pltitle if namespaces is not None: plgen.request["gplnamespace"] = u"|".join(unicode(ns) for ns in namespaces) @@ -557,20 +572,24 @@
def getcategories(self, page, withSortKey=False): """Iterate categories to which page belongs.""" - # Sortkey doesn't seem to work with generator; FIXME - cltitle = page.title(withSection=False) - clgen = api.CategoryPageGenerator("categories", titles=cltitle) + # Sortkey doesn't work with generator; FIXME or deprecate + clgen = api.CategoryPageGenerator("categories") + if hasattr(page, "_pageid"): + clgen.request['pageids'] = str(page._pageid) + else: + cltitle = page.title(withSection=False).encode(self.encoding()) + clgen.request['titles'] = cltitle return clgen
def getimages(self, page): """Iterate images used (not just linked) on the page.""" - imtitle = page.title(withSection=False) + imtitle = page.title(withSection=False).encode(self.encoding()) imgen = api.ImagePageGenerator("images", titles=imtitle) return imgen
def gettemplates(self, page, namespaces=None): """Iterate templates transcluded (not just linked) on the page.""" - tltitle = page.title(withSection=False) + tltitle = page.title(withSection=False).encode(self.encoding()) tlgen = api.PageGenerator("templates", titles=tltitle) if namespaces is not None: tlgen.request["gtlnamespace"] = u"|".join(unicode(ns) @@ -593,7 +612,7 @@ raise ValueError( "Cannot get category members of non-Category page '%s'" % category.title()) - cmtitle = category.title(withSection=False) + cmtitle = category.title(withSection=False).encode(self.encoding()) cmgen = api.PageGenerator(u"categorymembers", gcmtitle=cmtitle, gcmprop="ids|title|sortkey") if namespaces is not None: @@ -626,8 +645,8 @@ if page is None and revids is None: raise ValueError( "getrevisions needs either page or revids argument.") - if page is not None: - rvtitle = page.title(withSection=False) + if revids is None: + rvtitle = page.title(withSection=False).encode(self.encoding()) rvgen = api.PropertyGenerator(u"revisions", titles=rvtitle) else: ids = u"|".join(unicode(r) for r in revids)
Added: branches/rewrite/pywikibot/tools.py =================================================================== --- branches/rewrite/pywikibot/tools.py (rev 0) +++ branches/rewrite/pywikibot/tools.py 2008-04-18 18:53:17 UTC (rev 5233) @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +"""Miscellaneous helper functions (not wiki-dependent)""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id: $' + + +import threading +import time +import Queue + + +class ThreadedGenerator(threading.Thread): + """Look-ahead generator class. + + Runs a generator in a separate thread and queues the results; can + be called like a regular generator. + + Subclasses should override self.generator, I{not} self.run + + Important: the generator thread will stop itself if the generator's + internal queue is exhausted; but, if the calling program does not use + all the generated values, it must call the generator's stop() method to + stop the background thread. Example usage: + + >>> gen = ThreadedGenerator(target=foo) + >>> try: + ... for data in gen: + ... do_work(data) + ... finally: + ... gen.stop() + + """ + + def __init__(self, group=None, target=None, name="GeneratorThread", + args=(), kwargs=None, qsize=65536): + """Constructor. Takes same keyword arguments as threading.Thread. + + target must be a generator function (or other callable that returns + an iterable object). + + @param qsize: The size of the lookahead queue. The larger the qsize, + the more values will be computed in advance of use (which can eat + up memory and processor time). 
+ @type qsize: int + + """ + if kwargs is None: + kwargs = {} + if target: + self.generator = target + if not hasattr(self, "generator"): + raise RuntimeError("No generator for ThreadedGenerator to run.") + self.args, self.kwargs = args, kwargs + threading.Thread.__init__(self, group=group, name=name) + self.queue = Queue.Queue(qsize) + self.finished = threading.Event() + + def __iter__(self): + """Iterate results from the queue.""" + if not self.isAlive() and not self.finished.isSet(): + self.start() + # if there is an item in the queue, yield it, otherwise wait + while not self.finished.isSet(): + try: + yield self.queue.get(True, 0.25) + except Queue.Empty: + pass + except KeyboardInterrupt: + self.stop() + + def stop(self): + """Stop the background thread.""" + self.finished.set() + + def run(self): + """Run the generator and store the results on the queue.""" + self.__gen = self.generator(*self.args, **self.kwargs) + for result in self.__gen: + while True: + if self.finished.isSet(): + return + try: + self.queue.put_nowait(result) + except Queue.Full: + time.sleep(0.25) + continue + break + # wait for queue to be emptied, then kill the thread + while not self.finished.isSet() and not self.queue.empty(): + time.sleep(0.25) + self.stop() + +
pywikipedia-l@lists.wikimedia.org