Revision: 6136
Author: russblau
Date: 2008-12-10 18:56:59 +0000 (Wed, 10 Dec 2008)
Log Message:
-----------
Handling of command-line arguments
Modified Paths:
--------------
branches/rewrite/pywikibot/__init__.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
Added Paths:
-----------
branches/rewrite/pywikibot/bot.py
branches/rewrite/pywikibot/pagegenerators.py
Modified: branches/rewrite/pywikibot/__init__.py
===================================================================
--- branches/rewrite/pywikibot/__init__.py 2008-12-09 22:38:39 UTC (rev 6135)
+++ branches/rewrite/pywikibot/__init__.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -11,10 +11,12 @@
import sys
import logging
+import re
from exceptions import *
import config2 as config
import textlib
+from bot import handleArgs, showHelp
def deprecate_arg(old_arg, new_arg):
@@ -97,10 +99,13 @@
from page import Page, ImagePage, Category, Link
+link_regex = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+
+
# User interface functions (kept extremely simple for debugging)
-def output(text):
- print text
+def output(text, toStdout=False):
+ print text.encode(config.console_encoding, "xmlcharrefreplace")
def input(prompt, password=False):
if isinstance(prompt, unicode):
Added: branches/rewrite/pywikibot/bot.py
===================================================================
--- branches/rewrite/pywikibot/bot.py (rev 0)
+++ branches/rewrite/pywikibot/bot.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+User-interface related functions for building bots
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+# Note: the intention is to develop this module (at some point) into a Bot
+# class definition that can be subclassed to create new, functional bot
+# scripts, instead of writing each one from scratch.
+
+
+import os.path
+import sys
+import pywikibot
+from pywikibot import config2 as config
+
+
+def calledModuleName():
+ """Return the name of the module calling this function.
+
+ This is required because the -help option loads the module's docstring
+ and because the module name will be used for the filename of the log.
+
+ """
+ # get commandline arguments
+ called = sys.argv[0].strip()
+ if ".py" in called: # could end with .pyc, .pyw, etc. on some platforms
+ called = called[ : called.rindex(".py")]
+ return os.path.basename(called)
+
+
+def _decodeArg(arg):
+ if sys.platform=='win32':
+ if config.console_encoding == 'cp850':
+ # Western Windows versions give parameters encoded as windows-1252
+ # even though the console encoding is cp850.
+ return unicode(arg, 'windows-1252')
+ elif config.console_encoding == 'cp852':
+ # Central/Eastern European Windows versions give parameters encoded
+ # as windows-1250 even though the console encoding is cp852.
+ return unicode(arg, 'windows-1250')
+ else:
+ return unicode(arg, config.console_encoding)
+ else:
+ # Linux uses the same encoding for both.
+ # I don't know how non-Western Windows versions behave.
+ return unicode(arg, config.console_encoding)
+
+
+def handleArgs(*args):
+ """Handle standard command line arguments, return the rest as a list.
+
+ Takes the commandline arguments, converts them to Unicode, processes all
+ global parameters such as -lang or -log. Returns a list of all arguments
+ that are not global. This makes sure that global arguments are applied
+ first, regardless of the order in which the arguments were given.
+
+ args may be passed as an argument, thereby overriding sys.argv
+
+ """
+ global verbose
+ # get commandline arguments if necessary
+ if not args:
+ args = sys.argv[1:]
+ # get the name of the module calling this function. This is
+ # required because the -help option loads the module's docstring and because
+ # the module name will be used for the filename of the log.
+ moduleName = calledModuleName()
+ nonGlobalArgs = []
+ for arg in args:
+ arg = _decodeArg(arg)
+ if arg == '-help':
+ showHelp(moduleName)
+ sys.exit(0)
+ elif arg.startswith('-family:'):
+ config.family = arg[8:]
+ elif arg.startswith('-lang:'):
+ config.code = arg[6:]
+ elif arg.startswith('-putthrottle:'):
+ config.put_throttle = int(arg[13:])
+ elif arg.startswith('-pt:'):
+ config.put_throttle = int(arg[4:])
+ elif arg == '-log':
+ setLogfileStatus(True) #FIXME
+ elif arg.startswith('-log:'):
+ setLogfileStatus(True, arg[5:]) #FIXME
+ elif arg == '-nolog':
+ setLogfileStatus(False) #FIXME
+ elif arg == '-verbose' or arg == "-v":
+ pywikibot.output(u'Pywikipediabot %s' % (version.getversion()))
+ pywikibot.output(u'Python %s' % (sys.version))
+ verbose += 1 # FIXME
+ elif arg == '-daemonize':
+ import daemonize
+ daemonize.daemonize()
+ elif arg.startswith('-daemonize:'):
+ import daemonize
+ daemonize.daemonize(redirect_std = arg[11:])
+ else:
+ # the argument is not global. Let the specific bot script care
+ # about it.
+ nonGlobalArgs.append(arg)
+ return nonGlobalArgs
+
+
+def showHelp():
+ moduleName = calledModuleName()
+ globalHelp =u'''\
+Global arguments available for all bots:
+
+-dir:PATH Read the bot's configuration data from directory given by
+ PATH, instead of from the default directory.
+
+-lang:xx Set the language of the wiki you want to work on, overriding
+ the configuration in user-config.py. xx should be the
+ language code.
+
+-family:xyz Set the family of the wiki you want to work on, e.g.
+ wikipedia, wiktionary, wikitravel, ...
+ This will override the configuration in user-config.py.
+
+-daemonize:xyz Immediately returns control to the terminal and redirects
+ stdout and stderr to xyz (only use for bots that require
+ no input from stdin).
+
+-help Shows this help text.
+
+-log Enable the logfile. Logs will be stored in the logs
+ subdirectory.
+
+-log:xyz Enable the logfile, using xyz as the filename.
+
+-nolog Disable the logfile (if it is enabled by default).
+
+-putthrottle:n Set the minimum time (in seconds) the bot will wait between
+-pt:n saving pages.
+
+-verbose Have the bot provide additional output that may be useful in
+-v debugging.
+'''
+ try:
+ exec('import %s as module' % moduleName)
+ helpText = module.__doc__.decode('utf-8')
+ if hasattr(module, 'docuReplacements'):
+ for key, value in module.docuReplacements.iteritems():
+ helpText = helpText.replace(key, value.strip('\n\r'))
+ pywikibot.output(helpText)
+ except:
+ pywikibot.output(u'Sorry, no help available for %s' % moduleName)
+ logging.exception('showHelp:')
+ pywikibot.output(globalHelp)
+
+
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-09 22:38:39 UTC (rev 6135)
+++ branches/rewrite/pywikibot/page.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -12,6 +12,7 @@
import pywikibot
from pywikibot import deprecate_arg
from pywikibot import config
+import pywikibot.site
import pywikibot.textlib
import htmlentitydefs
@@ -1325,6 +1326,8 @@
titleWithSortKey = self.title(withSection=False)
return '[[%s]]' % titleWithSortKey
+ @deprecate_arg("startFrom", None)
+ @deprecate_arg("cacheResults", None)
def subcategories(self, recurse=False):
"""Iterate all subcategories of the current category.
@@ -1353,7 +1356,8 @@
for item in subcat.subcategories(recurse):
yield item
- def articles(self, recurse=False, startFrom=None):
+ @deprecate_arg("startFrom", None)
+ def articles(self, recurse=False):
"""
Yields all articles in the current category.
@@ -1367,7 +1371,7 @@
namespaces = [x for x in self.site().namespaces().keys()
if x>=0 and x!=14]
for member in self.site().categorymembers(self,
- namespaces=namespaces):
+ namespaces=namespaces):
yield member
if recurse:
if not isinstance(recurse, bool) and recurse:
Added: branches/rewrite/pywikibot/pagegenerators.py
===================================================================
--- branches/rewrite/pywikibot/pagegenerators.py (rev 0)
+++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -0,0 +1,965 @@
+# -*- coding: utf-8 -*-
+"""This module offers a wide variety of page generators. A page generator is an
+object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and
+that yields page objects on which other scripts can then work.
+
+In general, there is no need to run this script directly. It can, however,
+be run for testing purposes. It will then print the page titles to standard
+output.
+
+These parameters are supported to specify which pages titles to print:
+
+&params;
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+import pywikibot
+
+import itertools
+import Queue
+import re
+import sys
+import threading
+
+
+# ported from version 1 for backwards-compatibility
+# most of these functions just wrap a Site or Page method that returns
+# a generator
+
+parameterHelp = """\
+-cat Work on all pages which are in a specific category.
+ Argument can also be given as "-cat:categoryname" or
+ as "-cat:categoryname|fromtitle".
+
+-catr Like -cat, but also recursively includes pages in
+ subcategories, sub-subcategories etc. of the
+ given category.
+ Argument can also be given as "-catr:categoryname" or
+ as "-catr:categoryname|fromtitle".
+
+-subcats Work on all subcategories of a specific category.
+ Argument can also be given as "-subcats:categoryname" or
+ as "-subcats:categoryname|fromtitle".
+
+-subcatsr Like -subcats, but also includes sub-subcategories etc. of
+ the given category.
+ Argument can also be given as "-subcatsr:categoryname" or
+ as "-subcatsr:categoryname|fromtitle".
+
+-uncat Work on all pages which are not categorised.
+
+-uncatcat Work on all categories which are not categorised.
+
+-uncatfiles Work on all files which are not categorised.
+
+-file Read a list of pages to treat from the named text file.
+ Page titles in the file must be enclosed with [[brackets]].
+ Argument can also be given as "-file:filename".
+
+-filelinks Work on all pages that use a certain image/media file.
+ Argument can also be given as "-filelinks:filename".
+
+-yahoo Work on all pages that are found in a Yahoo search.
+ Depends on python module pYsearch. See yahoo_appid in
+ config.py for instructions.
+
+-search Work on all pages that are found in a MediaWiki search
+ across all namespaces.
+
+-google Work on all pages that are found in a Google search.
+ You need a Google Web API license key. Note that Google
+ doesn't give out license keys anymore. See google_key in
+ config.py for instructions.
+ Argument can also be given as "-google:searchstring".
+
+-interwiki Work on the given page and all equivalent pages in other
+ languages. This can, for example, be used to fight
+ multi-site spamming.
+ Attention: this will cause the bot to modify
+ pages on several wiki sites, this is not well tested,
+ so check your edits!
+
+-links Work on all pages that are linked from a certain page.
+ Argument can also be given as "-links:linkingpagetitle".
+
+-new Work on the 60 newest pages. If given as -new:x, will work
+ on the x newest pages.
+
+-imagelinks Work on all images that are linked from a certain page.
+                  Argument can also be given as "-imagelinks:linkingpagetitle".
+
+-newimages Work on the 100 newest images. If given as -newimages:x,
+ will work on the x newest images.
+
+-ref Work on all pages that link to a certain page.
+ Argument can also be given as "-ref:referredpagetitle".
+
+-start Specifies that the robot should go alphabetically through
+ all pages on the home wiki, starting at the named page.
+ Argument can also be given as "-start:pagetitle".
+
+ You can also include a namespace. For example,
+ "-start:Template:!" will make the bot work on all pages
+ in the template namespace.
+
+-prefixindex Work on pages commencing with a common prefix.
+
+-regex Obsolete, use -titleregex
+
+-titleregex Work on titles that match the given regular expression.
+
+-transcludes Work on all pages that use a certain template.
+ Argument can also be given as "-transcludes:Template:Title".
+
+-unusedfiles Work on all description pages of images/media files that are
+ not used anywhere.
+ Argument can be given as "-unusedfiles:n" where
+ n is the maximum number of articles to work on.
+
+-unwatched Work on all articles that are not watched by anyone.
+ Argument can be given as "-unwatched:n" where
+ n is the maximum number of articles to work on.
+
+-usercontribs Work on all articles that were edited by a certain user :
+ Example : -usercontribs:DumZiBoT
+
+-weblink Work on all articles that contain an external link to
+ a given URL; may be given as "-weblink:url"
+
+-withoutinterwiki Work on all pages that don't have interlanguage links.
+ Argument can be given as "-withoutinterwiki:n" where
+ n is some number (??).
+"""
+
+docuReplacements = {'&params;': parameterHelp}
+
+# if a bot uses GeneratorFactory, the module should include the line
+# docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
+# and include the marker &params; in the module's docstring
+
+
+class GeneratorFactory(object):
+    """Process command line arguments and return appropriate page generator."""
+
+ def setCategoryGen(self, arg, length, recurse = False):
+ if len(arg) == length:
+ categoryname = pywikibot.input(u'Please enter the category name:')
+ else:
+ categoryname = arg[length + 1:]
+
+ ind = categoryname.find('|')
+ if ind > 0:
+ startfrom = categoryname[ind + 1:]
+ categoryname = categoryname[:ind]
+ else:
+ startfrom = None
+
+ cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
+ return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse)
+
+ def setSubCategoriesGen(self, arg, length, recurse=False):
+ if len(arg) == length:
+ categoryname = pywikibot.input(u'Please enter the category name:')
+ else:
+ categoryname = arg[length + 1:]
+
+ ind = categoryname.find('|')
+ if ind > 0:
+ startfrom = categoryname[ind + 1:]
+ categoryname = categoryname[:ind]
+ else:
+ startfrom = None
+
+ cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
+ return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse)
+
+ def handleArg(self, arg):
+ gen = None
+ if arg.startswith('-filelinks'):
+ fileLinksPageTitle = arg[11:]
+ if not fileLinksPageTitle:
+ fileLinksPageTitle = pywikibot.input(
+ u'Links to which image page should be processed?')
+ if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6)
+ + ":"):
+ fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
+ fileLinksPageTitle)
+ else:
+ fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
+ 'Image:' +
+ fileLinksPageTitle)
+ gen = FileLinksGenerator(fileLinksPage)
+ elif arg.startswith('-unusedfiles'):
+ if len(arg) == 12:
+ gen = UnusedFilesGenerator()
+ else:
+ gen = UnusedFilesGenerator(number = int(arg[13:]))
+ elif arg.startswith('-unwatched'):
+ if len(arg) == 10:
+ gen = UnwatchedPagesPageGenerator()
+ else:
+ gen = UnwatchedPagesPageGenerator(number = int(arg[11:]))
+ elif arg.startswith('-usercontribs'):
+ gen = UserContributionsGenerator(arg[14:])
+ elif arg.startswith('-withoutinterwiki'):
+ if len(arg) == 17:
+ gen = WithoutInterwikiPageGenerator()
+ else:
+ gen = WithoutInterwikiPageGenerator(number = int(arg[18:]))
+ elif arg.startswith('-interwiki'):
+ title = arg[11:]
+ if not title:
+ title = pywikibot.input(u'Which page should be processed?')
+ page = pywikibot.Page(pywikibot.Site(), title)
+ gen = InterwikiPageGenerator(page)
+ elif arg.startswith('-file'):
+ textfilename = arg[6:]
+ if not textfilename:
+ textfilename = pywikibot.input(
+ u'Please enter the local file name:')
+ gen = TextfilePageGenerator(textfilename)
+ elif arg.startswith('-catr'):
+ gen = self.setCategoryGen(arg, 5, recurse = True)
+ elif arg.startswith('-cat'):
+ gen = self.setCategoryGen(arg, 4)
+ elif arg.startswith('-subcatsr'):
+ gen = self.setSubCategoriesGen(arg, 9, recurse = True)
+ elif arg.startswith('-subcats'):
+ gen = self.setSubCategoriesGen(arg, 8)
+ elif arg.startswith('-uncatfiles'):
+ gen = UnCategorizedImageGenerator()
+ elif arg.startswith('-uncatcat'):
+ gen = UnCategorizedCategoryGenerator()
+ elif arg.startswith('-uncat'):
+ gen = UnCategorizedPageGenerator()
+ elif arg.startswith('-ref'):
+ referredPageTitle = arg[5:]
+ if not referredPageTitle:
+ referredPageTitle = pywikibot.input(
+ u'Links to which page should be processed?')
+ referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle)
+ gen = ReferringPageGenerator(referredPage)
+ elif arg.startswith('-links'):
+ linkingPageTitle = arg[7:]
+ if not linkingPageTitle:
+ linkingPageTitle = pywikibot.input(
+ u'Links from which page should be processed?')
+ linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle)
+ gen = LinkedPageGenerator(linkingPage)
+ elif arg.startswith('-weblink'):
+ url = arg[9:]
+ if not url:
+ url = pywikibot.input(
+ u'Pages with which weblink should be processed?')
+ gen = LinksearchPageGenerator(url)
+ elif arg.startswith('-transcludes'):
+ transclusionPageTitle = arg[len('-transcludes:'):]
+ if not transclusionPageTitle:
+ transclusionPageTitle = pywikibot.input(
+ u'Pages that transclude which page should be processed?')
+ transclusionPage = pywikibot.Page(pywikibot.Site(),
+ 'Template:%s' % transclusionPageTitle)
+ gen = ReferringPageGenerator(transclusionPage,
+ onlyTemplateInclusion=True)
+ elif arg.startswith('-start'):
+ if arg.startswith('-startxml'):
+ pywikibot.output(u'-startxml : wrong parameter')
+ raise ValueError
+ firstPageTitle = arg[7:]
+ if not firstPageTitle:
+ firstPageTitle = pywikibot.input(
+ u'At which page do you want to start?')
+ namespace = pywikibot.Page(pywikibot.Site(),
+ firstPageTitle).namespace()
+ firstPageTitle = pywikibot.Page(pywikibot.link(firstPageTitle)
+ ).titleWithoutNamespace()
+ gen = AllpagesPageGenerator(firstPageTitle, namespace,
+ includeredirects=False)
+ elif arg.startswith('-prefixindex'):
+ prefix = arg[13:]
+ namespace = None
+ if not prefix:
+ prefix = pywikibot.input(
+ u'What page names are you looking for?')
+ gen = PrefixingPageGenerator(prefix=prefix)
+ elif arg.startswith('-newimages'):
+ limit = arg[11:] or pywikibot.input(
+ u'How many images do you want to load?')
+ gen = NewimagesPageGenerator(number=int(limit))
+ elif arg.startswith('-new'):
+ if len(arg) >=5:
+ gen = NewpagesPageGenerator(number=int(arg[5:]))
+ else:
+ gen = NewpagesPageGenerator(number=60)
+ elif arg.startswith('-imagelinks'):
+ imagelinkstitle = arg[len('-imagelinks:'):]
+ if not imagelinkstitle:
+ imagelinkstitle = pywikibot.input(
+ u'Images on which page should be processed?')
+ imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle))
+ gen = ImagesPageGenerator(imagelinksPage)
+ elif arg.startswith('-search'):
+ mediawikiQuery = arg[8:]
+ if not mediawikiQuery:
+ mediawikiQuery = pywikibot.input(
+ u'What do you want to search for?')
+ # In order to be useful, all namespaces are required
+ gen = SearchPageGenerator(mediawikiQuery, namespaces = [])
+ elif arg.startswith('-google'):
+ gen = GoogleSearchPageGenerator(arg[8:])
+ elif arg.startswith('-titleregex'):
+ if len(arg) == 6:
+ regex = pywikibot.input(
+ u'What page names are you looking for?')
+ else:
+ regex = arg[7:]
+ gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex)
+ elif arg.startswith('-yahoo'):
+ gen = YahooSearchPageGenerator(arg[7:])
+ else:
+ return None
+ # make sure all yielded pages are unique
+ gen = DuplicateFilterPageGenerator(gen)
+ return gen
+
+
+class ThreadedGenerator(threading.Thread):
+ """Look-ahead generator class.
+
+ Runs a generator in a separate thread and queues the results; can
+ be called like a regular generator.
+
+ Subclasses should override self.generator, _not_ self.run
+
+ Important: the generator thread will stop itself if the generator's
+ internal queue is exhausted; but, if the calling program does not use
+ all the generated values, it must call the generator's stop() method to
+ stop the background thread. Example usage:
+
+ >>> gen = ThreadedGenerator(target=foo)
+ >>> try:
+ ... for data in gen:
+ ... do_work(data)
+ ... finally:
+ ... gen.stop()
+
+ """ #NOT CURRENTLY USED: Intended for future development
+
+ def __init__(self, group=None, target=None, name="GeneratorThread",
+ args=(), kwargs=None, qsize=65536):
+        """Constructor. Takes same keyword arguments as threading.Thread.
+
+ target must be a generator function (or other callable that returns
+ an iterable object).
+
+ @param qsize: The size of the lookahead queue. The larger the qsize,
+ the more values will be computed in advance of use (which can eat
+ up memory and processor time).
+ @type qsize: int
+
+ """
+ if kwargs is None:
+ kwargs = {}
+ if target:
+ self.generator = target
+ if not hasattr(self, "generator"):
+ raise RuntimeError("No generator for ThreadedGenerator to run.")
+ self.args, self.kwargs = args, kwargs
+ threading.Thread.__init__(self, group=group, name=name)
+ self.queue = Queue.Queue(qsize)
+ self.finished = threading.Event()
+
+ def __iter__(self):
+ """Iterate results from the queue."""
+ if not self.isAlive() and not self.finished.isSet():
+ self.start()
+ # if there is an item in the queue, yield it, otherwise wait
+ while not self.finished.isSet():
+ try:
+ yield self.queue.get(True, 0.25)
+ except Queue.Empty:
+ pass
+ except KeyboardInterrupt:
+ self.stop()
+
+ def stop(self):
+ """Stop the background thread."""
+## if not self.finished.isSet():
+## pywikibot.output("DEBUG: signalling %s to stop." % self)
+ self.finished.set()
+
+ def run(self):
+        """Run the generator and store the results on the queue."""
+ self.__gen = self.generator(*self.args, **self.kwargs)
+ for result in self.__gen:
+ while True:
+ if self.finished.isSet():
+##                pywikibot.output("DEBUG: %s received stop signal." % self)
+ return
+ try:
+ self.queue.put_nowait(result)
+ except Queue.Full:
+ time.sleep(0.25)
+ continue
+ break
+ # wait for queue to be emptied, then kill the thread
+ while not self.finished.isSet() and not self.queue.empty():
+ time.sleep(0.25)
+ self.stop()
+##        pywikibot.output("DEBUG: %s stopped because generator exhausted." % self)
+
+
+def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True,
+ site=None):
+ """
+ Using the Allpages special page, retrieve all articles' titles, and yield
+ page objects.
+ If includeredirects is False, redirects are not included. If
+ includeredirects equals the string 'only', only redirects are added.
+ """
+ if site is None:
+ site = pywikibot.getSite()
+ if includeredirects:
+ if includeredirects == 'only':
+ filterredir = True
+ else:
+ filterredir = None
+ else:
+ filterredir = False
+ return site.allpages(start=start, namespace=namespace,
+ filterredir=filterredir)
+
+
+def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True,
+ site=None):
+ if site is None:
+ site = pywikibot.Site()
+ page = pywikibot.Page(site, prefix)
+ if namespace is None:
+ namespace = page.namespace()
+ title = page.titleWithoutNamespace()
+ if includeredirects:
+ if includeredirects == 'only':
+ filterredir = True
+ else:
+ filterredir = None
+ else:
+ filterredir = False
+ return site.allpages(prefix=title, namespace=namespace,
+ filterredir=filterredir)
+
+
+def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False,
+ site=None):
+ # API does not (yet) have a newpages function, so this tries to duplicate
+ # it by filtering the recentchanges output
+ # defaults to namespace 0 because that's how Special:Newpages defaults
+ if site is None:
+ site = pywikibot.Site()
+ return site.recentchanges(limit=number, showredirects=get_redirect,
+ changetype="new", namespaces=0)
+
+
+def FileLinksGenerator(referredImagePage):
+ return referredImagePage.usingPages()
+
+
+def ImagesPageGenerator(pageWithImages):
+ return pageWithImages.imagelinks()
+
+
+def InterwikiPageGenerator(page):
+    """Iterator over all interwiki (non-language) links on a page."""
+ for link in page.interwiki():
+ yield pywikibot.Page(link)
+
+
+def LanguageLinksPageGenerator(page):
+    """Iterator over all interwiki language links on a page."""
+ for link in page.langlinks():
+ yield pywikibot.Page(link)
+
+
+def ReferringPageGenerator(referredPage, followRedirects=False,
+ withTemplateInclusion=True,
+ onlyTemplateInclusion=False):
+ '''Yields all pages referring to a specific page.'''
+ return referredPage.getReferences(
+ follow_redirects=followRedirects,
+ withTemplateInclusion=withTemplateInclusion,
+ onlyTemplateInclusion=onlyTemplateInclusion)
+
+
+def CategorizedPageGenerator(category, recurse=False, start=None):
+ '''Yield all pages in a specific category.
+
+ If recurse is True, pages in subcategories are included as well; if
+ recurse is an int, only subcategories to that depth will be included
+ (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
+ not go any further).
+ If start is a string value, only pages whose sortkey comes after start
+ alphabetically are included.
+
+ ''' # TODO: page generator could be modified to use cmstartsortkey ...
+ for a in category.articles(recurse=recurse):
+ if start is None or a.title(withNamespace=False) >= start:
+ yield a
+
+
+def SubCategoriesPageGenerator(category, recurse=False, start=None):
+ '''Yields all subcategories in a specific category.
+
+ If recurse is True, pages in subcategories are included as well; if
+ recurse is an int, only subcategories to that depth will be included
+ (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
+ not go any further).
+ If start is a string value, only categories whose sortkey comes after
+ start alphabetically are included.
+
+ ''' # TODO: page generator could be modified to use cmstartsortkey ...
+ for s in category.subcategories(recurse=recurse):
+ if start is None or s.title(withNamespace=False) >= start:
+ yield s
+
+
+def LinkedPageGenerator(linkingPage):
+ """Yields all pages linked from a specific page."""
+ return linkingPage.linkedPages()
+
+
+def TextfilePageGenerator(filename=None, site=None):
+ """Iterate pages from a list in a text file.
+
+ The file must contain page links between double-square-brackets. The
+ generator will yield each corresponding Page object.
+
+ @param filename: the name of the file that should be read. If no name is
+ given, the generator prompts the user.
+ @param site: the default Site for which Page objects should be created
+
+ """
+ if filename is None:
+ filename = pywikibot.input(u'Please enter the filename:')
+ if site is None:
+ site = pywikibot.Site()
+ f = codecs.open(filename, 'r', config.textfile_encoding)
+ for linkmatch in Rlink.finditer(f.read()):
+ # If the link is in interwiki format, the Page object may reside
+ # on a different Site than the default.
+ # This makes it possible to work on different wikis using a single
+ # text file, but also could be dangerous because you might
+ # inadvertently change pages on another wiki!
+ yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site))
+ f.close()
+
+
+def PagesFromTitlesGenerator(iterable, site=None):
+    """Generate pages from the titles (unicode strings) yielded by iterable."""
+ if site is None:
+ site = pywikibot.Site()
+ for title in iterable:
+ if not isinstance(title, basestring):
+ break
+ yield pywikibot.Page(pywikibot.Link(title, site))
+
+
+def UserContributionsGenerator(username, number=250, namespaces=None,
+ site=None):
+ """Yields number unique pages edited by user:username
+ namespaces : list of namespace numbers to fetch contribs from
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ return site.usercontribs(user=username, limit=number, namespaces=namespaces)
+
+
+def NamespaceFilterPageGenerator(generator, namespaces, site=None):
+ """
+ Wraps around another generator. Yields only those pages that are in one
+ of the given namespaces.
+
+ The namespace list can contain both integers (namespace numbers) and
+ strings/unicode strings (namespace names).
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ # convert namespace names to namespace numbers
+ for i in xrange(len(namespaces)):
+ ns = namespaces[i]
+ if isinstance(ns, basestring):
+ index = site.getNamespaceIndex(ns)
+ if index is None:
+ raise ValueError(u'Unknown namespace: %s' % ns)
+ namespaces[i] = index
+ for page in generator:
+ if page.namespace() in namespaces:
+ yield page
+
+
+def RedirectFilterPageGenerator(generator):
+    """Yields pages from another generator that are not redirects."""
+ for page in generator:
+ if not page.isRedirectPage():
+ yield page
+
+
+def DuplicateFilterPageGenerator(generator):
+    """Yield all unique pages from another generator, omitting duplicates."""
+ seenPages = {}
+ for page in generator:
+ if page not in seenPages:
+ seenPages[page] = None
+ yield page
+
+
+def RegexFilterPageGenerator(generator, regex):
+    """Yield pages from another generator whose titles match regex."""
+ reg = re.compile(regex, re.I)
+ for page in generator:
+ if reg.match(page.titleWithoutNamespace()):
+ yield page
+
+
+def CombinedPageGenerator(generators):
+ return itertools.chain(*generators)
+
+
+def CategoryGenerator(generator):
+ """Yield pages from another generator as Category objects.
+
+ Makes sense only if it is ascertained that only categories are being
+ retrieved.
+
+ """
+ for page in generator:
+ yield pywikibot.Category(page)
+
+
+def PageWithTalkPageGenerator(generator):
+ """
+ Wraps around another generator. Yields the same pages, but for non-talk
+ pages, it also includes associated talk pages.
+ This generator does not check if the talk page in fact exists.
+ """
+ for page in generator:
+ yield page
+ if not page.isTalkPage():
+ yield page.toggleTalkPage()
+
+
+def PreloadingGenerator(self, generator, pageNumber=60, lookahead=10):
+    """Yield preloaded pages taken from another generator."""
+
+ # pages may be on more than one site, for example if an interwiki
+ # generator is used, so use a separate preloader for each site
+ sites = {}
+ # build a list of pages for each site found in the iterator
+ for page in generator:
+ sites.setdefault(page.site(), []).append(page)
+ return itertools.chain(site.preloadpages(sites[site], pageNumber)
+ for site in sites)
+
+
+#TODO below
+
+def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.unusedfiles(number=number, repeat=repeat,
+ extension=extension):
+ yield pywikibot.ImagePage(page.site(), page.title())
+
+def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.withoutinterwiki(number=number, repeat=repeat):
+ yield page
+
+def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedcategories(number=number, repeat=repeat):
+ yield page
+
+def UnCategorizedImageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedimages(number=number, repeat=repeat):
+ yield page
+
+def NewimagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.newimages(number, repeat=repeat):
+ yield page[0]
+
+def UnCategorizedPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedpages(number=number, repeat=repeat):
+ yield page
+
+def LonelyPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.lonelypages(number=number, repeat=repeat):
+ yield page
+
+def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.unwatchedpages(number=number, repeat=repeat):
+ yield page
+
+def AncientPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.ancientpages(number=number, repeat=repeat):
+ yield page[0]
+
+def DeadendPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.deadendpages(number=number, repeat=repeat):
+ yield page
+
+def LongPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.longpages(number=number, repeat=repeat):
+ yield page[0]
+
+def ShortPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.shortpages(number=number, repeat=repeat):
+ yield page[0]
+
+def LinksearchPageGenerator(link, step=500, site=None):
+ """Yields all pages that include a specified link, according to
+ [[Special:Linksearch]].
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.linksearch(link, limit=step):
+ yield page
+
+def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
+ """
+ Provides a list of results using the internal MediaWiki search engine
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.search(query, number=number, namespaces = namespaces):
+ yield page[0]
+
+class YahooSearchPageGenerator:
+ '''
+ To use this generator, install pYsearch
+ '''
+ def __init__(self, query = None, count = 100, site = None): # values larger than 100
fail
+ self.query = query or pywikibot.input(u'Please enter the search query:')
+ self.count = count
+ if site is None:
+ site = pywikibot.Site()
+ self.site = site
+
+ def queryYahoo(self, query):
+ from yahoo.search.web import WebSearch
+ srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
+
+ dom = srch.get_results()
+ results = srch.parse_results(dom)
+ for res in results:
+ url = res.Url
+ yield url
+
+ def __iter__(self):
+ # restrict query to local site
+ localQuery = '%s site:%s' % (self.query, self.site.hostname())
+ base = 'http://%s%s' % (self.site.hostname(),
self.site.nice_get_address(''))
+ for url in self.queryYahoo(localQuery):
+ if url[:len(base)] == base:
+ title = url[len(base):]
+ page = pywikibot.Page(self.site, title)
+ yield page
+
+class GoogleSearchPageGenerator:
+ '''
+ To use this generator, you must install the pyGoogle module from
+
http://pygoogle.sf.net/ and get a Google Web API license key from
+
http://www.google.com/apis/index.html . The google_key must be set to your
+ license key in your configuration.
+ '''
+ def __init__(self, query = None, site = None):
+ self.query = query or pywikibot.input(u'Please enter the search query:')
+ if site is None:
+ site = pywikibot.Site()
+ self.site = site
+
+ #########
+ # partially commented out because it is probably not in compliance with Google's
"Terms of
+ # service" (see 5.3,
http://www.google.com/accounts/TOS?loc=US)
+ def queryGoogle(self, query):
+ #if config.google_key:
+ if True:
+ #try:
+ for url in self.queryViaSoapApi(query):
+ yield url
+ return
+ #except ImportError:
+ #pass
+ # No google license key, or pygoogle not installed. Do it the ugly way.
+ #for url in self.queryViaWeb(query):
+ # yield url
+
+ def queryViaSoapApi(self, query):
+ import google
+ google.LICENSE_KEY = config.google_key
+ offset = 0
+ estimatedTotalResultsCount = None
+ while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount:
+ while (True):
+ # Google often yields 502 errors.
+ try:
+ pywikibot.output(u'Querying Google, offset %i' % offset)
+ data = google.doGoogleSearch(query, start = offset, filter = False)
+ break
+ except KeyboardInterrupt:
+ raise
+ except:
+ # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
+ # can happen here, depending on the module used. It's not easy
+ # to catch this properly because pygoogle decides which one of
+ # the soap modules to use.
+ pywikibot.output(u"An error occured. Retrying in 10
seconds...")
+ time.sleep(10)
+ continue
+
+ for result in data.results:
+ #print 'DBG: ', result.URL
+ yield result.URL
+ # give an estimate of pages to work on, but only once.
+ if not estimatedTotalResultsCount:
+ pywikibot.output(u'Estimated total result count: %i pages.' %
data.meta.estimatedTotalResultsCount)
+ estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
+ #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
+ offset += 10
+
+ #########
+ # commented out because it is probably not in compliance with Google's
"Terms of
+ # service" (see 5.3,
http://www.google.com/accounts/TOS?loc=US)
+
+ #def queryViaWeb(self, query):
+ #"""
+ #Google has stopped giving out API license keys, and sooner or later
+ #they will probably shut down the service.
+ #This is a quick and ugly solution: we just grab the search results from
+ #the normal web interface.
+ #"""
+ #linkR = re.compile(r'<a href="([^>"]+?)"
class=l>', re.IGNORECASE)
+ #offset = 0
+
+ #while True:
+ #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
+ #address =
"http://www.google.com/search?q=%s&num=100&hl=en&start=%d" %
(urllib.quote_plus(query), offset)
+ ## we fake being Firefox because Google blocks unknown browsers
+ #request = urllib2.Request(address, None, {'User-Agent':
'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1
Firefox/1.5'})
+ #urlfile = urllib2.urlopen(request)
+ #page = urlfile.read()
+ #urlfile.close()
+ #for url in linkR.findall(page):
+ #yield url
+ #if "<div id=nn>" in page: # Is there a "Next" link
for next page of results?
+ #offset += 100 # Yes, go to next page of results.
+ #else:
+ #return
+ #########
+
+ def __iter__(self):
+ # restrict query to local site
+ localQuery = '%s site:%s' % (self.query, self.site.hostname())
+ base = 'http://%s%s' % (self.site.hostname(),
self.site.nice_get_address(''))
+ for url in self.queryGoogle(localQuery):
+ if url[:len(base)] == base:
+ title = url[len(base):]
+ page = pywikibot.Page(self.site, title)
+ # Google contains links in the format
http://de.wikipedia.org/wiki/en:Foobar
+ if page.site() == self.site:
+ yield page
+
+def MySQLPageGenerator(query, site = None):
+ import MySQLdb as mysqldb
+ if site is None:
+ site = pywikibot.Site()
+ conn = mysqldb.connect(config.db_hostname, db = site.dbName(),
+ user = config.db_username,
+ passwd = config.db_password)
+ cursor = conn.cursor()
+ pywikibot.output(u'Executing query:\n%s' % query)
+ query = query.encode(site.encoding())
+ cursor.execute(query)
+ while True:
+ try:
+ namespaceNumber, pageName = cursor.fetchone()
+ print namespaceNumber, pageName
+ except TypeError:
+ # Limit reached or no more results
+ break
+ #print pageName
+ if pageName:
+ namespace = site.namespace(namespaceNumber)
+ pageName = unicode(pageName, site.encoding())
+ if namespace:
+ pageTitle = '%s:%s' % (namespace, pageName)
+ else:
+ pageTitle = pageName
+ page = pywikibot.Page(site, pageTitle)
+ yield page
+
+def YearPageGenerator(start = 1, end = 2050, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ pywikibot.output(u"Starting with year %i" % start)
+ for i in xrange(start, end + 1):
+ if i % 100 == 0:
+ pywikibot.output(u'Preparing %i...' % i)
+ # There is no year 0
+ if i != 0:
+ current_year = date.formatYear(site.lang, i )
+ yield pywikibot.Page(site, current_year)
+
+def DayPageGenerator(startMonth = 1, endMonth = 12, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ fd = date.FormatDate(site)
+ firstPage = pywikibot.Page(site, fd(startMonth, 1))
+ pywikibot.output(u"Starting with %s" % firstPage.aslink())
+ for month in xrange(startMonth, endMonth+1):
+ for day in xrange(1, date.getNumberOfDaysInMonth(month)+1):
+ yield pywikibot.Page(site, fd(month, day))
+
+
+if __name__ == "__main__":
+ try:
+ gen = None
+ genFactory = GeneratorFactory()
+ for arg in pywikibot.handleArgs():
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ if gen:
+ for page in gen:
+ pywikibot.output(page.title(), toStdout = True)
+ else:
+ pywikibot.showHelp()
+ finally:
+ pywikibot.stopme()
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-09 22:38:39 UTC (rev 6135)
+++ branches/rewrite/pywikibot/site.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
"""
Objects representing MediaWiki sites (wikis) and families (groups of wikis
on the same topic in different languages).