Revision: 6136 Author: russblau Date: 2008-12-10 18:56:59 +0000 (Wed, 10 Dec 2008)
Log Message: ----------- Handling of command-line arguments
Modified Paths: -------------- branches/rewrite/pywikibot/__init__.py branches/rewrite/pywikibot/page.py branches/rewrite/pywikibot/site.py
Added Paths: ----------- branches/rewrite/pywikibot/bot.py branches/rewrite/pywikibot/pagegenerators.py
Modified: branches/rewrite/pywikibot/__init__.py =================================================================== --- branches/rewrite/pywikibot/__init__.py 2008-12-09 22:38:39 UTC (rev 6135) +++ branches/rewrite/pywikibot/__init__.py 2008-12-10 18:56:59 UTC (rev 6136) @@ -11,10 +11,12 @@
import sys import logging +import re
from exceptions import * import config2 as config import textlib +from bot import handleArgs, showHelp
def deprecate_arg(old_arg, new_arg): @@ -97,10 +99,13 @@ from page import Page, ImagePage, Category, Link
# Recognize wikilinks: [[title]] or [[title|label]].  The named group
# "title" excludes characters that are illegal in page titles.
# NOTE(review): the escapes below were stripped in the archived copy
# (it read r'[[(?P<title>...' which does not even define the group);
# restored to the obviously intended escaped form.
link_regex = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')


# User interface functions (kept extremely simple for debugging)
-def output(text): - print text +def output(text, toStdout=False): + print text.encode(config.console_encoding, "xmlcharrefreplace")
def input(prompt, password=False): if isinstance(prompt, unicode):
# -*- coding: utf-8 -*-
"""
User-interface related functions for building bots
"""
#
# (C) Pywikipedia bot team, 2008
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: $'

# Note: the intention is to develop this module (at some point) into a Bot
# class definition that can be subclassed to create new, functional bot
# scripts, instead of writing each one from scratch.


import logging
import os.path
import sys

import pywikibot
from pywikibot import config2 as config

# Count of -verbose/-v flags seen on the command line; incremented by
# handleArgs() and readable by bot scripts that want extra debug output.
# (The original module used "global verbose" without ever initializing it,
# so the first -v raised NameError.)
verbose = 0


def calledModuleName():
    """Return the name of the module calling this function.

    This is required because the -help option loads the module's docstring
    and because the module name will be used for the filename of the log.

    """
    # get commandline arguments
    called = sys.argv[0].strip()
    if ".py" in called:  # could end with .pyc, .pyw, etc. on some platforms
        called = called[:called.rindex(".py")]
    return os.path.basename(called)


def _decodeArg(arg):
    """Return the raw command-line argument decoded to unicode.

    Uses config.console_encoding, except on Western/Central-European
    Windows where the console encoding and the parameter encoding differ.

    """
    if sys.platform == 'win32':
        if config.console_encoding == 'cp850':
            # Western Windows versions give parameters encoded as
            # windows-1252 even though the console encoding is cp850.
            return unicode(arg, 'windows-1252')
        elif config.console_encoding == 'cp852':
            # Central/Eastern European Windows versions give parameters
            # encoded as windows-1250 even though the console encoding is
            # cp852.
            return unicode(arg, 'windows-1250')
        else:
            return unicode(arg, config.console_encoding)
    else:
        # Linux uses the same encoding for both.
        # I don't know how non-Western Windows versions behave.
        return unicode(arg, config.console_encoding)


def setLogfileStatus(enabled, logname=None):
    """Enable or disable the logfile.

    FIXME: logfile handling is not yet implemented in the rewrite branch;
    this stub exists so that -log/-nolog do not raise NameError (the
    original module called setLogfileStatus without defining it anywhere).

    @param enabled: True to enable logging, False to disable it
    @param logname: optional filename for the log

    """
    pass


def handleArgs(*args):
    """Handle standard command line arguments, return the rest as a list.

    Takes the commandline arguments, converts them to Unicode, processes
    all global parameters such as -lang or -log.  Returns a list of all
    arguments that are not global.  This makes sure that global arguments
    are applied first, regardless of the order in which the arguments
    were given.

    args may be passed as an argument, thereby overriding sys.argv

    """
    global verbose
    # get commandline arguments if necessary
    if not args:
        args = sys.argv[1:]
    # get the name of the module calling this function. This is
    # required because the -help option loads the module's docstring and
    # because the module name will be used for the filename of the log.
    moduleName = calledModuleName()
    nonGlobalArgs = []
    for arg in args:
        arg = _decodeArg(arg)
        if arg == '-help':
            showHelp(moduleName)
            sys.exit(0)
        elif arg.startswith('-family:'):
            config.family = arg[len('-family:'):]
        elif arg.startswith('-lang:'):
            config.code = arg[len('-lang:'):]
        elif arg.startswith('-putthrottle:'):
            config.put_throttle = int(arg[len('-putthrottle:'):])
        elif arg.startswith('-pt:'):
            config.put_throttle = int(arg[len('-pt:'):])
        elif arg == '-log':
            setLogfileStatus(True)  # FIXME
        elif arg.startswith('-log:'):
            setLogfileStatus(True, arg[len('-log:'):])  # FIXME
        elif arg == '-nolog':
            setLogfileStatus(False)  # FIXME
        elif arg == '-verbose' or arg == "-v":
            try:
                # the rewrite branch may not (yet) ship a version module;
                # the original code referenced `version` without importing
                # it, which raised NameError.
                import version
                pywikibot.output(u'Pywikipediabot %s'
                                 % (version.getversion()))
            except ImportError:
                pass
            pywikibot.output(u'Python %s' % (sys.version))
            verbose += 1  # FIXME
        elif arg == '-daemonize':
            import daemonize
            daemonize.daemonize()
        elif arg.startswith('-daemonize:'):
            import daemonize
            daemonize.daemonize(redirect_std=arg[len('-daemonize:'):])
        else:
            # the argument is not global. Let the specific bot script care
            # about it.
            nonGlobalArgs.append(arg)
    return nonGlobalArgs


def showHelp(moduleName=None):
    """Show the global help text, plus the calling module's docstring.

    @param moduleName: name of the module whose docstring should be shown;
        defaults to the module that invoked the framework.  (handleArgs
        passes this explicitly; the original zero-argument signature made
        that call raise TypeError.)

    """
    if moduleName is None:
        moduleName = calledModuleName()
    globalHelp = u'''\
Global arguments available for all bots:

-dir:PATH         Read the bot's configuration data from directory given by
                  PATH, instead of from the default directory.

-lang:xx          Set the language of the wiki you want to work on, overriding
                  the configuration in user-config.py. xx should be the
                  language code.

-family:xyz       Set the family of the wiki you want to work on, e.g.
                  wikipedia, wiktionary, wikitravel, ...
                  This will override the configuration in user-config.py.

-daemonize:xyz    Immediately returns control to the terminal and redirects
                  stdout and stderr to xyz (only use for bots that require
                  no input from stdin).

-help             Shows this help text.

-log              Enable the logfile. Logs will be stored in the logs
                  subdirectory.

-log:xyz          Enable the logfile, using xyz as the filename.

-nolog            Disable the logfile (if it is enabled by default).

-putthrottle:n    Set the minimum time (in seconds) the bot will wait between
-pt:n             saving pages.

-verbose          Have the bot provide additional output that may be useful in
-v                debugging.
'''
    try:
        # __import__ avoids building and exec'ing a code string from the
        # module name (the original used exec('import %s as module' ...)).
        module = __import__(moduleName)
        helpText = module.__doc__.decode('utf-8')
        if hasattr(module, 'docuReplacements'):
            for key, value in module.docuReplacements.iteritems():
                helpText = helpText.replace(key, value.strip('\n\r'))
        pywikibot.output(helpText)
    except Exception:
        pywikibot.output(u'Sorry, no help available for %s' % moduleName)
        logging.exception('showHelp:')
    pywikibot.output(globalHelp)
Modified: branches/rewrite/pywikibot/page.py =================================================================== --- branches/rewrite/pywikibot/page.py 2008-12-09 22:38:39 UTC (rev 6135) +++ branches/rewrite/pywikibot/page.py 2008-12-10 18:56:59 UTC (rev 6136) @@ -12,6 +12,7 @@ import pywikibot from pywikibot import deprecate_arg from pywikibot import config +import pywikibot.site import pywikibot.textlib
import htmlentitydefs @@ -1325,6 +1326,8 @@ titleWithSortKey = self.title(withSection=False) return '[[%s]]' % titleWithSortKey
+ @deprecate_arg("startFrom", None) + @deprecate_arg("cacheResults", None) def subcategories(self, recurse=False): """Iterate all subcategories of the current category.
@@ -1353,7 +1356,8 @@ for item in subcat.subcategories(recurse): yield item
- def articles(self, recurse=False, startFrom=None): + @deprecate_arg("startFrom", None) + def articles(self, recurse=False): """ Yields all articles in the current category.
@@ -1367,7 +1371,7 @@ namespaces = [x for x in self.site().namespaces().keys() if x>=0 and x!=14] for member in self.site().categorymembers(self, - namespaces=namespaces): + namespaces=namespaces): yield member if recurse: if not isinstance(recurse, bool) and recurse:
Added: branches/rewrite/pywikibot/pagegenerators.py =================================================================== --- branches/rewrite/pywikibot/pagegenerators.py (rev 0) +++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-10 18:56:59 UTC (rev 6136) @@ -0,0 +1,965 @@ +# -*- coding: utf-8 -*- +"""This module offers a wide variety of page generators. A page generator is an +object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and +that yields page objects on which other scripts can then work. + +In general, there is no need to run this script directly. It can, however, +be run for testing purposes. It will then print the page titles to standard +output. + +These parameters are supported to specify which pages titles to print: + +¶ms; +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id: $' + +import pywikibot + +import itertools +import Queue +import re +import sys +import threading + + +# ported from version 1 for backwards-compatibility +# most of these functions just wrap a Site or Page method that returns +# a generator + +parameterHelp = """\ +-cat Work on all pages which are in a specific category. + Argument can also be given as "-cat:categoryname" or + as "-cat:categoryname|fromtitle". + +-catr Like -cat, but also recursively includes pages in + subcategories, sub-subcategories etc. of the + given category. + Argument can also be given as "-catr:categoryname" or + as "-catr:categoryname|fromtitle". + +-subcats Work on all subcategories of a specific category. + Argument can also be given as "-subcats:categoryname" or + as "-subcats:categoryname|fromtitle". + +-subcatsr Like -subcats, but also includes sub-subcategories etc. of + the given category. + Argument can also be given as "-subcatsr:categoryname" or + as "-subcatsr:categoryname|fromtitle". + +-uncat Work on all pages which are not categorised. + +-uncatcat Work on all categories which are not categorised. 
+ +-uncatfiles Work on all files which are not categorised. + +-file Read a list of pages to treat from the named text file. + Page titles in the file must be enclosed with [[brackets]]. + Argument can also be given as "-file:filename". + +-filelinks Work on all pages that use a certain image/media file. + Argument can also be given as "-filelinks:filename". + +-yahoo Work on all pages that are found in a Yahoo search. + Depends on python module pYsearch. See yahoo_appid in + config.py for instructions. + +-search Work on all pages that are found in a MediaWiki search + across all namespaces. + +-google Work on all pages that are found in a Google search. + You need a Google Web API license key. Note that Google + doesn't give out license keys anymore. See google_key in + config.py for instructions. + Argument can also be given as "-google:searchstring". + +-interwiki Work on the given page and all equivalent pages in other + languages. This can, for example, be used to fight + multi-site spamming. + Attention: this will cause the bot to modify + pages on several wiki sites, this is not well tested, + so check your edits! + +-links Work on all pages that are linked from a certain page. + Argument can also be given as "-links:linkingpagetitle". + +-new Work on the 60 newest pages. If given as -new:x, will work + on the x newest pages. + +-imagelinks Work on all images that are linked from a certain page. + Argument can also be given as "-imagelinks:linkingpagetitle". + +-newimages Work on the 100 newest images. If given as -newimages:x, + will work on the x newest images. + +-ref Work on all pages that link to a certain page. + Argument can also be given as "-ref:referredpagetitle". + +-start Specifies that the robot should go alphabetically through + all pages on the home wiki, starting at the named page. + Argument can also be given as "-start:pagetitle". + + You can also include a namespace. For example, + "-start:Template:!" 
will make the bot work on all pages + in the template namespace. + +-prefixindex Work on pages commencing with a common prefix. + +-regex Obsolete, use -titleregex + +-titleregex Work on titles that match the given regular expression. + +-transcludes Work on all pages that use a certain template. + Argument can also be given as "-transcludes:Template:Title". + +-unusedfiles Work on all description pages of images/media files that are + not used anywhere. + Argument can be given as "-unusedfiles:n" where + n is the maximum number of articles to work on. + +-unwatched Work on all articles that are not watched by anyone. + Argument can be given as "-unwatched:n" where + n is the maximum number of articles to work on. + +-usercontribs Work on all articles that were edited by a certain user : + Example : -usercontribs:DumZiBoT + +-weblink Work on all articles that contain an external link to + a given URL; may be given as "-weblink:url" + +-withoutinterwiki Work on all pages that don't have interlanguage links. + Argument can be given as "-withoutinterwiki:n" where + n is some number (??). 
+""" + +docuReplacements = {'¶ms;': parameterHelp} + +# if a bot uses GeneratorFactory, the module should include the line +# docuReplacements = {'¶ms;': pywikibot.pagegenerators.parameterHelp} +# and include the marker ¶ms; in the module's docstring + + +class GeneratorFactory(object): + """Process command line arguments and return appropriate page generator.""" + + def setCategoryGen(self, arg, length, recurse = False): + if len(arg) == length: + categoryname = pywikibot.input(u'Please enter the category name:') + else: + categoryname = arg[length + 1:] + + ind = categoryname.find('|') + if ind > 0: + startfrom = categoryname[ind + 1:] + categoryname = categoryname[:ind] + else: + startfrom = None + + cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname)) + return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse) + + def setSubCategoriesGen(self, arg, length, recurse=False): + if len(arg) == length: + categoryname = pywikibot.input(u'Please enter the category name:') + else: + categoryname = arg[length + 1:] + + ind = categoryname.find('|') + if ind > 0: + startfrom = categoryname[ind + 1:] + categoryname = categoryname[:ind] + else: + startfrom = None + + cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname)) + return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse) + + def handleArg(self, arg): + gen = None + if arg.startswith('-filelinks'): + fileLinksPageTitle = arg[11:] + if not fileLinksPageTitle: + fileLinksPageTitle = pywikibot.input( + u'Links to which image page should be processed?') + if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6) + + ":"): + fileLinksPage = pywikibot.ImagePage(pywikibot.Site(), + fileLinksPageTitle) + else: + fileLinksPage = pywikibot.ImagePage(pywikibot.Site(), + 'Image:' + + fileLinksPageTitle) + gen = FileLinksGenerator(fileLinksPage) + elif arg.startswith('-unusedfiles'): + if len(arg) == 12: + gen = UnusedFilesGenerator() + else: + gen = 
UnusedFilesGenerator(number = int(arg[13:])) + elif arg.startswith('-unwatched'): + if len(arg) == 10: + gen = UnwatchedPagesPageGenerator() + else: + gen = UnwatchedPagesPageGenerator(number = int(arg[11:])) + elif arg.startswith('-usercontribs'): + gen = UserContributionsGenerator(arg[14:]) + elif arg.startswith('-withoutinterwiki'): + if len(arg) == 17: + gen = WithoutInterwikiPageGenerator() + else: + gen = WithoutInterwikiPageGenerator(number = int(arg[18:])) + elif arg.startswith('-interwiki'): + title = arg[11:] + if not title: + title = pywikibot.input(u'Which page should be processed?') + page = pywikibot.Page(pywikibot.Site(), title) + gen = InterwikiPageGenerator(page) + elif arg.startswith('-file'): + textfilename = arg[6:] + if not textfilename: + textfilename = pywikibot.input( + u'Please enter the local file name:') + gen = TextfilePageGenerator(textfilename) + elif arg.startswith('-catr'): + gen = self.setCategoryGen(arg, 5, recurse = True) + elif arg.startswith('-cat'): + gen = self.setCategoryGen(arg, 4) + elif arg.startswith('-subcatsr'): + gen = self.setSubCategoriesGen(arg, 9, recurse = True) + elif arg.startswith('-subcats'): + gen = self.setSubCategoriesGen(arg, 8) + elif arg.startswith('-uncatfiles'): + gen = UnCategorizedImageGenerator() + elif arg.startswith('-uncatcat'): + gen = UnCategorizedCategoryGenerator() + elif arg.startswith('-uncat'): + gen = UnCategorizedPageGenerator() + elif arg.startswith('-ref'): + referredPageTitle = arg[5:] + if not referredPageTitle: + referredPageTitle = pywikibot.input( + u'Links to which page should be processed?') + referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle) + gen = ReferringPageGenerator(referredPage) + elif arg.startswith('-links'): + linkingPageTitle = arg[7:] + if not linkingPageTitle: + linkingPageTitle = pywikibot.input( + u'Links from which page should be processed?') + linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle) + gen = 
LinkedPageGenerator(linkingPage) + elif arg.startswith('-weblink'): + url = arg[9:] + if not url: + url = pywikibot.input( + u'Pages with which weblink should be processed?') + gen = LinksearchPageGenerator(url) + elif arg.startswith('-transcludes'): + transclusionPageTitle = arg[len('-transcludes:'):] + if not transclusionPageTitle: + transclusionPageTitle = pywikibot.input( + u'Pages that transclude which page should be processed?') + transclusionPage = pywikibot.Page(pywikibot.Site(), + 'Template:%s' % transclusionPageTitle) + gen = ReferringPageGenerator(transclusionPage, + onlyTemplateInclusion=True) + elif arg.startswith('-start'): + if arg.startswith('-startxml'): + pywikibot.output(u'-startxml : wrong parameter') + raise ValueError + firstPageTitle = arg[7:] + if not firstPageTitle: + firstPageTitle = pywikibot.input( + u'At which page do you want to start?') + namespace = pywikibot.Page(pywikibot.Site(), + firstPageTitle).namespace() + firstPageTitle = pywikibot.Page(pywikibot.link(firstPageTitle) + ).titleWithoutNamespace() + gen = AllpagesPageGenerator(firstPageTitle, namespace, + includeredirects=False) + elif arg.startswith('-prefixindex'): + prefix = arg[13:] + namespace = None + if not prefix: + prefix = pywikibot.input( + u'What page names are you looking for?') + gen = PrefixingPageGenerator(prefix=prefix) + elif arg.startswith('-newimages'): + limit = arg[11:] or pywikibot.input( + u'How many images do you want to load?') + gen = NewimagesPageGenerator(number=int(limit)) + elif arg.startswith('-new'): + if len(arg) >=5: + gen = NewpagesPageGenerator(number=int(arg[5:])) + else: + gen = NewpagesPageGenerator(number=60) + elif arg.startswith('-imagelinks'): + imagelinkstitle = arg[len('-imagelinks:'):] + if not imagelinkstitle: + imagelinkstitle = pywikibot.input( + u'Images on which page should be processed?') + imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle)) + gen = ImagesPageGenerator(imagelinksPage) + elif 
arg.startswith('-search'): + mediawikiQuery = arg[8:] + if not mediawikiQuery: + mediawikiQuery = pywikibot.input( + u'What do you want to search for?') + # In order to be useful, all namespaces are required + gen = SearchPageGenerator(mediawikiQuery, namespaces = []) + elif arg.startswith('-google'): + gen = GoogleSearchPageGenerator(arg[8:]) + elif arg.startswith('-titleregex'): + if len(arg) == 6: + regex = pywikibot.input( + u'What page names are you looking for?') + else: + regex = arg[7:] + gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex) + elif arg.startswith('-yahoo'): + gen = YahooSearchPageGenerator(arg[7:]) + else: + return None + # make sure all yielded pages are unique + gen = DuplicateFilterPageGenerator(gen) + return gen + + +class ThreadedGenerator(threading.Thread): + """Look-ahead generator class. + + Runs a generator in a separate thread and queues the results; can + be called like a regular generator. + + Subclasses should override self.generator, _not_ self.run + + Important: the generator thread will stop itself if the generator's + internal queue is exhausted; but, if the calling program does not use + all the generated values, it must call the generator's stop() method to + stop the background thread. Example usage: + + >>> gen = ThreadedGenerator(target=foo) + >>> try: + ... for data in gen: + ... do_work(data) + ... finally: + ... gen.stop() + + """ #NOT CURRENTLY USED: Intended for future development + + def __init__(self, group=None, target=None, name="GeneratorThread", + args=(), kwargs=None, qsize=65536): + """Constructor. Takes same keyword arguments as threading.Thread. + + target must be a generator function (or other callable that returns + an iterable object). + + @param qsize: The size of the lookahead queue. The larger the qsize, + the more values will be computed in advance of use (which can eat + up memory and processor time). 
+ @type qsize: int + + """ + if kwargs is None: + kwargs = {} + if target: + self.generator = target + if not hasattr(self, "generator"): + raise RuntimeError("No generator for ThreadedGenerator to run.") + self.args, self.kwargs = args, kwargs + threading.Thread.__init__(self, group=group, name=name) + self.queue = Queue.Queue(qsize) + self.finished = threading.Event() + + def __iter__(self): + """Iterate results from the queue.""" + if not self.isAlive() and not self.finished.isSet(): + self.start() + # if there is an item in the queue, yield it, otherwise wait + while not self.finished.isSet(): + try: + yield self.queue.get(True, 0.25) + except Queue.Empty: + pass + except KeyboardInterrupt: + self.stop() + + def stop(self): + """Stop the background thread.""" +## if not self.finished.isSet(): +## pywikibot.output("DEBUG: signalling %s to stop." % self) + self.finished.set() + + def run(self): + """Run the generator and store the results on the queue.""" + self.__gen = self.generator(*self.args, **self.kwargs) + for result in self.__gen: + while True: + if self.finished.isSet(): +## pywikibot.output("DEBUG: %s received stop signal." % self) + return + try: + self.queue.put_nowait(result) + except Queue.Full: + time.sleep(0.25) + continue + break + # wait for queue to be emptied, then kill the thread + while not self.finished.isSet() and not self.queue.empty(): + time.sleep(0.25) + self.stop() +## pywikibot.output("DEBUG: %s stopped because generator exhausted." % self) + + +def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True, + site=None): + """ + Using the Allpages special page, retrieve all articles' titles, and yield + page objects. + If includeredirects is False, redirects are not included. If + includeredirects equals the string 'only', only redirects are added. 
+ """ + if site is None: + site = pywikibot.getSite() + if includeredirects: + if includeredirects == 'only': + filterredir = True + else: + filterredir = None + else: + filterredir = False + return site.allpages(start=start, namespace=namespace, + filterredir=filterredir) + + +def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True, + site=None): + if site is None: + site = pywikibot.Site() + page = pywikibot.Page(site, prefix) + if namespace is None: + namespace = page.namespace() + title = page.titleWithoutNamespace() + if includeredirects: + if includeredirects == 'only': + filterredir = True + else: + filterredir = None + else: + filterredir = False + return site.allpages(prefix=title, namespace=namespace, + filterredir=filterredir) + + +def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False, + site=None): + # API does not (yet) have a newpages function, so this tries to duplicate + # it by filtering the recentchanges output + # defaults to namespace 0 because that's how Special:Newpages defaults + if site is None: + site = pywikibot.Site() + return site.recentchanges(limit=number, showredirects=get_redirect, + changetype="new", namespaces=0) + + +def FileLinksGenerator(referredImagePage): + return referredImagePage.usingPages() + + +def ImagesPageGenerator(pageWithImages): + return pageWithImages.imagelinks() + + +def InterwikiPageGenerator(page): + """Iterator over all interwiki (non-language) links on a page.""" + for link in page.interwiki(): + yield pywikibot.Page(link) + + +def LanguageLinksPageGenerator(page): + """Iterator over all interwiki language links on a page.""" + for link in page.langlinks(): + yield pywikibot.Page(link) + + +def ReferringPageGenerator(referredPage, followRedirects=False, + withTemplateInclusion=True, + onlyTemplateInclusion=False): + '''Yields all pages referring to a specific page.''' + return referredPage.getReferences( + follow_redirects=followRedirects, + 
withTemplateInclusion=withTemplateInclusion, + onlyTemplateInclusion=onlyTemplateInclusion) + + +def CategorizedPageGenerator(category, recurse=False, start=None): + '''Yield all pages in a specific category. + + If recurse is True, pages in subcategories are included as well; if + recurse is an int, only subcategories to that depth will be included + (e.g., recurse=2 will get pages in subcats and sub-subcats, but will + not go any further). + If start is a string value, only pages whose sortkey comes after start + alphabetically are included. + + ''' # TODO: page generator could be modified to use cmstartsortkey ... + for a in category.articles(recurse=recurse): + if start is None or a.title(withNamespace=False) >= start: + yield a + + +def SubCategoriesPageGenerator(category, recurse=False, start=None): + '''Yields all subcategories in a specific category. + + If recurse is True, pages in subcategories are included as well; if + recurse is an int, only subcategories to that depth will be included + (e.g., recurse=2 will get pages in subcats and sub-subcats, but will + not go any further). + If start is a string value, only categories whose sortkey comes after + start alphabetically are included. + + ''' # TODO: page generator could be modified to use cmstartsortkey ... + for s in category.subcategories(recurse=recurse): + if start is None or s.title(withNamespace=False) >= start: + yield s + + +def LinkedPageGenerator(linkingPage): + """Yields all pages linked from a specific page.""" + return linkingPage.linkedPages() + + +def TextfilePageGenerator(filename=None, site=None): + """Iterate pages from a list in a text file. + + The file must contain page links between double-square-brackets. The + generator will yield each corresponding Page object. + + @param filename: the name of the file that should be read. If no name is + given, the generator prompts the user. 
+ @param site: the default Site for which Page objects should be created + + """ + if filename is None: + filename = pywikibot.input(u'Please enter the filename:') + if site is None: + site = pywikibot.Site() + f = codecs.open(filename, 'r', config.textfile_encoding) + for linkmatch in Rlink.finditer(f.read()): + # If the link is in interwiki format, the Page object may reside + # on a different Site than the default. + # This makes it possible to work on different wikis using a single + # text file, but also could be dangerous because you might + # inadvertently change pages on another wiki! + yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site)) + f.close() + + +def PagesFromTitlesGenerator(iterable, site=None): + """Generate pages from the titles (unicode strings) yielded by iterable.""" + if site is None: + site = pywikibot.Site() + for title in iterable: + if not isinstance(title, basestring): + break + yield pywikibot.Page(pywikibot.Link(title, site)) + + +def UserContributionsGenerator(username, number=250, namespaces=None, + site=None): + """Yields number unique pages edited by user:username + namespaces : list of namespace numbers to fetch contribs from + + """ + if site is None: + site = pywikibot.Site() + return site.usercontribs(user=username, limit=number, namespaces=namespaces) + + +def NamespaceFilterPageGenerator(generator, namespaces, site=None): + """ + Wraps around another generator. Yields only those pages that are in one + of the given namespaces. + + The namespace list can contain both integers (namespace numbers) and + strings/unicode strings (namespace names). 
+ + """ + if site is None: + site = pywikibot.Site() + # convert namespace names to namespace numbers + for i in xrange(len(namespaces)): + ns = namespaces[i] + if isinstance(ns, basestring): + index = site.getNamespaceIndex(ns) + if index is None: + raise ValueError(u'Unknown namespace: %s' % ns) + namespaces[i] = index + for page in generator: + if page.namespace() in namespaces: + yield page + + +def RedirectFilterPageGenerator(generator): + """Yields pages from another generator that are not redirects.""" + for page in generator: + if not page.isRedirectPage(): + yield page + + +def DuplicateFilterPageGenerator(generator): + """Yield all unique pages from another generator, omitting duplicates.""" + seenPages = {} + for page in generator: + if page not in seenPages: + seenPages[page] = None + yield page + + +def RegexFilterPageGenerator(generator, regex): + """Yield pages from another generator whose titles match regex.""" + reg = re.compile(regex, re.I) + for page in generator: + if reg.match(page.titleWithoutNamespace()): + yield page + + +def CombinedPageGenerator(generators): + return itertools.chain(*generators) + + +def CategoryGenerator(generator): + """Yield pages from another generator as Category objects. + + Makes sense only if it is ascertained that only categories are being + retrieved. + + """ + for page in generator: + yield pywikibot.Category(page) + + +def PageWithTalkPageGenerator(generator): + """ + Wraps around another generator. Yields the same pages, but for non-talk + pages, it also includes associated talk pages. + This generator does not check if the talk page in fact exists. 
+ """ + for page in generator: + yield page + if not page.isTalkPage(): + yield page.toggleTalkPage() + + +def PreloadingGenerator(self, generator, pageNumber=60, lookahead=10): + """Yield preloaded pages taken from another generator.""" + + # pages may be on more than one site, for example if an interwiki + # generator is used, so use a separate preloader for each site + sites = {} + # build a list of pages for each site found in the iterator + for page in generator: + sites.setdefault(page.site(), []).append(page) + return itertools.chain(site.preloadpages(sites[site], pageNumber) + for site in sites) + + +#TODO below + +def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None): + if site is None: + site = pywikibot.Site() + for page in site.unusedfiles(number=number, repeat=repeat, + extension=extension): + yield pywikibot.ImagePage(page.site(), page.title()) + +def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None): + if site is None: + site = pywikibot.Site() + for page in site.withoutinterwiki(number=number, repeat=repeat): + yield page + +def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.uncategorizedcategories(number=number, repeat=repeat): + yield page + +def UnCategorizedImageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.uncategorizedimages(number=number, repeat=repeat): + yield page + +def NewimagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.newimages(number, repeat=repeat): + yield page[0] + +def UnCategorizedPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.uncategorizedpages(number=number, repeat=repeat): + yield page + +def LonelyPagesPageGenerator(number = 100, repeat = False, site = None): + if site is 
None: + site = pywikibot.Site() + for page in site.lonelypages(number=number, repeat=repeat): + yield page + +def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.unwatchedpages(number=number, repeat=repeat): + yield page + +def AncientPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.ancientpages(number=number, repeat=repeat): + yield page[0] + +def DeadendPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.deadendpages(number=number, repeat=repeat): + yield page + +def LongPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.longpages(number=number, repeat=repeat): + yield page[0] + +def ShortPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.shortpages(number=number, repeat=repeat): + yield page[0] + +def LinksearchPageGenerator(link, step=500, site=None): + """Yields all pages that include a specified link, according to + [[Special:Linksearch]]. 
+ + """ + if site is None: + site = pywikibot.Site() + for page in site.linksearch(link, limit=step): + yield page + +def SearchPageGenerator(query, number = 100, namespaces = None, site = None): + """ + Provides a list of results using the internal MediaWiki search engine + """ + if site is None: + site = pywikibot.Site() + for page in site.search(query, number=number, namespaces = namespaces): + yield page[0] + +class YahooSearchPageGenerator: + ''' + To use this generator, install pYsearch + ''' + def __init__(self, query = None, count = 100, site = None): # values larger than 100 fail + self.query = query or pywikibot.input(u'Please enter the search query:') + self.count = count + if site is None: + site = pywikibot.Site() + self.site = site + + def queryYahoo(self, query): + from yahoo.search.web import WebSearch + srch = WebSearch(config.yahoo_appid, query=query, results=self.count) + + dom = srch.get_results() + results = srch.parse_results(dom) + for res in results: + url = res.Url + yield url + + def __iter__(self): + # restrict query to local site + localQuery = '%s site:%s' % (self.query, self.site.hostname()) + base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address('')) + for url in self.queryYahoo(localQuery): + if url[:len(base)] == base: + title = url[len(base):] + page = pywikibot.Page(self.site, title) + yield page + +class GoogleSearchPageGenerator: + ''' + To use this generator, you must install the pyGoogle module from + http://pygoogle.sf.net/ and get a Google Web API license key from + http://www.google.com/apis/index.html . The google_key must be set to your + license key in your configuration. 
+ ''' + def __init__(self, query = None, site = None): + self.query = query or pywikibot.input(u'Please enter the search query:') + if site is None: + site = pywikibot.Site() + self.site = site + + ######### + # partially commented out because it is probably not in compliance with Google's "Terms of + # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US) + def queryGoogle(self, query): + #if config.google_key: + if True: + #try: + for url in self.queryViaSoapApi(query): + yield url + return + #except ImportError: + #pass + # No google license key, or pygoogle not installed. Do it the ugly way. + #for url in self.queryViaWeb(query): + # yield url + + def queryViaSoapApi(self, query): + import google + google.LICENSE_KEY = config.google_key + offset = 0 + estimatedTotalResultsCount = None + while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount: + while (True): + # Google often yields 502 errors. + try: + pywikibot.output(u'Querying Google, offset %i' % offset) + data = google.doGoogleSearch(query, start = offset, filter = False) + break + except KeyboardInterrupt: + raise + except: + # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway) + # can happen here, depending on the module used. It's not easy + # to catch this properly because pygoogle decides which one of + # the soap modules to use. + pywikibot.output(u"An error occured. Retrying in 10 seconds...") + time.sleep(10) + continue + + for result in data.results: + #print 'DBG: ', result.URL + yield result.URL + # give an estimate of pages to work on, but only once. + if not estimatedTotalResultsCount: + pywikibot.output(u'Estimated total result count: %i pages.' 
% data.meta.estimatedTotalResultsCount) + estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount + #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount + offset += 10 + + ######### + # commented out because it is probably not in compliance with Google's "Terms of + # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US) + + #def queryViaWeb(self, query): + #""" + #Google has stopped giving out API license keys, and sooner or later + #they will probably shut down the service. + #This is a quick and ugly solution: we just grab the search results from + #the normal web interface. + #""" + #linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE) + #offset = 0 + + #while True: + #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1)) + #address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(query), offset) + ## we fake being Firefox because Google blocks unknown browsers + #request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5'}) + #urlfile = urllib2.urlopen(request) + #page = urlfile.read() + #urlfile.close() + #for url in linkR.findall(page): + #yield url + #if "<div id=nn>" in page: # Is there a "Next" link for next page of results? + #offset += 100 # Yes, go to next page of results. 
+ #else: + #return + ######### + + def __iter__(self): + # restrict query to local site + localQuery = '%s site:%s' % (self.query, self.site.hostname()) + base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address('')) + for url in self.queryGoogle(localQuery): + if url[:len(base)] == base: + title = url[len(base):] + page = pywikibot.Page(self.site, title) + # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar + if page.site() == self.site: + yield page + +def MySQLPageGenerator(query, site = None): + import MySQLdb as mysqldb + if site is None: + site = pywikibot.Site() + conn = mysqldb.connect(config.db_hostname, db = site.dbName(), + user = config.db_username, + passwd = config.db_password) + cursor = conn.cursor() + pywikibot.output(u'Executing query:\n%s' % query) + query = query.encode(site.encoding()) + cursor.execute(query) + while True: + try: + namespaceNumber, pageName = cursor.fetchone() + print namespaceNumber, pageName + except TypeError: + # Limit reached or no more results + break + #print pageName + if pageName: + namespace = site.namespace(namespaceNumber) + pageName = unicode(pageName, site.encoding()) + if namespace: + pageTitle = '%s:%s' % (namespace, pageName) + else: + pageTitle = pageName + page = pywikibot.Page(site, pageTitle) + yield page + +def YearPageGenerator(start = 1, end = 2050, site = None): + if site is None: + site = pywikibot.Site() + pywikibot.output(u"Starting with year %i" % start) + for i in xrange(start, end + 1): + if i % 100 == 0: + pywikibot.output(u'Preparing %i...' 
% i) + # There is no year 0 + if i != 0: + current_year = date.formatYear(site.lang, i ) + yield pywikibot.Page(site, current_year) + +def DayPageGenerator(startMonth = 1, endMonth = 12, site = None): + if site is None: + site = pywikibot.Site() + fd = date.FormatDate(site) + firstPage = pywikibot.Page(site, fd(startMonth, 1)) + pywikibot.output(u"Starting with %s" % firstPage.aslink()) + for month in xrange(startMonth, endMonth+1): + for day in xrange(1, date.getNumberOfDaysInMonth(month)+1): + yield pywikibot.Page(site, fd(month, day)) + + +if __name__ == "__main__": + try: + gen = None + genFactory = GeneratorFactory() + for arg in pywikibot.handleArgs(): + generator = genFactory.handleArg(arg) + if generator: + gen = generator + if gen: + for page in gen: + pywikibot.output(page.title(), toStdout = True) + else: + pywikibot.showHelp() + finally: + pywikibot.stopme()
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-12-09 22:38:39 UTC (rev 6135) +++ branches/rewrite/pywikibot/site.py 2008-12-10 18:56:59 UTC (rev 6136) @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- + # -*- coding: utf-8 -*- """ Objects representing MediaWiki sites (wikis) and families (groups of wikis on the same topic in different languages).