Revision: 6136
Author: russblau
Date: 2008-12-10 18:56:59 +0000 (Wed, 10 Dec 2008)
Log Message:
-----------
Handling of command-line arguments
Modified Paths:
--------------
branches/rewrite/pywikibot/__init__.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
Added Paths:
-----------
branches/rewrite/pywikibot/bot.py
branches/rewrite/pywikibot/pagegenerators.py
Modified: branches/rewrite/pywikibot/__init__.py
===================================================================
--- branches/rewrite/pywikibot/__init__.py 2008-12-09 22:38:39 UTC (rev 6135)
+++ branches/rewrite/pywikibot/__init__.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -11,10 +11,12 @@
import sys
import logging
+import re
from exceptions import *
import config2 as config
import textlib
+from bot import handleArgs, showHelp
def deprecate_arg(old_arg, new_arg):
@@ -97,10 +99,13 @@
from page import Page, ImagePage, Category, Link
+link_regex = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+
+
# User interface functions (kept extremely simple for debugging)
-def output(text):
- print text
+def output(text, toStdout=False):
+ print text.encode(config.console_encoding, "xmlcharrefreplace")
def input(prompt, password=False):
if isinstance(prompt, unicode):
Added: branches/rewrite/pywikibot/bot.py
===================================================================
--- branches/rewrite/pywikibot/bot.py (rev 0)
+++ branches/rewrite/pywikibot/bot.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+User-interface related functions for building bots
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+# Note: the intention is to develop this module (at some point) into a Bot
+# class definition that can be subclassed to create new, functional bot
+# scripts, instead of writing each one from scratch.
+
+
+import os.path
+import sys
+import pywikibot
+from pywikibot import config2 as config
+
+
+def calledModuleName():
+ """Return the name of the module calling this function.
+
+ This is required because the -help option loads the module's docstring
+ and because the module name will be used for the filename of the log.
+
+ """
+ # get commandline arguments
+ called = sys.argv[0].strip()
+ if ".py" in called: # could end with .pyc, .pyw, etc. on some platforms
+ called = called[ : called.rindex(".py")]
+ return os.path.basename(called)
+
+
+def _decodeArg(arg):
+ if sys.platform=='win32':
+ if config.console_encoding == 'cp850':
+ # Western Windows versions give parameters encoded as windows-1252
+ # even though the console encoding is cp850.
+ return unicode(arg, 'windows-1252')
+ elif config.console_encoding == 'cp852':
+ # Central/Eastern European Windows versions give parameters encoded
+ # as windows-1250 even though the console encoding is cp852.
+ return unicode(arg, 'windows-1250')
+ else:
+ return unicode(arg, config.console_encoding)
+ else:
+ # Linux uses the same encoding for both.
+ # I don't know how non-Western Windows versions behave.
+ return unicode(arg, config.console_encoding)
+
+
+def handleArgs(*args):
+ """Handle standard command line arguments, return the rest as a list.
+
+ Takes the commandline arguments, converts them to Unicode, processes all
+ global parameters such as -lang or -log. Returns a list of all arguments
+ that are not global. This makes sure that global arguments are applied
+ first, regardless of the order in which the arguments were given.
+
+ args may be passed as an argument, thereby overriding sys.argv
+
+ """
+ global verbose
+ # get commandline arguments if necessary
+ if not args:
+ args = sys.argv[1:]
+ # get the name of the module calling this function. This is
+ # required because the -help option loads the module's docstring and because
+ # the module name will be used for the filename of the log.
+ moduleName = calledModuleName()
+ nonGlobalArgs = []
+ for arg in args:
+ arg = _decodeArg(arg)
+ if arg == '-help':
+ showHelp(moduleName)
+ sys.exit(0)
+ elif arg.startswith('-family:'):
+ config.family = arg[8:]
+ elif arg.startswith('-lang:'):
+ config.code = arg[6:]
+ elif arg.startswith('-putthrottle:'):
+ config.put_throttle = int(arg[13:])
+ elif arg.startswith('-pt:'):
+ config.put_throttle = int(arg[4:])
+ elif arg == '-log':
+ setLogfileStatus(True) #FIXME
+ elif arg.startswith('-log:'):
+ setLogfileStatus(True, arg[5:]) #FIXME
+ elif arg == '-nolog':
+ setLogfileStatus(False) #FIXME
+ elif arg == '-verbose' or arg == "-v":
+ pywikibot.output(u'Pywikipediabot %s' % (version.getversion()))
+ pywikibot.output(u'Python %s' % (sys.version))
+ verbose += 1 # FIXME
+ elif arg == '-daemonize':
+ import daemonize
+ daemonize.daemonize()
+ elif arg.startswith('-daemonize:'):
+ import daemonize
+ daemonize.daemonize(redirect_std = arg[11:])
+ else:
+ # the argument is not global. Let the specific bot script care
+ # about it.
+ nonGlobalArgs.append(arg)
+ return nonGlobalArgs
+
+
+def showHelp():
+ moduleName = calledModuleName()
+ globalHelp =u'''\
+Global arguments available for all bots:
+
+-dir:PATH Read the bot's configuration data from directory given by
+ PATH, instead of from the default directory.
+
+-lang:xx Set the language of the wiki you want to work on, overriding
+ the configuration in user-config.py. xx should be the
+ language code.
+
+-family:xyz Set the family of the wiki you want to work on, e.g.
+ wikipedia, wiktionary, wikitravel, ...
+ This will override the configuration in user-config.py.
+
+-daemonize:xyz Immediately returns control to the terminal and redirects
+ stdout and stderr to xyz (only use for bots that require
+ no input from stdin).
+
+-help Shows this help text.
+
+-log Enable the logfile. Logs will be stored in the logs
+ subdirectory.
+
+-log:xyz Enable the logfile, using xyz as the filename.
+
+-nolog Disable the logfile (if it is enabled by default).
+
+-putthrottle:n Set the minimum time (in seconds) the bot will wait between
+-pt:n saving pages.
+
+-verbose Have the bot provide additional output that may be useful in
+-v debugging.
+'''
+ try:
+ exec('import %s as module' % moduleName)
+ helpText = module.__doc__.decode('utf-8')
+ if hasattr(module, 'docuReplacements'):
+ for key, value in module.docuReplacements.iteritems():
+ helpText = helpText.replace(key, value.strip('\n\r'))
+ pywikibot.output(helpText)
+ except:
+ pywikibot.output(u'Sorry, no help available for %s' % moduleName)
+ logging.exception('showHelp:')
+ pywikibot.output(globalHelp)
+
+
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-09 22:38:39 UTC (rev 6135)
+++ branches/rewrite/pywikibot/page.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -12,6 +12,7 @@
import pywikibot
from pywikibot import deprecate_arg
from pywikibot import config
+import pywikibot.site
import pywikibot.textlib
import htmlentitydefs
@@ -1325,6 +1326,8 @@
titleWithSortKey = self.title(withSection=False)
return '[[%s]]' % titleWithSortKey
+ @deprecate_arg("startFrom", None)
+ @deprecate_arg("cacheResults", None)
def subcategories(self, recurse=False):
"""Iterate all subcategories of the current category.
@@ -1353,7 +1356,8 @@
for item in subcat.subcategories(recurse):
yield item
- def articles(self, recurse=False, startFrom=None):
+ @deprecate_arg("startFrom", None)
+ def articles(self, recurse=False):
"""
Yields all articles in the current category.
@@ -1367,7 +1371,7 @@
namespaces = [x for x in self.site().namespaces().keys()
if x>=0 and x!=14]
for member in self.site().categorymembers(self,
- namespaces=namespaces):
+ namespaces=namespaces):
yield member
if recurse:
if not isinstance(recurse, bool) and recurse:
Added: branches/rewrite/pywikibot/pagegenerators.py
===================================================================
--- branches/rewrite/pywikibot/pagegenerators.py (rev 0)
+++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -0,0 +1,965 @@
+# -*- coding: utf-8 -*-
+"""This module offers a wide variety of page generators. A page generator is an
+object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and
+that yields page objects on which other scripts can then work.
+
+In general, there is no need to run this script directly. It can, however,
+be run for testing purposes. It will then print the page titles to standard
+output.
+
+These parameters are supported to specify which pages titles to print:
+
+&params;
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+import pywikibot
+
+import itertools
+import Queue
+import re
+import sys
+import threading
+
+
+# ported from version 1 for backwards-compatibility
+# most of these functions just wrap a Site or Page method that returns
+# a generator
+
+parameterHelp = """\
+-cat Work on all pages which are in a specific category.
+ Argument can also be given as "-cat:categoryname" or
+ as "-cat:categoryname|fromtitle".
+
+-catr Like -cat, but also recursively includes pages in
+ subcategories, sub-subcategories etc. of the
+ given category.
+ Argument can also be given as "-catr:categoryname" or
+ as "-catr:categoryname|fromtitle".
+
+-subcats Work on all subcategories of a specific category.
+ Argument can also be given as "-subcats:categoryname" or
+ as "-subcats:categoryname|fromtitle".
+
+-subcatsr Like -subcats, but also includes sub-subcategories etc. of
+ the given category.
+ Argument can also be given as "-subcatsr:categoryname" or
+ as "-subcatsr:categoryname|fromtitle".
+
+-uncat Work on all pages which are not categorised.
+
+-uncatcat Work on all categories which are not categorised.
+
+-uncatfiles Work on all files which are not categorised.
+
+-file Read a list of pages to treat from the named text file.
+ Page titles in the file must be enclosed with [[brackets]].
+ Argument can also be given as "-file:filename".
+
+-filelinks Work on all pages that use a certain image/media file.
+ Argument can also be given as "-filelinks:filename".
+
+-yahoo Work on all pages that are found in a Yahoo search.
+ Depends on python module pYsearch. See yahoo_appid in
+ config.py for instructions.
+
+-search Work on all pages that are found in a MediaWiki search
+ across all namespaces.
+
+-google Work on all pages that are found in a Google search.
+ You need a Google Web API license key. Note that Google
+ doesn't give out license keys anymore. See google_key in
+ config.py for instructions.
+ Argument can also be given as "-google:searchstring".
+
+-interwiki Work on the given page and all equivalent pages in other
+ languages. This can, for example, be used to fight
+ multi-site spamming.
+ Attention: this will cause the bot to modify
+ pages on several wiki sites, this is not well tested,
+ so check your edits!
+
+-links Work on all pages that are linked from a certain page.
+ Argument can also be given as "-links:linkingpagetitle".
+
+-new Work on the 60 newest pages. If given as -new:x, will work
+ on the x newest pages.
+
+-imagelinks Work on all images that are linked from a certain page.
+                  Argument can also be given as "-imagelinks:linkingpagetitle".
+
+-newimages Work on the 100 newest images. If given as -newimages:x,
+ will work on the x newest images.
+
+-ref Work on all pages that link to a certain page.
+ Argument can also be given as "-ref:referredpagetitle".
+
+-start Specifies that the robot should go alphabetically through
+ all pages on the home wiki, starting at the named page.
+ Argument can also be given as "-start:pagetitle".
+
+ You can also include a namespace. For example,
+ "-start:Template:!" will make the bot work on all pages
+ in the template namespace.
+
+-prefixindex Work on pages commencing with a common prefix.
+
+-regex Obsolete, use -titleregex
+
+-titleregex Work on titles that match the given regular expression.
+
+-transcludes Work on all pages that use a certain template.
+ Argument can also be given as "-transcludes:Template:Title".
+
+-unusedfiles Work on all description pages of images/media files that are
+ not used anywhere.
+ Argument can be given as "-unusedfiles:n" where
+ n is the maximum number of articles to work on.
+
+-unwatched Work on all articles that are not watched by anyone.
+ Argument can be given as "-unwatched:n" where
+ n is the maximum number of articles to work on.
+
+-usercontribs Work on all articles that were edited by a certain user :
+ Example : -usercontribs:DumZiBoT
+
+-weblink Work on all articles that contain an external link to
+ a given URL; may be given as "-weblink:url"
+
+-withoutinterwiki Work on all pages that don't have interlanguage links.
+ Argument can be given as "-withoutinterwiki:n" where
+ n is some number (??).
+"""
+
+docuReplacements = {'&params;': parameterHelp}
+
+# if a bot uses GeneratorFactory, the module should include the line
+# docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
+# and include the marker &params; in the module's docstring
+
+
+class GeneratorFactory(object):
+    """Process command line arguments and return appropriate page generator."""
+
+ def setCategoryGen(self, arg, length, recurse = False):
+ if len(arg) == length:
+ categoryname = pywikibot.input(u'Please enter the category name:')
+ else:
+ categoryname = arg[length + 1:]
+
+ ind = categoryname.find('|')
+ if ind > 0:
+ startfrom = categoryname[ind + 1:]
+ categoryname = categoryname[:ind]
+ else:
+ startfrom = None
+
+ cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
+ return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse)
+
+ def setSubCategoriesGen(self, arg, length, recurse=False):
+ if len(arg) == length:
+ categoryname = pywikibot.input(u'Please enter the category name:')
+ else:
+ categoryname = arg[length + 1:]
+
+ ind = categoryname.find('|')
+ if ind > 0:
+ startfrom = categoryname[ind + 1:]
+ categoryname = categoryname[:ind]
+ else:
+ startfrom = None
+
+ cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
+ return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse)
+
+ def handleArg(self, arg):
+ gen = None
+ if arg.startswith('-filelinks'):
+ fileLinksPageTitle = arg[11:]
+ if not fileLinksPageTitle:
+ fileLinksPageTitle = pywikibot.input(
+ u'Links to which image page should be processed?')
+ if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6)
+ + ":"):
+ fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
+ fileLinksPageTitle)
+ else:
+ fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
+ 'Image:' +
+ fileLinksPageTitle)
+ gen = FileLinksGenerator(fileLinksPage)
+ elif arg.startswith('-unusedfiles'):
+ if len(arg) == 12:
+ gen = UnusedFilesGenerator()
+ else:
+ gen = UnusedFilesGenerator(number = int(arg[13:]))
+ elif arg.startswith('-unwatched'):
+ if len(arg) == 10:
+ gen = UnwatchedPagesPageGenerator()
+ else:
+ gen = UnwatchedPagesPageGenerator(number = int(arg[11:]))
+ elif arg.startswith('-usercontribs'):
+ gen = UserContributionsGenerator(arg[14:])
+ elif arg.startswith('-withoutinterwiki'):
+ if len(arg) == 17:
+ gen = WithoutInterwikiPageGenerator()
+ else:
+ gen = WithoutInterwikiPageGenerator(number = int(arg[18:]))
+ elif arg.startswith('-interwiki'):
+ title = arg[11:]
+ if not title:
+ title = pywikibot.input(u'Which page should be processed?')
+ page = pywikibot.Page(pywikibot.Site(), title)
+ gen = InterwikiPageGenerator(page)
+ elif arg.startswith('-file'):
+ textfilename = arg[6:]
+ if not textfilename:
+ textfilename = pywikibot.input(
+ u'Please enter the local file name:')
+ gen = TextfilePageGenerator(textfilename)
+ elif arg.startswith('-catr'):
+ gen = self.setCategoryGen(arg, 5, recurse = True)
+ elif arg.startswith('-cat'):
+ gen = self.setCategoryGen(arg, 4)
+ elif arg.startswith('-subcatsr'):
+ gen = self.setSubCategoriesGen(arg, 9, recurse = True)
+ elif arg.startswith('-subcats'):
+ gen = self.setSubCategoriesGen(arg, 8)
+ elif arg.startswith('-uncatfiles'):
+ gen = UnCategorizedImageGenerator()
+ elif arg.startswith('-uncatcat'):
+ gen = UnCategorizedCategoryGenerator()
+ elif arg.startswith('-uncat'):
+ gen = UnCategorizedPageGenerator()
+ elif arg.startswith('-ref'):
+ referredPageTitle = arg[5:]
+ if not referredPageTitle:
+ referredPageTitle = pywikibot.input(
+ u'Links to which page should be processed?')
+ referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle)
+ gen = ReferringPageGenerator(referredPage)
+ elif arg.startswith('-links'):
+ linkingPageTitle = arg[7:]
+ if not linkingPageTitle:
+ linkingPageTitle = pywikibot.input(
+ u'Links from which page should be processed?')
+ linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle)
+ gen = LinkedPageGenerator(linkingPage)
+ elif arg.startswith('-weblink'):
+ url = arg[9:]
+ if not url:
+ url = pywikibot.input(
+ u'Pages with which weblink should be processed?')
+ gen = LinksearchPageGenerator(url)
+ elif arg.startswith('-transcludes'):
+ transclusionPageTitle = arg[len('-transcludes:'):]
+ if not transclusionPageTitle:
+ transclusionPageTitle = pywikibot.input(
+ u'Pages that transclude which page should be processed?')
+ transclusionPage = pywikibot.Page(pywikibot.Site(),
+ 'Template:%s' % transclusionPageTitle)
+ gen = ReferringPageGenerator(transclusionPage,
+ onlyTemplateInclusion=True)
+ elif arg.startswith('-start'):
+ if arg.startswith('-startxml'):
+ pywikibot.output(u'-startxml : wrong parameter')
+ raise ValueError
+ firstPageTitle = arg[7:]
+ if not firstPageTitle:
+ firstPageTitle = pywikibot.input(
+ u'At which page do you want to start?')
+ namespace = pywikibot.Page(pywikibot.Site(),
+ firstPageTitle).namespace()
+ firstPageTitle = pywikibot.Page(pywikibot.link(firstPageTitle)
+ ).titleWithoutNamespace()
+ gen = AllpagesPageGenerator(firstPageTitle, namespace,
+ includeredirects=False)
+ elif arg.startswith('-prefixindex'):
+ prefix = arg[13:]
+ namespace = None
+ if not prefix:
+ prefix = pywikibot.input(
+ u'What page names are you looking for?')
+ gen = PrefixingPageGenerator(prefix=prefix)
+ elif arg.startswith('-newimages'):
+ limit = arg[11:] or pywikibot.input(
+ u'How many images do you want to load?')
+ gen = NewimagesPageGenerator(number=int(limit))
+ elif arg.startswith('-new'):
+ if len(arg) >=5:
+ gen = NewpagesPageGenerator(number=int(arg[5:]))
+ else:
+ gen = NewpagesPageGenerator(number=60)
+ elif arg.startswith('-imagelinks'):
+ imagelinkstitle = arg[len('-imagelinks:'):]
+ if not imagelinkstitle:
+ imagelinkstitle = pywikibot.input(
+ u'Images on which page should be processed?')
+ imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle))
+ gen = ImagesPageGenerator(imagelinksPage)
+ elif arg.startswith('-search'):
+ mediawikiQuery = arg[8:]
+ if not mediawikiQuery:
+ mediawikiQuery = pywikibot.input(
+ u'What do you want to search for?')
+ # In order to be useful, all namespaces are required
+ gen = SearchPageGenerator(mediawikiQuery, namespaces = [])
+ elif arg.startswith('-google'):
+ gen = GoogleSearchPageGenerator(arg[8:])
+ elif arg.startswith('-titleregex'):
+ if len(arg) == 6:
+ regex = pywikibot.input(
+ u'What page names are you looking for?')
+ else:
+ regex = arg[7:]
+ gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex)
+ elif arg.startswith('-yahoo'):
+ gen = YahooSearchPageGenerator(arg[7:])
+ else:
+ return None
+ # make sure all yielded pages are unique
+ gen = DuplicateFilterPageGenerator(gen)
+ return gen
+
+
+class ThreadedGenerator(threading.Thread):
+ """Look-ahead generator class.
+
+ Runs a generator in a separate thread and queues the results; can
+ be called like a regular generator.
+
+ Subclasses should override self.generator, _not_ self.run
+
+ Important: the generator thread will stop itself if the generator's
+ internal queue is exhausted; but, if the calling program does not use
+ all the generated values, it must call the generator's stop() method to
+ stop the background thread. Example usage:
+
+ >>> gen = ThreadedGenerator(target=foo)
+ >>> try:
+ ... for data in gen:
+ ... do_work(data)
+ ... finally:
+ ... gen.stop()
+
+ """ #NOT CURRENTLY USED: Intended for future development
+
+ def __init__(self, group=None, target=None, name="GeneratorThread",
+ args=(), kwargs=None, qsize=65536):
+        """Constructor. Takes same keyword arguments as threading.Thread.
+
+ target must be a generator function (or other callable that returns
+ an iterable object).
+
+ @param qsize: The size of the lookahead queue. The larger the qsize,
+ the more values will be computed in advance of use (which can eat
+ up memory and processor time).
+ @type qsize: int
+
+ """
+ if kwargs is None:
+ kwargs = {}
+ if target:
+ self.generator = target
+ if not hasattr(self, "generator"):
+ raise RuntimeError("No generator for ThreadedGenerator to run.")
+ self.args, self.kwargs = args, kwargs
+ threading.Thread.__init__(self, group=group, name=name)
+ self.queue = Queue.Queue(qsize)
+ self.finished = threading.Event()
+
+ def __iter__(self):
+ """Iterate results from the queue."""
+ if not self.isAlive() and not self.finished.isSet():
+ self.start()
+ # if there is an item in the queue, yield it, otherwise wait
+ while not self.finished.isSet():
+ try:
+ yield self.queue.get(True, 0.25)
+ except Queue.Empty:
+ pass
+ except KeyboardInterrupt:
+ self.stop()
+
+ def stop(self):
+ """Stop the background thread."""
+## if not self.finished.isSet():
+## pywikibot.output("DEBUG: signalling %s to stop." % self)
+ self.finished.set()
+
+ def run(self):
+        """Run the generator and store the results on the queue."""
+ self.__gen = self.generator(*self.args, **self.kwargs)
+ for result in self.__gen:
+ while True:
+ if self.finished.isSet():
+##                pywikibot.output("DEBUG: %s received stop signal." % self)
+ return
+ try:
+ self.queue.put_nowait(result)
+ except Queue.Full:
+ time.sleep(0.25)
+ continue
+ break
+ # wait for queue to be emptied, then kill the thread
+ while not self.finished.isSet() and not self.queue.empty():
+ time.sleep(0.25)
+ self.stop()
+##        pywikibot.output("DEBUG: %s stopped because generator exhausted." % self)
+
+
+def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True,
+ site=None):
+ """
+ Using the Allpages special page, retrieve all articles' titles, and yield
+ page objects.
+ If includeredirects is False, redirects are not included. If
+ includeredirects equals the string 'only', only redirects are added.
+ """
+ if site is None:
+ site = pywikibot.getSite()
+ if includeredirects:
+ if includeredirects == 'only':
+ filterredir = True
+ else:
+ filterredir = None
+ else:
+ filterredir = False
+ return site.allpages(start=start, namespace=namespace,
+ filterredir=filterredir)
+
+
+def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True,
+ site=None):
+ if site is None:
+ site = pywikibot.Site()
+ page = pywikibot.Page(site, prefix)
+ if namespace is None:
+ namespace = page.namespace()
+ title = page.titleWithoutNamespace()
+ if includeredirects:
+ if includeredirects == 'only':
+ filterredir = True
+ else:
+ filterredir = None
+ else:
+ filterredir = False
+ return site.allpages(prefix=title, namespace=namespace,
+ filterredir=filterredir)
+
+
+def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False,
+ site=None):
+ # API does not (yet) have a newpages function, so this tries to duplicate
+ # it by filtering the recentchanges output
+ # defaults to namespace 0 because that's how Special:Newpages defaults
+ if site is None:
+ site = pywikibot.Site()
+ return site.recentchanges(limit=number, showredirects=get_redirect,
+ changetype="new", namespaces=0)
+
+
+def FileLinksGenerator(referredImagePage):
+ return referredImagePage.usingPages()
+
+
+def ImagesPageGenerator(pageWithImages):
+ return pageWithImages.imagelinks()
+
+
+def InterwikiPageGenerator(page):
+    """Iterator over all interwiki (non-language) links on a page."""
+ for link in page.interwiki():
+ yield pywikibot.Page(link)
+
+
+def LanguageLinksPageGenerator(page):
+    """Iterator over all interwiki language links on a page."""
+ for link in page.langlinks():
+ yield pywikibot.Page(link)
+
+
+def ReferringPageGenerator(referredPage, followRedirects=False,
+ withTemplateInclusion=True,
+ onlyTemplateInclusion=False):
+ '''Yields all pages referring to a specific page.'''
+ return referredPage.getReferences(
+ follow_redirects=followRedirects,
+ withTemplateInclusion=withTemplateInclusion,
+ onlyTemplateInclusion=onlyTemplateInclusion)
+
+
+def CategorizedPageGenerator(category, recurse=False, start=None):
+ '''Yield all pages in a specific category.
+
+ If recurse is True, pages in subcategories are included as well; if
+ recurse is an int, only subcategories to that depth will be included
+ (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
+ not go any further).
+ If start is a string value, only pages whose sortkey comes after start
+ alphabetically are included.
+
+ ''' # TODO: page generator could be modified to use cmstartsortkey ...
+ for a in category.articles(recurse=recurse):
+ if start is None or a.title(withNamespace=False) >= start:
+ yield a
+
+
+def SubCategoriesPageGenerator(category, recurse=False, start=None):
+ '''Yields all subcategories in a specific category.
+
+ If recurse is True, pages in subcategories are included as well; if
+ recurse is an int, only subcategories to that depth will be included
+ (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
+ not go any further).
+ If start is a string value, only categories whose sortkey comes after
+ start alphabetically are included.
+
+ ''' # TODO: page generator could be modified to use cmstartsortkey ...
+ for s in category.subcategories(recurse=recurse):
+ if start is None or s.title(withNamespace=False) >= start:
+ yield s
+
+
+def LinkedPageGenerator(linkingPage):
+ """Yields all pages linked from a specific page."""
+ return linkingPage.linkedPages()
+
+
+def TextfilePageGenerator(filename=None, site=None):
+ """Iterate pages from a list in a text file.
+
+ The file must contain page links between double-square-brackets. The
+ generator will yield each corresponding Page object.
+
+ @param filename: the name of the file that should be read. If no name is
+ given, the generator prompts the user.
+ @param site: the default Site for which Page objects should be created
+
+ """
+ if filename is None:
+ filename = pywikibot.input(u'Please enter the filename:')
+ if site is None:
+ site = pywikibot.Site()
+ f = codecs.open(filename, 'r', config.textfile_encoding)
+ for linkmatch in Rlink.finditer(f.read()):
+ # If the link is in interwiki format, the Page object may reside
+ # on a different Site than the default.
+ # This makes it possible to work on different wikis using a single
+ # text file, but also could be dangerous because you might
+ # inadvertently change pages on another wiki!
+ yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site))
+ f.close()
+
+
+def PagesFromTitlesGenerator(iterable, site=None):
+    """Generate pages from the titles (unicode strings) yielded by iterable."""
+ if site is None:
+ site = pywikibot.Site()
+ for title in iterable:
+ if not isinstance(title, basestring):
+ break
+ yield pywikibot.Page(pywikibot.Link(title, site))
+
+
+def UserContributionsGenerator(username, number=250, namespaces=None,
+ site=None):
+ """Yields number unique pages edited by user:username
+ namespaces : list of namespace numbers to fetch contribs from
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ return site.usercontribs(user=username, limit=number, namespaces=namespaces)
+
+
+def NamespaceFilterPageGenerator(generator, namespaces, site=None):
+ """
+ Wraps around another generator. Yields only those pages that are in one
+ of the given namespaces.
+
+ The namespace list can contain both integers (namespace numbers) and
+ strings/unicode strings (namespace names).
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ # convert namespace names to namespace numbers
+ for i in xrange(len(namespaces)):
+ ns = namespaces[i]
+ if isinstance(ns, basestring):
+ index = site.getNamespaceIndex(ns)
+ if index is None:
+ raise ValueError(u'Unknown namespace: %s' % ns)
+ namespaces[i] = index
+ for page in generator:
+ if page.namespace() in namespaces:
+ yield page
+
+
+def RedirectFilterPageGenerator(generator):
+    """Yields pages from another generator that are not redirects."""
+ for page in generator:
+ if not page.isRedirectPage():
+ yield page
+
+
+def DuplicateFilterPageGenerator(generator):
+    """Yield all unique pages from another generator, omitting duplicates."""
+ seenPages = {}
+ for page in generator:
+ if page not in seenPages:
+ seenPages[page] = None
+ yield page
+
+
+def RegexFilterPageGenerator(generator, regex):
+    """Yield pages from another generator whose titles match regex."""
+ reg = re.compile(regex, re.I)
+ for page in generator:
+ if reg.match(page.titleWithoutNamespace()):
+ yield page
+
+
+def CombinedPageGenerator(generators):
+ return itertools.chain(*generators)
+
+
+def CategoryGenerator(generator):
+ """Yield pages from another generator as Category objects.
+
+ Makes sense only if it is ascertained that only categories are being
+ retrieved.
+
+ """
+ for page in generator:
+ yield pywikibot.Category(page)
+
+
+def PageWithTalkPageGenerator(generator):
+ """
+ Wraps around another generator. Yields the same pages, but for non-talk
+ pages, it also includes associated talk pages.
+ This generator does not check if the talk page in fact exists.
+ """
+ for page in generator:
+ yield page
+ if not page.isTalkPage():
+ yield page.toggleTalkPage()
+
+
+def PreloadingGenerator(self, generator, pageNumber=60, lookahead=10):
+    """Yield preloaded pages taken from another generator."""
+
+ # pages may be on more than one site, for example if an interwiki
+ # generator is used, so use a separate preloader for each site
+ sites = {}
+ # build a list of pages for each site found in the iterator
+ for page in generator:
+ sites.setdefault(page.site(), []).append(page)
+ return itertools.chain(site.preloadpages(sites[site], pageNumber)
+ for site in sites)
+
+
+#TODO below
+
+def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.unusedfiles(number=number, repeat=repeat,
+ extension=extension):
+ yield pywikibot.ImagePage(page.site(), page.title())
+
+def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.withoutinterwiki(number=number, repeat=repeat):
+ yield page
+
+def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedcategories(number=number, repeat=repeat):
+ yield page
+
+def UnCategorizedImageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedimages(number=number, repeat=repeat):
+ yield page
+
+def NewimagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.newimages(number, repeat=repeat):
+ yield page[0]
+
+def UnCategorizedPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedpages(number=number, repeat=repeat):
+ yield page
+
+def LonelyPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.lonelypages(number=number, repeat=repeat):
+ yield page
+
+def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.unwatchedpages(number=number, repeat=repeat):
+ yield page
+
+def AncientPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.ancientpages(number=number, repeat=repeat):
+ yield page[0]
+
+def DeadendPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.deadendpages(number=number, repeat=repeat):
+ yield page
+
+def LongPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.longpages(number=number, repeat=repeat):
+ yield page[0]
+
+def ShortPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.shortpages(number=number, repeat=repeat):
+ yield page[0]
+
+def LinksearchPageGenerator(link, step=500, site=None):
+ """Yields all pages that include a specified link, according to
+ [[Special:Linksearch]].
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.linksearch(link, limit=step):
+ yield page
+
+def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
+ """
+ Provides a list of results using the internal MediaWiki search engine
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.search(query, number=number, namespaces = namespaces):
+ yield page[0]
+
+class YahooSearchPageGenerator:
+ '''
+ To use this generator, install pYsearch
+ '''
+ def __init__(self, query = None, count = 100, site = None): # values larger than 100
fail
+ self.query = query or pywikibot.input(u'Please enter the search query:')
+ self.count = count
+ if site is None:
+ site = pywikibot.Site()
+ self.site = site
+
+ def queryYahoo(self, query):
+ from yahoo.search.web import WebSearch
+ srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
+
+ dom = srch.get_results()
+ results = srch.parse_results(dom)
+ for res in results:
+ url = res.Url
+ yield url
+
+ def __iter__(self):
+ # restrict query to local site
+ localQuery = '%s site:%s' % (self.query, self.site.hostname())
+ base = 'http://%s%s' % (self.site.hostname(),
self.site.nice_get_address(''))
+ for url in self.queryYahoo(localQuery):
+ if url[:len(base)] == base:
+ title = url[len(base):]
+ page = pywikibot.Page(self.site, title)
+ yield page
+
+class GoogleSearchPageGenerator:
+ '''
+ To use this generator, you must install the pyGoogle module from
+
http://pygoogle.sf.net/ and get a Google Web API license key from
+
http://www.google.com/apis/index.html . The google_key must be set to your
+ license key in your configuration.
+ '''
+ def __init__(self, query = None, site = None):
+ self.query = query or pywikibot.input(u'Please enter the search query:')
+ if site is None:
+ site = pywikibot.Site()
+ self.site = site
+
+ #########
+ # partially commented out because it is probably not in compliance with Google's
"Terms of
+ # service" (see 5.3,
http://www.google.com/accounts/TOS?loc=US)
+ def queryGoogle(self, query):
+ #if config.google_key:
+ if True:
+ #try:
+ for url in self.queryViaSoapApi(query):
+ yield url
+ return
+ #except ImportError:
+ #pass
+ # No google license key, or pygoogle not installed. Do it the ugly way.
+ #for url in self.queryViaWeb(query):
+ # yield url
+
+ def queryViaSoapApi(self, query):
+ import google
+ google.LICENSE_KEY = config.google_key
+ offset = 0
+ estimatedTotalResultsCount = None
+ while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount:
+ while (True):
+ # Google often yields 502 errors.
+ try:
+ pywikibot.output(u'Querying Google, offset %i' % offset)
+ data = google.doGoogleSearch(query, start = offset, filter = False)
+ break
+ except KeyboardInterrupt:
+ raise
+ except:
+ # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
+ # can happen here, depending on the module used. It's not easy
+ # to catch this properly because pygoogle decides which one of
+ # the soap modules to use.
+ pywikibot.output(u"An error occured. Retrying in 10
seconds...")
+ time.sleep(10)
+ continue
+
+ for result in data.results:
+ #print 'DBG: ', result.URL
+ yield result.URL
+ # give an estimate of pages to work on, but only once.
+ if not estimatedTotalResultsCount:
+ pywikibot.output(u'Estimated total result count: %i pages.' %
data.meta.estimatedTotalResultsCount)
+ estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
+ #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
+ offset += 10
+
+ #########
+ # commented out because it is probably not in compliance with Google's
"Terms of
+ # service" (see 5.3,
http://www.google.com/accounts/TOS?loc=US)
+
+ #def queryViaWeb(self, query):
+ #"""
+ #Google has stopped giving out API license keys, and sooner or later
+ #they will probably shut down the service.
+ #This is a quick and ugly solution: we just grab the search results from
+ #the normal web interface.
+ #"""
+ #linkR = re.compile(r'<a href="([^>"]+?)"
class=l>', re.IGNORECASE)
+ #offset = 0
+
+ #while True:
+ #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
+ #address =
"http://www.google.com/search?q=%s&num=100&hl=en&start=%d" %
(urllib.quote_plus(query), offset)
+ ## we fake being Firefox because Google blocks unknown browsers
+ #request = urllib2.Request(address, None, {'User-Agent':
'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1
Firefox/1.5'})
+ #urlfile = urllib2.urlopen(request)
+ #page = urlfile.read()
+ #urlfile.close()
+ #for url in linkR.findall(page):
+ #yield url
+ #if "<div id=nn>" in page: # Is there a "Next" link
for next page of results?
+ #offset += 100 # Yes, go to next page of results.
+ #else:
+ #return
+ #########
+
+ def __iter__(self):
+ # restrict query to local site
+ localQuery = '%s site:%s' % (self.query, self.site.hostname())
+ base = 'http://%s%s' % (self.site.hostname(),
self.site.nice_get_address(''))
+ for url in self.queryGoogle(localQuery):
+ if url[:len(base)] == base:
+ title = url[len(base):]
+ page = pywikibot.Page(self.site, title)
+ # Google contains links in the format
http://de.wikipedia.org/wiki/en:Foobar
+ if page.site() == self.site:
+ yield page
+
+def MySQLPageGenerator(query, site = None):
+ import MySQLdb as mysqldb
+ if site is None:
+ site = pywikibot.Site()
+ conn = mysqldb.connect(config.db_hostname, db = site.dbName(),
+ user = config.db_username,
+ passwd = config.db_password)
+ cursor = conn.cursor()
+ pywikibot.output(u'Executing query:\n%s' % query)
+ query = query.encode(site.encoding())
+ cursor.execute(query)
+ while True:
+ try:
+ namespaceNumber, pageName = cursor.fetchone()
+ print namespaceNumber, pageName
+ except TypeError:
+ # Limit reached or no more results
+ break
+ #print pageName
+ if pageName:
+ namespace = site.namespace(namespaceNumber)
+ pageName = unicode(pageName, site.encoding())
+ if namespace:
+ pageTitle = '%s:%s' % (namespace, pageName)
+ else:
+ pageTitle = pageName
+ page = pywikibot.Page(site, pageTitle)
+ yield page
+
+def YearPageGenerator(start = 1, end = 2050, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ pywikibot.output(u"Starting with year %i" % start)
+ for i in xrange(start, end + 1):
+ if i % 100 == 0:
+ pywikibot.output(u'Preparing %i...' % i)
+ # There is no year 0
+ if i != 0:
+ current_year = date.formatYear(site.lang, i )
+ yield pywikibot.Page(site, current_year)
+
+def DayPageGenerator(startMonth = 1, endMonth = 12, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ fd = date.FormatDate(site)
+ firstPage = pywikibot.Page(site, fd(startMonth, 1))
+ pywikibot.output(u"Starting with %s" % firstPage.aslink())
+ for month in xrange(startMonth, endMonth+1):
+ for day in xrange(1, date.getNumberOfDaysInMonth(month)+1):
+ yield pywikibot.Page(site, fd(month, day))
+
+
+if __name__ == "__main__":
+ try:
+ gen = None
+ genFactory = GeneratorFactory()
+ for arg in pywikibot.handleArgs():
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ if gen:
+ for page in gen:
+ pywikibot.output(page.title(), toStdout = True)
+ else:
+ pywikibot.showHelp()
+ finally:
+ pywikibot.stopme()
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-09 22:38:39 UTC (rev 6135)
+++ branches/rewrite/pywikibot/site.py 2008-12-10 18:56:59 UTC (rev 6136)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
"""
Objects representing MediaWiki sites (wikis) and families (groups of wikis
on the same topic in different languages).