Revision: 8479
Author: russblau
Date: 2010-09-02 16:32:23 +0000 (Thu, 02 Sep 2010)
Log Message:
-----------
Support all API options for generator=categorymembers
Modified Paths:
--------------
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2010-09-02 16:31:28 UTC (rev 8478)
+++ branches/rewrite/pywikibot/site.py 2010-09-02 16:32:23 UTC (rev 8479)
@@ -1396,7 +1396,9 @@
step=step, total=total, g_content=content)
return tlgen
- def categorymembers(self, category, namespaces=None, step=None, total=None,
+ def categorymembers(self, category, namespaces=None, sortby="",
+ reverse=False, starttime=None, endtime=None,
+ startsort=None, endsort=None, step=None, total=None,
content=False):
"""Iterate members of specified category.
@@ -1407,6 +1409,25 @@
however, that the iterated values are always Page objects, even
if in the Category or Image namespace.
@type namespaces: list of ints
+ @param sortby: determines the order in which results are generated;
+ valid values are "sortkey" (default, results ordered by category
+ sort key) or "timestamp" (results ordered by time page was
+ added to the category)
+ @type sortby: str
+ @param reverse: if True, generate results in reverse order
+ (default False)
+ @param starttime: if provided, only generate pages added after this
+ time; not valid unless sortby="timestamp"
+ @type starttime: pywikibot.Timestamp
+ @param endtime: if provided, only generate pages added before this
+ time; not valid unless sortby="timestamp"
+ @type endtime: pywikibot.Timestamp
+ @param startsort: if provided, only generate pages >= this title
+ lexically; not valid if sortby="timestamp"
+ @type startsort: str
+ @param endsort: if provided, only generate pages <= this title
+ lexically; not valid if sortby="timestamp"
+ @type endsort: str
@param content: if True, load the current content of each iterated page
(default False)
@@ -1416,20 +1437,51 @@
u"categorymembers: non-Category page '%s' specified"
% category.title())
cmtitle = category.title(withSection=False).encode(self.encoding())
- cmgen = self._generator(api.PageGenerator,
- type_arg="categorymembers",
- gcmtitle=cmtitle,
- gcmprop="ids|title|sortkey",
-# namespaces=namespaces, # see note below
- step=step,
- total=total,
- g_content=content)
-# workaround for https://bugzilla.wikimedia.org/show_bug.cgi?id=19640:
- if namespaces:
- if not isinstance(namespaces, list):
- namespaces = [namespaces]
- cmgen = pagegenerators.NamespaceFilterPageGenerator(
- cmgen, namespaces, site=self)
+ cmargs = dict(type_arg="categorymembers",
+ gcmtitle=cmtitle,
+ gcmprop="ids|title|sortkey")
+ if sortby in ["sortkey", "timestamp"]:
+ cmargs["gcmsort"] = sortby
+ elif sortby:
+ raise ValueError(
+ "categorymembers: invalid sortby value '%(sortby)s'"
+ % locals())
+ if starttime and endtime and starttime > endtime:
+ raise ValueError(
+ "categorymembers: starttime must be before endtime")
+ if startsort and endsort and startsort > endsort:
+ raise ValueError(
+ "categorymembers: startsort must be less than endsort")
+ if reverse:
+ cmargs["gcmdir"] = "desc"
+ # API wants start/end params in opposite order if using descending
+ # sort; we take care of this reversal for the user
+ (starttime, endtime) = (endtime, starttime)
+ (startsort, endsort) = (endsort, startsort)
+ if starttime and sortby == "timestamp":
+ cmargs["gcmstart"] = str(starttime)
+ elif starttime:
+ raise ValueError(
+ "categorymembers: invalid combination of 'sortby' and 'starttime'")
+ if endtime and sortby == "timestamp":
+ cmargs["gcmend"] = str(endtime)
+ elif endtime:
+ raise ValueError(
+ "categorymembers: invalid combination of 'sortby' and 'endtime'")
+ if startsort and sortby != "timestamp":
+ cmargs["gcmstartsortkey"] = startsort
+ elif startsort:
+ raise ValueError(
+ "categorymembers: invalid combination of 'sortby' and 'startsort'")
+ if endsort and sortby != "timestamp":
+ cmargs["gcmendsortkey"] = endsort
+ elif endsort:
+ raise ValueError(
+ "categorymembers: invalid combination of 'sortby' and 'endsort'")
+
+ cmgen = self._generator(api.PageGenerator, namespaces=namespaces,
+ step=step, total=total, g_content=content,
+ **cmargs)
return cmgen
def loadrevisions(self, page=None, getText=False, revids=None,
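
A minimal usage sketch of the new options (the site and category names here
are illustrative, not part of the change):

    import pywikibot

    site = pywikibot.Site('en', 'wikipedia')
    cat = pywikibot.Category(site, 'Category:Living people')

    # Newest members first: order by the time each page was added to the
    # category, then reverse the sort direction.
    for page in site.categorymembers(cat, sortby="timestamp",
                                     reverse=True, total=10):
        pywikibot.output(page.title())

Note that when reverse=True the method swaps starttime/endtime (and
startsort/endsort) before passing them to the API, so callers can always
supply the chronologically or lexically earlier bound as the start value.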
Revision: 8478
Author: russblau
Date: 2010-09-02 16:31:28 +0000 (Thu, 02 Sep 2010)
Log Message:
-----------
Some API queries can have no results but still have a query-continue key, so iteration should continue even with no results
Modified Paths:
--------------
branches/rewrite/pywikibot/data/api.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2010-09-02 15:04:47 UTC (rev 8477)
+++ branches/rewrite/pywikibot/data/api.py 2010-09-02 16:31:28 UTC (rev 8478)
@@ -545,40 +545,40 @@
% self.__class__.__name__,
_logger)
return
- if not ("query" in self.data
- and self.resultkey in self.data["query"]):
+ if "query" not in self.data:
pywikibot.debug(
-u"%s: stopped iteration because 'query' and '%s' not found in api response."
+u"%s: stopped iteration because 'query' not found in api response."
% (self.__class__.__name__, self.resultkey),
_logger)
pywikibot.debug(unicode(self.data), _logger)
return
- resultdata = self.data["query"][self.resultkey]
- if isinstance(resultdata, dict):
- pywikibot.debug(u"%s received %s; limit=%s"
- % (self.__class__.__name__,
- resultdata.keys(),
- self.limit),
- _logger)
- resultdata = [resultdata[k] for k in sorted(resultdata.keys())]
- else:
- pywikibot.debug(u"%s received %s; limit=%s"
- % (self.__class__.__name__,
- resultdata,
- self.limit),
- _logger)
- if "normalized" in self.data["query"]:
- self.normalized = dict((item['to'], item['from'])
- for item in
- self.data["query"]["normalized"])
- else:
- self.normalized = {}
- for item in resultdata:
- yield self.result(item)
- count += 1
- if self.limit is not None and self.limit > 0 \
- and count >= self.limit:
- return
+ if self.resultkey in self.data["query"]:
+ resultdata = self.data["query"][self.resultkey]
+ if isinstance(resultdata, dict):
+ pywikibot.debug(u"%s received %s; limit=%s"
+ % (self.__class__.__name__,
+ resultdata.keys(),
+ self.limit),
+ _logger)
+ resultdata = [resultdata[k] for k in sorted(resultdata.keys())]
+ else:
+ pywikibot.debug(u"%s received %s; limit=%s"
+ % (self.__class__.__name__,
+ resultdata,
+ self.limit),
+ _logger)
+ if "normalized" in self.data["query"]:
+ self.normalized = dict((item['to'], item['from'])
+ for item in
+ self.data["query"]["normalized"])
+ else:
+ self.normalized = {}
+ for item in resultdata:
+ yield self.result(item)
+ count += 1
+ if self.limit is not None and self.limit > 0 \
+ and count >= self.limit:
+ return
if not "query-continue" in self.data:
return
if not self.continuekey in self.data["query-continue"]:
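
Reduced to a sketch, the control flow after this change looks roughly like
the following (fetch, params, resultkey, and continuekey are hypothetical
stand-ins for the real request machinery):

    def iter_results(fetch, params, resultkey, continuekey):
        # Yield items across continuations; an empty or missing result
        # block no longer stops iteration as long as the API still
        # supplies a query-continue value.
        while True:
            data = fetch(params)
            query = data.get("query", {})
            for item in query.get(resultkey, []):
                yield item
            cont = data.get("query-continue", {}).get(continuekey)
            if cont is None:
                return          # no continuation: iteration is done
            params.update(cont)  # continue even if this page was empty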
Revision: 8475
Author: russblau
Date: 2010-09-02 13:56:42 +0000 (Thu, 02 Sep 2010)
Log Message:
-----------
Bugfix: self.api_number must be an integer or None, not a string
Modified Paths:
--------------
branches/rewrite/scripts/redirect.py
Modified: branches/rewrite/scripts/redirect.py
===================================================================
--- branches/rewrite/scripts/redirect.py 2010-09-01 09:33:55 UTC (rev 8474)
+++ branches/rewrite/scripts/redirect.py 2010-09-02 13:56:42 UTC (rev 8475)
@@ -197,9 +197,9 @@
self.api_start = start
self.api_until = until
self.api_number = number
- if self.api_number is None:
- self.api_number = 'max'
+# note: rewrite branch does not yet support XML dumps, so this is commented out
+# until that support is added
## def get_redirects_from_dump(self, alsoGetPageTitles=False):
## '''
## Load a local XML dump file, look at all pages which have the
@@ -368,7 +368,7 @@
count += 1
if count >= self.api_number:
break
-
+# TODO: add XML dump support
## elif self.xmlFilename == None:
## # retrieve information from the live wiki's maintenance page
## # broken redirect maintenance page's URL
@@ -410,7 +410,7 @@
count += 1
if count >= self.api_number:
break
-
+# TODO: API cannot yet deliver contents of "special" pages
## elif self.xmlFilename == None:
## # retrieve information from the live wiki's maintenance page
## # double redirect maintenance page's URL
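
For context on the bugfix above: under Python 2 (which this code targeted),
an int always compares as less than a str, so a limit check against the
string 'max' never fires; the behavior relies on Python 2's arbitrary
cross-type ordering and would raise TypeError on Python 3. A quick
illustration:

    # Python 2 semantics: any int sorts before any str.
    count = 10 ** 9
    print count >= 'max'        # False, for every possible count

    # With api_number = None, the caller can make the check explicit:
    api_number = None
    if api_number is not None and count >= api_number:
        pass  # only reachable when a real numeric limit was given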
Revision: 8474
Author: purodha
Date: 2010-09-01 09:33:55 +0000 (Wed, 01 Sep 2010)
Log Message:
-----------
A set of cosmetic changes to config.py - mostly better English in comments. The
section collecting family names from the file system, which IMHO does not
belong among the user-configurable settings, has been moved after them.
Modified Paths:
--------------
trunk/pywikipedia/config.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2010-08-30 16:33:29 UTC (rev 8473)
+++ trunk/pywikipedia/config.py 2010-09-01 09:33:55 UTC (rev 8474)
@@ -50,7 +50,7 @@
# exception CaptchaError being thrown if a captcha is encountered.
solve_captcha = True
-# Some sites will require password identication to access the HTML pages at
+# Some sites will require password authentication to access the HTML pages at
# the site. If you have any such site, add lines to your user-config.py of
# the following form:
#
@@ -64,9 +64,7 @@
# 2. You must use the hostname of the site, not its family/language pair
authenticate = {}
-#
-# Security Connection for Wikimedia Projects
-#
+# Secure Connection to all Wikimedia Projects
SSL_connection = False
# password_file = ".passwd"
@@ -79,22 +77,8 @@
use_api_login = True
# Enable data recieve from all avalible API.
-
use_api = True
-# Get the names of all known families, and initialize
-# with empty dictionaries
-import wikipediatools as _wt
-_base_dir = _wt.get_base_dir()
-_RfamilyFile = re.compile('(?P<name>.+)_family.py$')
-for _filename in os.listdir(os.path.join(_base_dir, 'families')):
- _m = _RfamilyFile.match(_filename)
- if _m:
- familyName = _m.group('name')
- usernames[familyName] = {}
- sysopnames[familyName] = {}
- disambiguation_comment[familyName] = {}
-
# Display a warning message if your edits appear in recent changes page
notify_unflagged_bot = True
@@ -334,9 +318,8 @@
# You can signup an API key from http://code.google.com/apis/ajaxsearch/signup.html.
google_key = ''
-
-# using Google AJAX Search API, it require the refer website, this variable save the refer web address
-# when you sign up the Key.
+# The Google AJAX Search API requires a referer website; this variable stores
+# the referer web address you supplied when you signed up for the key.
google_api_refer = ''
# Some scripts allow using the Yahoo! Search Web Services. To use this feature,
@@ -355,9 +338,9 @@
'reviewer': None, # If so, under what reviewer name?
}
-# for all connection proxy handle
+# for all connections: proxy handle
# to use it, proxy['host'] have to support HTTP and include port number (e.g. localhost:8080)
-# if proxy server neen authentication, set ('ID', 'PASSWORD') to proxy['auth'].
+# if the proxy server needs authentication, set proxy['auth'] to ('ID', 'PASSWORD').
proxy = {
'host': None,
'auth': None,
@@ -371,17 +354,16 @@
copyright_msn = False
# Perform a deep check, loading URLs to search if 'Wikipedia' is present.
-# This may be useful to improve number of correct results. If you haven't
-# a fast connection, you might want to keep they disabled.
+# This may be useful to increase the number of correct results. If you don't
+# have a fast connection, you might want to keep these checks disabled.
copyright_check_in_source_google = False
copyright_check_in_source_yahoo = False
copyright_check_in_source_msn = False
-# Web pages may content a Wikipedia text without 'Wikipedia' word but with
-# typical '[edit]' tag result of copy & paste procedure. You can want no
-# report for this kind of URLs, even if they are copyright violation.
-# However, when enabled these URLs are logged in a file.
-
+# Web pages may contain Wikipedia text without the word 'Wikipedia' but with the
+# typical '[edit]' tag left over from a copy & paste. You may not want such
+# URLs reported, even if they are copyright violations.
+# However, when this check is enabled, these URLs are logged to a file.
copyright_check_in_source_section_names = False
# Limit number of queries for page.
@@ -401,7 +383,6 @@
# 1 = Disable search engine
# 2 = Sleep (default)
# 3 = Stop
-
copyright_exceeded_in_queries = 2
copyright_exceeded_in_queries_sleep_hours = 6
@@ -411,12 +392,11 @@
# Append length of URL to script result
copyright_show_length = True
-# By default the script try to identify and skip text that contents a wide
+# By default the script tries to identify and skip text that contains a large
# comma separated list or only numbers. But sometimes that might be the
# only part unmodified of a slightly edited and not otherwise reported
-# copyright violation. You can disable this feature to try to increase
+# copyright violation. You can disable this feature to try to increase the
# number of results.
-
copyright_economize_query = True
############## HTTP SETTINGS ##############
@@ -431,7 +411,6 @@
############## FURTHER SETTINGS ##############
-
# The bot can make some additional changes to each page it edits, e.g. fix
# whitespace or positioning of interwiki and category links.
@@ -474,6 +453,19 @@
# End of configuration section
# ============================
+
+# Get the names of all known families, and initialize
+# with empty dictionaries
+import wikipediatools as _wt
+_base_dir = _wt.get_base_dir()
+_RfamilyFile = re.compile('(?P<name>.+)_family.py$')
+for _filename in os.listdir(os.path.join(_base_dir, 'families')):
+ _m = _RfamilyFile.match(_filename)
+ if _m:
+ familyName = _m.group('name')
+ usernames[familyName] = {}
+ sysopnames[familyName] = {}
+ disambiguation_comment[familyName] = {}
# System-level and User-level changes.
# Store current variables and their types.
_glv = {}
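
For context on the relocated block: the dictionaries it initializes are the
ones users later fill in from user-config.py, so it only needs to run after
the configurable defaults are defined. A typical (purely illustrative)
user-config.py entry:

    # user-config.py -- per-family, per-language credentials
    usernames['wikipedia']['en'] = u'ExampleBot'
    sysopnames['wikipedia']['en'] = u'ExampleSysop'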