Revision: 4405
Author: leogregianin
Date: 2007-10-03 16:48:25 +0000 (Wed, 03 Oct 2007)
Log Message:
-----------
Patch 1800925: wikipedia.search and search page generator, by John Vandenberg (NOTE: I could not get this to work)
Modified Paths:
--------------
trunk/pywikipedia/family.py
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2007-10-03 14:55:17 UTC (rev 4404)
+++ trunk/pywikipedia/family.py 2007-10-03 16:48:25 UTC (rev 4405)
@@ -2613,6 +2613,30 @@
def api_address(self, code):
    """Return the API path for 'code' with a trailing '?' appended."""
    api_path = self.apipath(code)
    return '%s?' % api_path
def search_address(self, code, query, limit=100, namespaces = None):
    """
    Constructs a URL for searching using Special:Search.

    code       - language code, passed to path() and
                 special_namespace_url()
    query      - search text; it is inserted into the URL verbatim, so
                 it must already be safe to embed in a query string
    limit      - integer sent as the 'limit' parameter
    namespaces - None, an int, or a list of ints:
                 * None: no namespace parameter is added, leaving the
                   choice to the wiki (presumably namespace 0 - confirm)
                 * int: only that namespace is selected
                 * list: every non-negative namespace in the list is
                   selected; an empty list selects every non-negative
                   key of self.namespaces
                 Any other type is ignored.

    Returns the URL as a string.
    """
    namespace_params = ''
    if isinstance(namespaces, int):
        namespace_params = '&ns%d=1' % namespaces
    elif isinstance(namespaces, list):
        if not namespaces:
            # add all namespaces
            namespaces = self.namespaces.keys()
        # Namespace 0 must be kept; the previous 'i > 0' test silently
        # dropped it. Negative keys are skipped.
        namespace_params = ''.join(['&ns%d=1' % ns
                                    for ns in namespaces if ns >= 0])

    return "%s?title=%s:Search&search=%s&limit=%d%s" % (
        self.path(code),
        self.special_namespace_url(code),
        query,
        limit,
        namespace_params)
+
def allpages_address(self, code, start, namespace = 0):
if self.version(code)=="1.2":
return '%s?title=%s:Allpages&printable=yes&from=%s' % (
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2007-10-03 14:55:17 UTC (rev 4404)
+++ trunk/pywikipedia/pagegenerators.py 2007-10-03 16:48:25 UTC (rev 4405)
@@ -36,6 +36,9 @@
Depends on python module pYsearch. See yahoo_appid in
config.py for instructions.
+-search Work on all pages that are found in a MediaWiki search
+ across all namespaces.
+
-google Work on all pages that are found in a Google search.
You need a Google Web API license key. Note that Google
doesn't give out license keys anymore. See google_key in
@@ -290,6 +293,15 @@
yield wikipedia.Page(site, pagenameofthelink)
offset += step
def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
    """
    Generator yielding the first item of every result returned by
    site.search(), i.e. the results of the internal MediaWiki search engine.

    When no site is given, wikipedia.getSite() supplies one. 'number' and
    'namespaces' are forwarded to site.search() unchanged.
    """
    target = site
    if target is None:
        target = wikipedia.getSite()
    results = target.search(query, number=number, namespaces=namespaces)
    for result in results:
        yield result[0]
+
class YahooSearchPageGenerator:
'''
To use this generator, install pYsearch
@@ -745,6 +757,14 @@
gen = NewpagesPageGenerator(number = int(arg[5:]))
else:
gen = NewpagesPageGenerator(number = 60)
+ elif arg.startswith('-search'):
+ if len(arg) == 8:
+ mediawikiQuery = wikipedia.input(u'What do you want to search for?')
+ else:
+ mediawikiQuery = arg[8:]
+ # In order to be useful, all namespaces are required
+ gen = SearchPageGenerator(mediawikiQuery, namespaces = [])
+
elif arg.startswith('-google'):
if len(arg) == 7:
googleQuery = wikipedia.input(u'What do you want to search for?')
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-10-03 14:55:17 UTC (rev 4404)
+++ trunk/pywikipedia/wikipedia.py 2007-10-03 16:48:25 UTC (rev 4405)
@@ -3666,6 +3666,44 @@
except KeyError:
return False
+ def search(self, query, number = 10, repeat = False, namespaces = None):
+ """
+ Generator which yields search results
+ """
+ seen = set()
+ throttle = True
+ while True:
+ path = self.search_address(query, n=number, ns = namespaces)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(ur'<li[^>]*><a href=".+?" title="(?P<title>.+?)">.+?</a>'
+ '(?P<match>.*?)<br ?/><span[^>]*>Relevance: '
+ '(?P<relevance>[0-9.]+)% - '
+ '(?P<size>[0-9.]+) '
+ '(?P<sizeunit>[A-Za-z]+) '
+ '\((?P<words>.+?) words\) - '
+ '(?P<date>.+?)</span></li>', re.DOTALL)
+
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+
+ match = m.group('match')
+ relevance = m.group('relevance')
+ size = m.group('size')
+ # sizeunit appears to always be "KB"
+ words = m.group('words')
+ date = m.group('date')
+
+ #print "%s - %s %s (%s words) - %s" % (relevance, size, sizeunit, words, date)
+
+ yield page, match, relevance, size, words, date
+ if not repeat:
+ break
+
# TODO: avoid code duplication for the following methods
def newpages(self, number = 10, get_redirect = False, repeat = False):
"""Generator which yields new articles subsequently.
@@ -4213,6 +4251,9 @@
if self.encoding().lower() != charset.lower():
raise ValueError("code2encodings has wrong charset for %s. It should be %s, but is %s" % (repr(self), charset, self.encoding()))
def search_address(self, q, n=50, ns = 0):
    """
    Return the search URL for query 'q' on this site.

    Delegates to the family's search_address(), passing this site's
    language code, then q, n and ns positionally.
    """
    family = self.family
    return family.search_address(self.lang, q, n, ns)
+
def allpages_address(self, s, ns = 0):
    """
    Return the family's allpages address for this site's language code,
    with 's' passed as 'start' and 'ns' as 'namespace'.
    """
    family = self.family
    return family.allpages_address(self.lang, start=s, namespace=ns)