Revision: 4405 Author: leogregianin Date: 2007-10-03 16:48:25 +0000 (Wed, 03 Oct 2007)
Log Message: ----------- Patch 1800925: wikipedia.search and page generator by John Vandenberg (NOTE: I could not get this to work)
Modified Paths: -------------- trunk/pywikipedia/family.py trunk/pywikipedia/pagegenerators.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py =================================================================== --- trunk/pywikipedia/family.py 2007-10-03 14:55:17 UTC (rev 4404) +++ trunk/pywikipedia/family.py 2007-10-03 16:48:25 UTC (rev 4405) @@ -2613,6 +2613,30 @@ def api_address(self, code): return '%s?' % self.apipath(code)
def search_address(self, code, query, limit=100, namespaces = None):
    """
    Constructs a URL for searching using Special:Search.

    Parameters:
      code       - language code, passed to self.path() and
                   self.special_namespace_url()
      query      - search string; it is inserted into the URL verbatim
      limit      - value of the 'limit' URL parameter
      namespaces - None, an int, or a list/tuple/set of ints:
                     None        -> no namespace selector is added, so the
                                    wiki's own default applies
                     int         -> only that namespace
                     collection  -> every listed namespace
                     empty coll. -> every namespace known to this family
                   Negative (virtual) namespace numbers are ignored.

    Returns the URL as a string.
    """
    if namespaces is None:
        selected = []
    elif isinstance(namespaces, int):
        selected = [namespaces]
    elif isinstance(namespaces, (list, tuple, set, frozenset)):
        # An empty collection means "search everywhere"; sort the family's
        # namespace numbers so the generated URL is deterministic.
        selected = list(namespaces) or sorted(self.namespaces.keys())
    else:
        # Unsupported types are ignored, as before.
        selected = []

    # Namespace 0 (the main namespace) must be selectable, so the test is
    # '>= 0'; only negative, virtual namespaces are left out.
    namespace_params = ''.join(['&ns%d=1' % ns for ns in selected if ns >= 0])

    # NOTE(review): 'query' is not URL-escaped here; presumably callers pass
    # an already URL-safe string -- confirm against the call sites.
    return "%s?title=%s:Search&search=%s&limit=%d%s" % (
        self.path(code),
        self.special_namespace_url(code),
        query,
        limit,
        namespace_params)
Modified: trunk/pywikipedia/pagegenerators.py =================================================================== --- trunk/pywikipedia/pagegenerators.py 2007-10-03 14:55:17 UTC (rev 4404) +++ trunk/pywikipedia/pagegenerators.py 2007-10-03 16:48:25 UTC (rev 4405) @@ -36,6 +36,9 @@ Depends on python module pYsearch. See yahoo_appid in config.py for instructions.
+-search Work on all pages that are found in a MediaWiki search + across all namespaces. + -google Work on all pages that are found in a Google search. You need a Google Web API license key. Note that Google doesn't give out license keys anymore. See google_key in @@ -290,6 +293,15 @@ yield wikipedia.Page(site, pagenameofthelink) offset += step
+def SearchPageGenerator(query, number = 100, namespaces = None, site = None): + """ + Provides a list of results using the internal MediaWiki search engine + """ + if site is None: + site = wikipedia.getSite() + for page in site.search(query, number=number, namespaces = namespaces): + yield page[0] + class YahooSearchPageGenerator: ''' To use this generator, install pYsearch @@ -745,6 +757,14 @@ gen = NewpagesPageGenerator(number = int(arg[5:])) else: gen = NewpagesPageGenerator(number = 60) + elif arg.startswith('-search'): + if len(arg) == 8: + mediawikiQuery = wikipedia.input(u'What do you want to search for?') + else: + mediawikiQuery = arg[8:] + # In order to be useful, all namespaces are required + gen = SearchPageGenerator(mediawikiQuery, namespaces = []) + elif arg.startswith('-google'): if len(arg) == 7: googleQuery = wikipedia.input(u'What do you want to search for?')
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-10-03 14:55:17 UTC (rev 4404) +++ trunk/pywikipedia/wikipedia.py 2007-10-03 16:48:25 UTC (rev 4405) @@ -3666,6 +3666,44 @@ except KeyError: return False
def search(self, query, number = 10, repeat = False, namespaces = None):
    """
    Generator which yields search results.

    Fetches the page at self.search_address(query, n=number, ns=namespaces)
    and scans the returned HTML for result entries.

    Parameters:
      query      - search string, handed to self.search_address()
      number     - passed to self.search_address() as 'n'
      repeat     - if true, the same URL is fetched again and again and
                   only titles not yielded before are reported; if false,
                   the page is fetched once
      namespaces - passed to self.search_address() as 'ns'

    Yields tuples (page, match, relevance, size, words, date), where page
    is a Page object for the result title and the other items are the
    strings captured from the result entry.
    """
    # Compiled once: the pattern does not change between fetches.
    # The word count is wrapped in literal parentheses in the listing
    # ("... KB (N words) - date"), so they have to be escaped; unescaped
    # they form a group and the pattern cannot match.
    entryR = re.compile(
        r'<li[^>]*><a href=".+?" title="(?P<title>.+?)">.+?</a>'
        r'(?P<match>.*?)<br ?/><span[^>]*>Relevance: '
        r'(?P<relevance>[0-9.]+)% - '
        r'(?P<size>[0-9.]+) '
        r'(?P<sizeunit>[A-Za-z]+) '
        r'\((?P<words>.+?) words\) - '
        r'(?P<date>.+?)</span></li>', re.DOTALL)

    seen = set()
    while True:
        path = self.search_address(query, n=number, ns = namespaces)
        get_throttle()
        html = self.getUrl(path)

        for m in entryR.finditer(html):
            title = m.group('title')
            if title in seen:
                # Already reported in an earlier pass.
                continue
            seen.add(title)

            # 'sizeunit' is matched but not reported; the original author
            # noted that it appears to always be "KB".
            yield (Page(self, title),
                   m.group('match'),
                   m.group('relevance'),
                   m.group('size'),
                   m.group('words'),
                   m.group('date'))

        if not repeat:
            break
def search_address(self, q, n=50, ns = 0):
    """
    Return the search URL for query q on this site.

    Delegates to the family's search_address(), passing this site's
    language code, the query q, the result limit n and the namespace
    selection ns, in that order.
    """
    family = self.family
    return family.search_address(self.lang, q, n, ns)

def allpages_address(self, s, ns = 0):
    """
    Return the all-pages URL for this site.

    Delegates to the family's allpages_address(), passing this site's
    language code, s as 'start' and ns as 'namespace'.
    """
    family = self.family
    return family.allpages_address(self.lang, start = s, namespace = ns)