[Pywikipedia-l] SVN: [4405] trunk/pywikipedia - pywikibot

3 Oct 2007

Revision: 4405
Author:   leogregianin
Date:     2007-10-03 16:48:25 +0000 (Wed, 03 Oct 2007)
Log Message:
-----------
Patch 1800925: wikipedia.search and page generator by John Vandenberg (NOTE: I could not get this to work)
Modified Paths:
--------------
    trunk/pywikipedia/family.py
    trunk/pywikipedia/pagegenerators.py
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================

--- trunk/pywikipedia/family.py	2007-10-03 14:55:17 UTC (rev 4404)
+++ trunk/pywikipedia/family.py	2007-10-03 16:48:25 UTC (rev 4405)
@@ -2613,6 +2613,30 @@
     def api_address(self, code):
         return '%s?' % self.apipath(code)
+    def search_address(self, code, query, limit=100, namespaces = None):
+        """
+        Constructs a URL for searching using Special:Search.
+
+        'namespaces' may be an int or a list of ints; an empty list selects
+        every namespace known to this family.  Negative indices in a list
+        are skipped.  With None no ns parameter is added at all, so the
+        choice of namespaces is left to the wiki.
+
+        'query' is placed in the URL as given; no escaping is done here.
+        """
+        namespace_params = ''
+        if isinstance(namespaces, int):
+            namespace_params = '&ns%d=1' % namespaces
+        elif isinstance(namespaces, list):
+            # An empty list means all namespaces of this family.
+            wanted = namespaces or self.namespaces.keys()
+            # Index 0 has to be requested as well; only negative
+            # indices are left out.
+            namespace_params = ''.join(['&ns%d=1' % i
+                                        for i in wanted if i >= 0])
+
+        return "%s?title=%s:Search&search=%s&limit=%d%s" % (
+            self.path(code),
+            self.special_namespace_url(code),
+            query,
+            limit,
+            namespace_params)
+
     def allpages_address(self, code, start, namespace = 0):
         if self.version(code)=="1.2":
             return '%s?title=%s:Allpages&printable=yes&from=%s' % (
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py	2007-10-03 14:55:17 UTC (rev 4404)
+++ trunk/pywikipedia/pagegenerators.py	2007-10-03 16:48:25 UTC (rev 4405)
@@ -36,6 +36,9 @@
                   Depends on python module pYsearch.  See yahoo_appid in
                   config.py for instructions.
+-search           Work on all pages that are found in a MediaWiki search
+                  across all namespaces.
+
 -google           Work on all pages that are found in a Google search.
                   You need a Google Web API license key. Note that Google
                   doesn't give out license keys anymore. See google_key in
@@ -290,6 +293,15 @@
                 yield wikipedia.Page(site, pagenameofthelink)
         offset += step
+def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
+    """
+    Yields the pages found for 'query' by the search method of 'site'.
+    When no site is passed, the one from wikipedia.getSite() is used.
+    """
+    if site is None:
+        site = wikipedia.getSite()
+    results = site.search(query, number=number, namespaces = namespaces)
+    for result in results:
+        # only the first item of each result is passed on
+        yield result[0]
+
 class YahooSearchPageGenerator:
     '''
     To use this generator, install pYsearch
@@ -745,6 +757,14 @@
               gen = NewpagesPageGenerator(number = int(arg[5:]))
             else:
               gen = NewpagesPageGenerator(number = 60)
+        elif arg.startswith('-search'):
+            # '-search' itself is 7 characters; whatever follows
+            # '-search:' is the query.  Ask the user when it is empty.
+            mediawikiQuery = arg[8:]
+            if not mediawikiQuery:
+                mediawikiQuery = wikipedia.input(u'What do you want to search for?')
+            # In order to be useful, all namespaces are required
+            gen = SearchPageGenerator(mediawikiQuery, namespaces = [])
+
         elif arg.startswith('-google'):
             if len(arg) == 7:
                 googleQuery = wikipedia.input(u'What do you want to search for?')
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2007-10-03 14:55:17 UTC (rev 4404)
+++ trunk/pywikipedia/wikipedia.py	2007-10-03 16:48:25 UTC (rev 4405)
@@ -3666,6 +3666,44 @@
         except KeyError:
             return False
+    def search(self, query, number = 10, repeat = False, namespaces = None):
+        """
+        Generator which yields search results
+
+        Each result is a tuple (page, match, relevance, size, words, date);
+        page is a Page object, the other items are the strings captured
+        from the HTML of the result list.  A title is yielded only once.
+        With 'repeat' set, the same search is fetched over and over again
+        and only titles not seen before are yielded.
+        """
+        # The result line is expected to show the word count as
+        # "(N words)", so these parentheses are escaped to match literally.
+        entryR = re.compile(ur'<li[^>]*><a href=".+?" title="(?P<title>.+?)">.+?</a>'
+                              r'(?P<match>.*?)<br ?/><span[^>]*>Relevance: '
+                              r'(?P<relevance>[0-9.]+)% - '
+                              r'(?P<size>[0-9.]+) '
+                              r'(?P<sizeunit>[A-Za-z]+) '
+                              r'\((?P<words>.+?) words\) - '
+                              r'(?P<date>.+?)</span></li>', re.DOTALL)
+        # titles which have been yielded already
+        seen = set()
+        while True:
+            path = self.search_address(query, n=number, ns = namespaces)
+            get_throttle()
+            html = self.getUrl(path)
+
+            for m in entryR.finditer(html):
+                title = m.group('title')
+                if title in seen:
+                    continue
+                seen.add(title)
+                # sizeunit appears to always be "KB", so it is not returned
+                yield (Page(self, title), m.group('match'),
+                       m.group('relevance'), m.group('size'),
+                       m.group('words'), m.group('date'))
+            if not repeat:
+                break
+
     # TODO: avoid code duplication for the following methods
     def newpages(self, number = 10, get_redirect = False, repeat = False):
         """Generator which yields new articles subsequently.
@@ -4213,6 +4251,9 @@
         if self.encoding().lower() != charset.lower():
             raise ValueError("code2encodings has wrong charset for %s. It should be %s, but is %s" % (repr(self), charset, self.encoding()))
+    def search_address(self, q, n=50, ns = 0):
+        """Returns the family's search URL for query q in this site's language."""
+        return self.family.search_address(self.lang, q, limit = n, namespaces = ns)
+
     def allpages_address(self, s, ns = 0):
         return self.family.allpages_address(self.lang, start = s, namespace = ns)