http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10149
Revision: 10149
Author: xqt
Date: 2012-04-21 13:16:19 +0000 (Sat, 21 Apr 2012)
Log Message:
-----------
make DiscussionPage a pywikibot.Page subclass
additional messages
The TM regex sometimes fails to match the timezone name; in that case the
timezone part should be stripped before parsing the timestamp.
TODO: Unfortunately some month names are still not recognized; this must be fixed.
Modified Paths:
--------------
trunk/pywikipedia/archivebot.py
Modified: trunk/pywikipedia/archivebot.py
===================================================================
--- trunk/pywikipedia/archivebot.py 2012-04-21 10:26:58 UTC (rev 10148)
+++ trunk/pywikipedia/archivebot.py 2012-04-21 13:16:19 UTC (rev 10149)
@@ -157,7 +157,7 @@
def txt2timestamp(txt, format):
"""Attempts to convert the timestamp 'txt' according to given
'format'.
On success, returns the time tuple; on failure, returns None."""
- #print txt, format
+## print txt, format
try:
return time.strptime(txt,format)
except ValueError:
@@ -234,17 +234,16 @@
TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kello \W*(\d\d).(\d\d)
\(.*?\)', line)
if not TM:
# 14:23, 12. Jan. 2009 (UTC)
- pat = re.compile(r'(\d\d):(\d\d), (\d\d?)\. (\S+)\.? (\d\d\d\d)
\(UTC\)')
+ pat = re.compile(r'(\d\d):(\d\d), (\d\d?)\. (\S+)\.? (\d\d\d\d)
\((?:UTC|CES?T)\)')
TM = pat.search(line)
if TM:
-# pywikibot.output(TM)
TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl. %H:%M (%Z)")
if not TIME:
TIME = txt2timestamp(TM.group(0), "%Y. %B %d., %H:%M (%Z)")
if not TIME:
TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl.%H:%M (%Z)")
if not TIME:
- TIME = txt2timestamp(TM.group(0),"%H:%M, %d %B %Y (%Z)")
+ TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '',
TM.group(0)),"%H:%M, %d %B %Y")
if not TIME:
TIME = txt2timestamp(TM.group(0),"%H:%M, %d %b %Y (%Z)")
if not TIME:
@@ -260,12 +259,12 @@
if not TIME:
TIME = txt2timestamp(TM.group(0),"%d. %Bta %Y kello %H.%M
(%Z)")
if not TIME:
- TIME = txt2timestamp(TM.group(0),"%H:%M, %d. %b. %Y (%Z)")
+ TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '',
TM.group(0)), "%H:%M, %d. %b. %Y")
if TIME:
- self.timestamp = max(self.timestamp,time.mktime(TIME))
-# pywikibot.output(u'Time to be parsed: %s' % TM.group(0))
-# pywikibot.output(u'Parsed time: %s' % TIME)
-# pywikibot.output(u'Newest timestamp in thread: %s' % TIME)
+ self.timestamp = max(self.timestamp, time.mktime(TIME))
+## pywikibot.output(u'Time to be parsed: %s' % TM.group(0))
+## pywikibot.output(u'Parsed time: %s' % TIME)
+## pywikibot.output(u'Newest timestamp in thread: %s' % TIME)
def size(self):
return len(self.title) + len(self.content) + 12
@@ -286,15 +285,15 @@
return message('archivebot-older-than') + ' ' +
reT.group(1)
return ''
-class DiscussionPage(object):
+class DiscussionPage(pywikibot.Page):
"""A class that represents a single discussion page as well as an
archive
page. Feed threads to it and run an update() afterwards."""
- #TODO: Make it a subclass of pywikibot.Page
def __init__(self, title, archiver, vars=None):
- self.title = title
+ pywikibot.Page.__init__(self, Site, title, defaultNamespace=3)
+## self.title = title
self.threads = []
- self.Page = pywikibot.Page(Site,self.title)
+## self.Page = self
self.full = False
self.archiver = archiver
self.vars = vars
@@ -312,7 +311,7 @@
self.threads = []
self.archives = {}
self.archivedThreads = 0
- lines = self.Page.get().split('\n')
+ lines = self.get().split('\n')
state = 0 #Reading header
curThread = None
for line in lines:
@@ -329,6 +328,7 @@
self.header += line + '\n'
if curThread:
self.threads.append(curThread)
+ pywikibot.output(u'%d Threads found.' % len(self.threads))
def feedThread(self, thread, maxArchiveSize=(250*1024,'B')):
self.threads.append(thread)
@@ -353,7 +353,7 @@
newtext += t.toText()
if self.full:
summary += ' ' + message('archivebot-archive-full')
- self.Page.put(newtext, minorEdit=True, comment=summary)
+ self.put(newtext, minorEdit=True, comment=summary)
class PageArchiver(object):
"""A class that encapsulates all archiving methods.
@@ -376,7 +376,7 @@
self.Page = DiscussionPage(Page.title(),self)
self.loadConfig()
self.commentParams = {
- 'from' : self.Page.title,
+ 'from' : self.Page.title(),
}
self.archives = {}
self.archivedThreads = 0
@@ -400,12 +400,12 @@
def key_ok(self):
s = new_hash()
s.update(self.salt+'\n')
- s.update(self.Page.title.encode('utf8')+'\n')
+ s.update(self.Page.title().encode('utf8')+'\n')
return self.get('key') == s.hexdigest()
def loadConfig(self):
hdrlines = self.Page.header.split('\n')
-# pywikibot.output(u'Looking for: %s' % self.tpl)
+ pywikibot.output(u'Looking for: %s in %s' % (self.tpl,
self.Page.title()))
mode = 0
for line in hdrlines:
if mode == 0 and re.search('{{'+self.tpl,line):
@@ -419,7 +419,7 @@
continue
if mode == 0 or not self.get('algo',''):
- raise MissingConfigError
- raise MissingConfigError
+ raise MissingConfigError(u'Missing or malformed template or missing
algo')
#Last minute fix:
self.set('archive', self.get('archive').replace('_','
'), True)
@@ -433,7 +433,7 @@
if not archive:
return
if not self.force \
- and not self.Page.title+'/' == archive[:len(self.Page.title)+1] \
+ and not self.Page.title()+'/' == archive[:len(self.Page.title())+1] \
and not self.key_ok():
raise ArchiveSecurityError
if not archive in self.archives:
@@ -447,6 +447,7 @@
self.Page.threads = []
T = time.mktime(time.gmtime())
whys = []
+ pywikibot.output(u'Processing %d threads' % len(oldthreads))
for t in oldthreads:
if len(oldthreads) - self.archivedThreads \
<= int(self.get('minthreadsleft',5)):
@@ -477,12 +478,14 @@
return set(whys)
def run(self):
- if not self.Page.Page.botMayEdit(Site.username):
+ if not self.Page.botMayEdit(Site.username):
return
whys = self.analyzePage()
if self.archivedThreads < int(self.get('minthreadstoarchive',2)):
# We might not want to archive a measly few threads
# (lowers edit frequency)
+ pywikibot.output(u'There are only %d Threads. Skipping'
+ % self.archivedThreads)
return
if whys:
#Save the archives first (so that bugs don't cause a loss of data)
@@ -498,7 +501,7 @@
self.Page.header = rx.sub(self.attr2text(),self.Page.header)
self.commentParams['count'] = self.archivedThreads
self.commentParams['archives'] \
- = ', '.join(['[['+a.title+']]' for a in
self.archives.values()])
+ = ', '.join(['[['+a.title()+']]' for a in
self.archives.values()])
if not self.commentParams['archives']:
self.commentParams['archives'] = '/dev/null'
self.commentParams['why'] = ', '.join(whys)
@@ -588,8 +591,8 @@
pagelist = sorted(pagelist)
#if not options.namespace == None:
# pagelist = [pg for pg in pagelist if pg.namespace()==options.namespace]
-
- for pg in pagelist:
+ for pg in iter(pagelist):
+ pywikibot.output(u'Processing %s' % pg)
# Catching exceptions, so that errors in one page do not bail out
# the entire process
try: