http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10149
Revision: 10149 Author: xqt Date: 2012-04-21 13:16:19 +0000 (Sat, 21 Apr 2012) Log Message: ----------- make DiscussionPage a pywikibot.Page subclass; add additional output messages. The TM regex sometimes fails to match the timezone name, so the timezone should be omitted by extracting only the timestamp. TODO: Unfortunately some month strings aren't recognized; this must be changed.
Modified Paths: -------------- trunk/pywikipedia/archivebot.py
Modified: trunk/pywikipedia/archivebot.py =================================================================== --- trunk/pywikipedia/archivebot.py 2012-04-21 10:26:58 UTC (rev 10148) +++ trunk/pywikipedia/archivebot.py 2012-04-21 13:16:19 UTC (rev 10149) @@ -157,7 +157,7 @@ def txt2timestamp(txt, format): """Attempts to convert the timestamp 'txt' according to given 'format'. On success, returns the time tuple; on failure, returns None.""" - #print txt, format +## print txt, format try: return time.strptime(txt,format) except ValueError: @@ -234,17 +234,16 @@ TM = re.search(r'(\d\d?). (\S+) (\d\d\d\d) kello \W*(\d\d).(\d\d) (.*?)', line) if not TM: # 14:23, 12. Jan. 2009 (UTC) - pat = re.compile(r'(\d\d):(\d\d), (\d\d?). (\S+).? (\d\d\d\d) (UTC)') + pat = re.compile(r'(\d\d):(\d\d), (\d\d?). (\S+).? (\d\d\d\d) ((?:UTC|CES?T))') TM = pat.search(line) if TM: -# pywikibot.output(TM) TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl. %H:%M (%Z)") if not TIME: TIME = txt2timestamp(TM.group(0), "%Y. %B %d., %H:%M (%Z)") if not TIME: TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl.%H:%M (%Z)") if not TIME: - TIME = txt2timestamp(TM.group(0),"%H:%M, %d %B %Y (%Z)") + TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', TM.group(0)),"%H:%M, %d %B %Y") if not TIME: TIME = txt2timestamp(TM.group(0),"%H:%M, %d %b %Y (%Z)") if not TIME: @@ -260,12 +259,12 @@ if not TIME: TIME = txt2timestamp(TM.group(0),"%d. %Bta %Y kello %H.%M (%Z)") if not TIME: - TIME = txt2timestamp(TM.group(0),"%H:%M, %d. %b. %Y (%Z)") + TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', TM.group(0)), "%H:%M, %d. %b. 
%Y") if TIME: - self.timestamp = max(self.timestamp,time.mktime(TIME)) -# pywikibot.output(u'Time to be parsed: %s' % TM.group(0)) -# pywikibot.output(u'Parsed time: %s' % TIME) -# pywikibot.output(u'Newest timestamp in thread: %s' % TIME) + self.timestamp = max(self.timestamp, time.mktime(TIME)) +## pywikibot.output(u'Time to be parsed: %s' % TM.group(0)) +## pywikibot.output(u'Parsed time: %s' % TIME) +## pywikibot.output(u'Newest timestamp in thread: %s' % TIME)
def size(self): return len(self.title) + len(self.content) + 12 @@ -286,15 +285,15 @@ return message('archivebot-older-than') + ' ' + reT.group(1) return ''
-class DiscussionPage(object): +class DiscussionPage(pywikibot.Page): """A class that represents a single discussion page as well as an archive page. Feed threads to it and run an update() afterwards.""" - #TODO: Make it a subclass of pywikibot.Page
def __init__(self, title, archiver, vars=None): - self.title = title + pywikibot.Page.__init__(self, Site, title, defaultNamespace=3) +## self.title = title self.threads = [] - self.Page = pywikibot.Page(Site,self.title) +## self.Page = self self.full = False self.archiver = archiver self.vars = vars @@ -312,7 +311,7 @@ self.threads = [] self.archives = {} self.archivedThreads = 0 - lines = self.Page.get().split('\n') + lines = self.get().split('\n') state = 0 #Reading header curThread = None for line in lines: @@ -329,6 +328,7 @@ self.header += line + '\n' if curThread: self.threads.append(curThread) + pywikibot.output(u'%d Threads found.' % len(self.threads))
def feedThread(self, thread, maxArchiveSize=(250*1024,'B')): self.threads.append(thread) @@ -353,7 +353,7 @@ newtext += t.toText() if self.full: summary += ' ' + message('archivebot-archive-full') - self.Page.put(newtext, minorEdit=True, comment=summary) + self.put(newtext, minorEdit=True, comment=summary)
class PageArchiver(object): """A class that encapsulates all archiving methods. @@ -376,7 +376,7 @@ self.Page = DiscussionPage(Page.title(),self) self.loadConfig() self.commentParams = { - 'from' : self.Page.title, + 'from' : self.Page.title(), } self.archives = {} self.archivedThreads = 0 @@ -400,12 +400,12 @@ def key_ok(self): s = new_hash() s.update(self.salt+'\n') - s.update(self.Page.title.encode('utf8')+'\n') + s.update(self.Page.title().encode('utf8')+'\n') return self.get('key') == s.hexdigest()
def loadConfig(self): hdrlines = self.Page.header.split('\n') -# pywikibot.output(u'Looking for: %s' % self.tpl) + pywikibot.output(u'Looking for: %s in %s' % (self.tpl, self.Page.title())) mode = 0 for line in hdrlines: if mode == 0 and re.search('{{'+self.tpl,line): @@ -419,7 +419,7 @@ continue
if mode == 0 or not self.get('algo',''): - raise MissingConfigError + raise MissingConfigError(u'Missing or malformed template or missing algo')
#Last minute fix: self.set('archive', self.get('archive').replace('_',' '), True) @@ -433,7 +433,7 @@ if not archive: return if not self.force \ - and not self.Page.title+'/' == archive[:len(self.Page.title)+1] \ + and not self.Page.title()+'/' == archive[:len(self.Page.title())+1] \ and not self.key_ok(): raise ArchiveSecurityError if not archive in self.archives: @@ -447,6 +447,7 @@ self.Page.threads = [] T = time.mktime(time.gmtime()) whys = [] + pywikibot.output(u'Processing %d threads' % len(oldthreads)) for t in oldthreads: if len(oldthreads) - self.archivedThreads \ <= int(self.get('minthreadsleft',5)): @@ -477,12 +478,14 @@ return set(whys)
def run(self): - if not self.Page.Page.botMayEdit(Site.username): + if not self.Page.botMayEdit(Site.username): return whys = self.analyzePage() if self.archivedThreads < int(self.get('minthreadstoarchive',2)): # We might not want to archive a measly few threads # (lowers edit frequency) + pywikibot.output(u'There are only %d Threads. Skipping' + % self.archivedThreads) return if whys: #Save the archives first (so that bugs don't cause a loss of data) @@ -498,7 +501,7 @@ self.Page.header = rx.sub(self.attr2text(),self.Page.header) self.commentParams['count'] = self.archivedThreads self.commentParams['archives'] \ - = ', '.join(['[['+a.title+']]' for a in self.archives.values()]) + = ', '.join(['[['+a.title()+']]' for a in self.archives.values()]) if not self.commentParams['archives']: self.commentParams['archives'] = '/dev/null' self.commentParams['why'] = ', '.join(whys) @@ -588,8 +591,8 @@ pagelist = sorted(pagelist) #if not options.namespace == None: # pagelist = [pg for pg in pagelist if pg.namespace()==options.namespace] - - for pg in pagelist: + for pg in iter(pagelist): + pywikibot.output(u'Processing %s' % pg) # Catching exceptions, so that errors in one page do not bail out # the entire process try: