jenkins-bot has submitted this change and it was merged.
Change subject: Archivebot: support all languages and timezones. ......................................................................
Archivebot: support all languages and timezones.
The necessary information is retrieved directly from the site, so use of locale should no longer be necessary.
Change-Id: Iede5165fd36b8e5747db032183094fa11177b037 --- M archivebot.py 1 file changed, 257 insertions(+), 172 deletions(-)
Approvals: Merlijn van Deen: Looks good to me, approved jenkins-bot: Verified
diff --git a/archivebot.py b/archivebot.py index f7de303..9c5603f 100644 --- a/archivebot.py +++ b/archivebot.py @@ -36,7 +36,7 @@ algo specifies the maximum age of a thread. Must be in the form old(<delay>) where <delay> specifies the age in hours or days like 24h or 5d. - Default ist old(24h) + Default is old(24h) counter The current value of a counter which could be assigned as variable. Will be actualized by bot. Initial value is 1. maxarchivesize The maximum archive size before incrementing the counter. @@ -70,20 +70,28 @@ # # (C) Misza13, 2006-2010 # (C) xqt, 2009-2012 -# (C) Pywikipedia bot team, 2007-2012 +# (C) Pywikipedia bot team, 2007-2013 # # Distributed under the terms of the MIT license. # __version__ = '$Id$' # +import wikipedia as pywikibot +from pywikibot import i18n +import pagegenerators +import query +import datetime +import time import os import re -import time import locale import traceback -import string -import urllib -import unicodedata + + +ZERO = datetime.timedelta(0) + +Site = pywikibot.getSite() + try: # Get a constructor for the MD5 hash object import hashlib new_hash = hashlib.md5 @@ -91,13 +99,6 @@ import md5 new_hash = md5.md5
-import wikipedia as pywikibot -from pywikibot import i18n -import pagegenerators -import query - - -Site = pywikibot.getSite() language = Site.language()
@@ -111,9 +112,7 @@
class MissingConfigError(pywikibot.Error): """The config is missing in the header (either it's in one of the threads - or transcluded from another page). - - """ + or transcluded from another page)."""
class AlgorithmError(MalformedConfigError): @@ -122,24 +121,20 @@
class ArchiveSecurityError(pywikibot.Error): """Archive is not a subpage of page being archived and key not specified - (or incorrect). - - """ + (or incorrect)."""
def str2time(str): """Accepts a string defining a time period: 7d - 7 days 36h - 36 hours - Returns the corresponding time, measured in seconds. - - """ + Returns the corresponding timedelta object.""" if str[-1] == 'd': - return int(str[:-1]) * 24 * 3600 + return datetime.timedelta(days=int(str[:-1])) elif str[-1] == 'h': - return int(str[:-1]) * 3600 + return datetime.timedelta(hours=int(str[:-1])) else: - return int(str) + return datetime.timedelta(seconds=int(str))
def str2size(str): @@ -148,10 +143,8 @@ 150K - 150 kilobytes 2M - 2 megabytes Returns a tuple (size,unit), where size is an integer and unit is - 'B' (bytes) or 'T' (threads). - - """ - if str[-1] in string.digits: # TODO: de-uglify + 'B' (bytes) or 'T' (threads).""" + if str[-1].isdigit(): # TODO: de-uglify return (int(str), 'B') elif str[-1] in ['K', 'k']: return (int(str[:-1]) * 1024, 'B') @@ -161,43 +154,6 @@ return (int(str[:-1]), 'T') else: return (int(str[:-1]) * 1024, 'B') - - -def int2month(num): - """Returns the locale's full name of month 'num' (1-12).""" - if hasattr(locale, 'nl_langinfo'): - return locale.nl_langinfo(locale.MON_1 + num - 1).decode('utf-8') - Months = ['january', 'february', 'march', 'april', 'may_long', 'june', - 'july', 'august', 'september', 'october', 'november', 'december'] - return Site.mediawiki_message(Months[num - 1]) - - -def int2month_short(num): - """Returns the locale's abbreviated name of month 'num' (1-12).""" - if hasattr(locale, 'nl_langinfo'): - #filter out non-alpha characters - return ''.join([c for c in - locale.nl_langinfo( - locale.ABMON_1 + num - 1).decode('utf-8') - if c.isalpha()]) - Months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', - 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] - return Site.mediawiki_message(Months[num - 1]) - - -def txt2timestamp(txt, format): - """Attempts to convert the timestamp 'txt' according to given 'format'. - On success, returns the time tuple; on failure, returns None. - - """ -## print txt, format - try: - return time.strptime(txt, format) - except ValueError: - try: - return time.strptime(txt.encode('utf8'), format) - except: - pass
def generateTransclusions(Site, template, namespaces=[]): @@ -211,20 +167,215 @@ yield page
+class Months(object): + """ + Generation of look-up dictionaries for months, used by Timestripper() and PageArchiver + """ + + def __init__(self, site=None): + if site is None: + self.site = pywikibot.getSite() + else: + self.site = site + + @classmethod + def queryMonths(self): + months_long = ['january', 'february', 'march', 'april', 'may_long', 'june', + 'july', 'august', 'september', 'october', 'november', 'december'] + months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', + 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] + + #one query instead of multiple queries using site.mediawiki_message() + #can be refactored to use site.mediawiki_message() + params = { + 'action': 'query', + 'meta': 'allmessages', + 'ammessages': '|'.join(months_long) + '|' + '|'.join(months_short), + 'amlang': self.site.lang, + } + + monthsDict = query.GetData(params)['query']['allmessages'] + + monthNum2origNames = dict((i, {'short': '', 'long': ''}) for i in range(1, 13)) + origNames2monthNum = dict() + + for el in monthsDict: + orig, eng = el['*'], el['name'] + try: + month_num = months_long.index(eng) + 1 + monthNum2origNames[month_num]['long'] = orig + except ValueError: + month_num = months_short.index(eng) + 1 + monthNum2origNames[month_num]['short'] = orig + + origNames2monthNum[orig] = month_num + + return monthNum2origNames, origNames2monthNum + + @classmethod + def updateMonths(self, site=None): + if site is None: + self.site = pywikibot.getSite() + else: + self.site = site + self.monthsDicts = self.queryMonths() + + +class tzoneUTC(datetime.tzinfo): + """ + Class building a UTC tzinfo object + """ + + def utcoffset(self, dt): + return ZERO + + def tzname(self, dt): + return 'UTC' + + def dst(self, dt): + return ZERO + + def __repr__(self): + return "%s()" % self.__class__.__name__ + + +class tzoneFixedOffset(datetime.tzinfo): + """ + Class building tzinfo objects for fixed-offset time zones + + @offset: a number indicating fixed offset in minutes east from UTC + @name: a 
string with name of the timezone""" + + def __init__(self, offset, name): + self.__offset = datetime.timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, dt): + return self.__name + + def dst(self, dt): + return ZERO + + def __repr__(self): + return "%s(%s, %s)" % ( + self.__class__.__name__, + self.__offset.days * 86400 + self.__offset.seconds, + self.__name + ) + + +class TimeStripper(object): + """ + Find timetstamp in page text and returns it as timezone aware datetime object + """ + + def __init__(self): + self.monthNum2origNames, self.origNames2monthNum = Months.monthsDicts + self.site = Months.site + + self.groups = [u'year', u'month', u'hour', u'time', u'day', u'minute', u'tzinfo'] + + timeR = r'(?P<time>(?P<hour>[0-2]\d)[:.h](?P<minute>[0-5]\d))' + timeznR = r'((?P<tzinfo>[A-Z]+))' + yearR = r'(?P<year>(19|20)\d\d)' + monthR = ur'(?P<month>(%s))' % (u'|'.join(self.origNames2monthNum)) + dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))' + + self.ptimeR = re.compile(timeR) + self.timeznR = re.compile(timeznR) + self.yearR = re.compile(yearR) + self.pmonthR = re.compile(monthR, re.U) + self.pdayR = re.compile(dayR) + + #order is important to avoid mismatch when searching + self.patterns = [ + self.ptimeR, + self.timeznR, + self.yearR, + self.pmonthR, + self.pdayR, + ] + + def findmarker(self, text, base=u'@@', delta='@'): + # find a string which is not part of text + while base in text: + base += delta + return base + + def last_match_and_replace(self, txt, pat): + """ + Take the rightmost match, to prevent spurious earlier matches, and replace with marker + """ + m = None + for m in pat.finditer(txt): + pass + + if m: + marker = self.findmarker(txt) + txt = pat.sub(marker, txt) + return (txt, m.groupdict()) + else: + return (txt, None) + + def timestripper(self, line): + """ + Find timestamp in line and convert it to time zone aware datetime + """ + _line = line + #match date fields + dateDict = 
dict() + for pat in self.patterns: + line, matchDict = self.last_match_and_replace(line, pat) + if matchDict: + dateDict.update(matchDict) + + #all fields matched -> date valid + if all(g in dateDict for g in self.groups): + #remove 'time' key, now splitted in hour/minute and not needed by datetime + del dateDict['time'] + + #replace month name in original language with month number + try: + dateDict['month'] = self.origNames2monthNum[dateDict['month']] + except KeyError: + pywikibot.output(u'incorrect month name in page') + + #convert to integers + for k, v in dateDict.items(): + try: + dateDict[k] = int(v) + except ValueError: + pass + + #find timezone + dateDict['tzinfo'] = tzoneFixedOffset(self.site.siteinfo()['timeoffset'], + self.site.siteinfo()['timezone']) + + timestamp = datetime.datetime(**dateDict) + + else: + timestamp = None + + return timestamp + + class DiscussionThread(object): - """An object representing a discussion thread on a page, that is something - of the form: + """An object representing a discussion thread on a page, that is something of the form:
== Title of thread ==
Thread content here. ~~~~ :Reply, etc. ~~~~ - """
- def __init__(self, title): + def __init__(self, title, now): self.title = title + self.now = now self.content = "" + self.ts = TimeStripper() self.timestamp = None
def __repr__(self): @@ -234,86 +385,16 @@ def feedLine(self, line): if not self.content and not line: return - self.content += line + '\n' - #Update timestamp -# nnwiki: -# 19:42, 25 mars 2008 (CET) -# enwiki -# 16:36, 30 March 2008 (UTC) -# huwiki -# 2007. december 8., 13:42 (CET) - TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\S+) (\d\d\d\d) (.*?)', line) - if not TM: - TM = re.search(r'(\d\d):(\d\d), (\S+) (\d\d?), (\d\d\d\d) (.*?)', - line) - if not TM: - TM = re.search(r'(\d{4}). (\S+) (\d\d?)., (\d\d:\d\d) (.*?)', - line) -# 18. apr 2006 kl.18:39 (UTC) -# 4. nov 2006 kl. 20:46 (CET) - if not TM: - TM = re.search(r'(\d\d?). (\S+) (\d\d\d\d) kl.\W*(\d\d):(\d\d) (.*?)', - line) -#3. joulukuuta 2008 kello 16.26 (EET) - if not TM: - TM = re.search(r'(\d\d?). (\S+) (\d\d\d\d) kello \W*(\d\d).(\d\d) (.*?)', - line) - if not TM: -# 14:23, 12. Jan. 2009 (UTC) - pat = re.compile(r'(\d\d):(\d\d), (\d\d?). (\S+).? (\d\d\d\d) ((?:UTC|CES?T))') - TM = pat.search(line) -# ro.wiki: 4 august 2012 13:01 (EEST) - if not TM: - TM = re.search(r'(\d\d?) (\S+) (\d\d\d\d) (\d\d):(\d\d) (.*?)', - line) -# Japanese: 2012年8月4日 (日) 13:01 (UTC) - if not TM: - TM = re.search(re.compile(u'(\d\d\d\d)年(\d\d?)月(\d\d?)日 (.) (\d\d):(\d\d) (.*?)'), - line) - if TM: - # Strip away all diacritics in the Mn ('Mark, non-spacing') category - # NFD decomposition splits combined characters (e.g. 'ä", - # LATIN SMALL LETTER A WITH DIAERESIS) into two entities: - # LATIN SMALL LETTER A and COMBINING DIAERESIS. The latter falls - # in the Mn category and is filtered out, resuling in 'a'. - _TM = ''.join(c for c in unicodedata.normalize('NFD', TM.group(0)) - if unicodedata.category(c) != 'Mn')
- TIME = txt2timestamp(_TM, "%d. %b %Y kl. %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%Y. %B %d., %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%d. %b %Y kl.%H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', _TM), - "%H:%M, %d %B %Y") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %d %b %Y (%Z)") - if not TIME: - TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', _TM), - "%H:%M, %d %b %Y") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %b %d %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %B %d %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %b %d, %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %B %d, %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%d. %Bta %Y kello %H.%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%d %B %Y %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%Y年%B%d日 (%a) %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', _TM), - "%H:%M, %d. %b. %Y") - if TIME: - self.timestamp = max(self.timestamp, time.mktime(TIME)) -## pywikibot.output(u'Time to be parsed: %s' % TM.group(0)) -## pywikibot.output(u'Parsed time: %s' % TIME) -## pywikibot.output(u'Newest timestamp in thread: %s' % TIME) + self.content += line + '\n' + + timestamp = self.ts.timestripper(line) + + if not self.timestamp: # first time + self.timestamp = timestamp + + if timestamp: + self.timestamp = max(self.timestamp, timestamp)
def size(self): return len(self.title) + len(self.content) + 12 @@ -330,16 +411,14 @@ #TODO: handle this: #return 'unsigned' maxage = str2time(reT.group(1)) - if self.timestamp + maxage < time.time(): + if self.now - self.timestamp > maxage: return message('archivebot-older-than') + ' ' + reT.group(1) return ''
class DiscussionPage(pywikibot.Page): """A class that represents a single discussion page as well as an archive - page. Feed threads to it and run an update() afterwards. - - """ + page. Feed threads to it and run an update() afterwards."""
def __init__(self, title, archiver, vars=None): pywikibot.Page.__init__(self, Site, title) @@ -347,6 +426,8 @@ self.full = False self.archiver = archiver self.vars = vars + self.now = datetime.datetime.utcnow().replace(tzinfo=tzoneUTC()) + try: self.loadPage() except pywikibot.NoPage: @@ -370,7 +451,7 @@ found = True # Reading threads now if curThread: self.threads.append(curThread) - curThread = DiscussionThread(threadHeader.group(1)) + curThread = DiscussionThread(threadHeader.group(1), self.now) else: if found: curThread.feedLine(line) @@ -431,6 +512,7 @@ } self.archives = {} self.archivedThreads = 0 + self.monthNum2origNames, self.origNames2monthNum = Months.monthsDicts
def get(self, attr, default=''): return self.attributes.get(attr, [default])[0] @@ -445,10 +527,10 @@ and a != 'maxage']
def attr2text(self): - return '{{%s\n%s\n}}' % (self.tpl, - '\n'.join(['|%s = %s ' - % (a, self.get(a)) - for a in self.saveables()])) + return '{{%s\n%s\n}}' \ + % (self.tpl, + '\n'.join(['|%s = %s' % (a, self.get(a)) + for a in self.saveables()]))
def key_ok(self): s = new_hash() @@ -480,8 +562,7 @@ if not archive: return if not self.force \ - and not self.Page.title() + '/' == archive[ - :len(self.Page.title()) + 1] \ + and not self.Page.title() + '/' == archive[:len(self.Page.title()) + 1] \ and not self.key_ok(): raise ArchiveSecurityError if not archive in self.archives: @@ -493,7 +574,6 @@ archCounter = int(self.get('counter', '1')) oldthreads = self.Page.threads self.Page.threads = [] - T = time.mktime(time.gmtime()) whys = [] pywikibot.output(u'Processing %d threads' % len(oldthreads)) for t in oldthreads: @@ -506,14 +586,13 @@ why = t.shouldBeArchived(self) if why: archive = self.get('archive') - TStuple = time.gmtime(t.timestamp) vars = { 'counter': archCounter, - 'year': TStuple[0], - 'month': TStuple[1], - 'monthname': int2month(TStuple[1]), - 'monthnameshort': int2month_short(TStuple[1]), - 'week': int(time.strftime('%W', TStuple)), + 'year': t.timestamp.year, + 'month': t.timestamp.month, + 'monthname': self.monthNum2origNames[t.timestamp.month]['long'], + 'monthnameshort': self.monthNum2origNames[t.timestamp.month]['short'], + 'week': int(time.strftime('%W', t.timestamp.timetuple())), } archive = pywikibot.Page(Site, archive % vars).title() if self.feedArchive(archive, t, maxArchSize, vars): @@ -545,12 +624,12 @@ self.commentParams) self.archives[a].update(comment)
- #Save the page itself - rx = re.compile('{{%s\n.*?\n}}' % self.tpl, re.DOTALL) + # Save the page itself + rx = re.compile('{{' + self.tpl + '\n.*?\n}}', re.DOTALL) self.Page.header = rx.sub(self.attr2text(), self.Page.header) self.commentParams['count'] = self.archivedThreads - self.commentParams['archives'] = ', '.join( - ['[[%s]]' % a.title() for a in self.archives.values()]) + self.commentParams['archives'] \ + = ', '.join(['[[' + a.title() + ']]' for a in self.archives.values()]) if not self.commentParams['archives']: self.commentParams['archives'] = '/dev/null' self.commentParams['why'] = ', '.join(whys) @@ -633,9 +712,13 @@ pywikibot.showHelp('archivebot') return
+ #query site for original months name and create convenience look-up dictionaries + Months.updateMonths(site=Site) + for a in args: pagelist = [] if not options.filename and not options.pagename: + #for pg in pywikibot.Page(Site,a).getReferences(follow_redirects=False,onlyTemplateInclusion=True): if options.namespace is not None: ns = [str(options.namespace)] else: @@ -648,7 +731,10 @@ if options.pagename: pagelist.append(pywikibot.Page(Site, options.pagename, defaultNamespace=3)) + pagelist = sorted(pagelist) + #if not options.namespace == None: + # pagelist = [pg for pg in pagelist if pg.namespace()==options.namespace] for pg in iter(pagelist): pywikibot.output(u'Processing %s' % pg) # Catching exceptions, so that errors in one page do not bail out @@ -660,7 +746,6 @@ except: pywikibot.output(u'Error occured while processing page %s' % pg) traceback.print_exc() -
if __name__ == '__main__': try:
pywikibot-commits@lists.wikimedia.org