http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11022
Revision: 11022 Author: drtrigon Date: 2013-02-01 20:12:01 +0000 (Fri, 01 Feb 2013) Log Message: ----------- bug fix; change r11020 solves "issue with r355" (a very old one - other repo) improvement; handling of binary sources (automatic instead of 'zip=True') improvement; metadata support introduced and thus error handling improved (changed from 'error' to 'verbose') improvement; switched from 'wiki=True' to 'url=wiki://...' syntax
Modified Paths: -------------- trunk/pywikipedia/subster.py
Modified: trunk/pywikipedia/subster.py =================================================================== --- trunk/pywikipedia/subster.py 2013-01-31 18:21:29 UTC (rev 11021) +++ trunk/pywikipedia/subster.py 2013-02-01 20:12:01 UTC (rev 11022) @@ -68,7 +68,7 @@ import re, sys, os, string, time, copy import difflib, traceback import BeautifulSoup -import StringIO, zipfile, csv, urllib +import StringIO, zipfile, csv import mailbox, mimetypes, datetime, email.utils import openpyxl.reader.excel import crontab @@ -83,10 +83,13 @@ from pywikibot.comms import http
-bot_config = { # unicode values +bot_config = { + # unicode values 'TemplateName': u'User:DrTrigonBot/Subster', # or 'template' for 'Flagged Revisions' - 'ErrorTemplate': u'----\n<b>SubsterBot Exception in "%s" (%s)</b>\n%s',
+ 'ErrorTemplate': u'<b>SubsterBot Exception in "%s" (%s)</b>\n<pre>%s</pre>', + 'VerboseMessage': u'<noinclude>\n----\n%s\n</noinclude>', # DRTRIGON-116, DRTRIGON-132 + # important to use a '.css' page here, since it HAS TO BE protected to # prevent malicious code injection ! 'ConfCSSpostproc': u'User:DrTrigon/DrTrigonBot/subster-postproc.css', @@ -109,16 +112,15 @@ 'count': '0', #'postproc': '("","")', 'postproc': '('', '')', - 'wiki': 'False', # may be change to url='wiki://' 'beautifulsoup': 'False', # DRTRIGON-88 - 'expandtemplates': 'False', # DRTRIGON-93 (only with 'wiki') + 'expandtemplates': 'False', # DRTRIGON-93 (with 'wiki://') 'simple': '', # DRTRIGON-85 'zip': 'False', 'xlsx': '', # 'ods': '', # # may be 'hours' have to be added too (e.g. for 'ar') 'cron': '', # DRTRIGON-102 - 'error': repr('<noinclude>\n%(error)s\n</noinclude>'), # DRTRIGON-116 + 'verbose': 'True', # DRTRIGON-132 (else see logs) #'djvu': ... u"djvused -e 'n' "%s"" ... djvutext.py #'pdf': ... u"pdftotext" or python module #'imageocr', 'swfocr', ... @@ -178,13 +180,13 @@ if not ((self.site.family.name == 'wikidata') and (self.site.lang == 'repo')): # DRTRIGON-130; skip this for test-repo self._code = self._ConfCSSpostprocPage.get() - pywikibot.output(u'Imported postproc %s rev %s from %s' % \ + pywikibot.output(u'Imported postproc %s rev %s from %s' %\ ((self._ConfCSSpostprocPage.title(asLink=True),) + self._ConfCSSpostprocPage.getVersionHistory(revCount=1)[0][:2]) ) self._flagenable = {} if self._ConfCSSconfigPage.exists(): exec(self._ConfCSSconfigPage.get()) # with variable: bot_config_wiki self._flagenable = bot_config_wiki['flagenable'] - pywikibot.output(u'Imported config %s rev %s from %s' % \ + pywikibot.output(u'Imported config %s rev %s from %s' %\ ((self._ConfCSSconfigPage.title(asLink=True),) + self._ConfCSSconfigPage.getVersionHistory(revCount=1)[0][:2]) )
def run(self, sim=False, msg=None, EditFlags=bot_config['EditFlags']): @@ -219,7 +221,7 @@ # convert talk page result to wikidata(base) data = self.WD_convertContent(substed_content) #outpage = page.toggleTalkPage() - outpage = pywikibot.wikidataPage(self.site, page.toggleTalkPage().title()) + outpage = pywikibot.DataPage(self.site, page.toggleTalkPage().title()) #dic = json.loads(outpage.get()) dic = outpage.getentities()
@@ -260,24 +262,54 @@ substituted and a list of those tags. """
+ #md_val_tag = u'%s-META-%s' + md_val_tag = u'META-%s-%s' + substed_content = content substed_tags = [] # DRTRIGON-73
for item in params: + # 1st stage: main/general content substitution # 1.) - 5.) subst templates + metadata = { 'bot-error': unicode(False), + 'bot-error-traceback': u'', } # DRTRIGON-132 try: - (substed_content, tags) = self.subTemplate(substed_content, item) + (substed_content, tags, md) = self.subTemplate(substed_content, item) substed_tags += tags + metadata.update(md) + + # DRTRIGON-132; metadata append IFF other data/content changed + # (can change all the time, but MUST NOT trigger a page save/change!) + if not tags: + metadata = {} except: exc_info = sys.exc_info() result = u''.join(traceback.format_exception(exc_info[0], exc_info[1], exc_info[2])) - substed_content += ast.literal_eval(item['error']) %\ - {'error': bot_config['ErrorTemplate'] %\ + + # DRTRIGON-132; metadata append IFF exception raised + # (this metadata HAVE TO trigger a change because of error!) + metadata['bot-error'] = unicode(True) + metadata['bot-error-traceback'] = bot_config['ErrorTemplate'] %\ ( item['value'], pywikibot.Timestamp.now().isoformat(' '), - u' ' + result.replace(u'\n', u'\n ').rstrip() ) } - substed_tags.append( u'>error:%s<' % item['value'] ) + result.strip() )
+ # VerboseMode: IFF no 'bot-error-traceback' metadata tag present on + page, append it in order not to lose error info (single exception) + value = md_val_tag % (item['value'], 'bot-error-traceback') + tags = self.subTag(substed_content, value)[1] + if ast.literal_eval(item['verbose']) and (value not in tags): + substed_content += bot_config['VerboseMessage'] %\ + (self._var_regex_str % {'var': value, 'cont': u''}) + + # 2nd stage: conditional metadata substitution (DRTRIGON-132) + # (IFF content changed, exception raised, ...) + for data in metadata: + value = md_val_tag % (item['value'], data) + (substed_content, tags) = self.subTag(substed_content, value, metadata[data], 0) + substed_tags += tags + #substed_tags.append( u'>error:%s<' % item['value'] ) + return (substed_content, substed_tags)
def subTemplate(self, content, param): @@ -293,6 +325,8 @@ """
substed_tags = [] # DRTRIGON-73 + metadata = { 'mw-signature': u'~~~~', + 'mw-timestamp': u'~~~~~', } # DRTRIGON-132
# 0.2.) check for 'simple' mode and get additional params if param['simple']: @@ -312,20 +346,20 @@ pywikibot.output(u'CRON delay for execution: %.3f (<= %i)' % (delay, bot_config['CRONMaxDelay']))
if not (delay <= bot_config['CRONMaxDelay']): - return (content, substed_tags) + return (content, substed_tags, metadata)
# 1.) getUrl or wiki text # (security: check url not to point to a local file on the server, # e.g. 'file://' - same as used in xsalt.py) secure = False - for item in [u'http://', u'https://', u'mail://', u'local://']: + for item in [u'http://', u'https://', u'mail://', u'local://', u'wiki://']: secure = secure or (param['url'][:len(item)] == item) - param['wiki'] = ast.literal_eval(param['wiki']) param['zip'] = ast.literal_eval(param['zip']) - if (not secure) and (not param['wiki']): - return (content, substed_tags) - if param['wiki']: - if ast.literal_eval(param['expandtemplates']): # DRTRIGON-93 (only with 'wiki') + if not secure: + return (content, substed_tags, metadata) + if (param['url'][:7] == u'wiki://'): + param['url'] = param['url'][7:].strip('[]') # enable wiki-links + if ast.literal_eval(param['expandtemplates']): # DRTRIGON-93 (only with 'wiki://') external_buffer = pywikibot.Page(self.site, param['url']).get(expandtemplates=True) else: external_buffer = self.load( pywikibot.Page(self.site, param['url']) ) @@ -342,22 +376,28 @@ d.close() else: external_buffer = u'n/a' - elif param['zip']: - external_buffer = urllib.urlopen(param['url']).read() - # issue with r355: http://de.wikipedia.org/w/index.php?title=Vorlage:Infobox_Kreditinstitut/Dat... - #f_url, external_buffer = http.request(self.site, param['url'], no_hostname=True, back_response=True) - #external_buffer = f_url.read() - #del f_url # free some memory (no need to keep a copy...) else: - external_buffer = http.request(self.site, param['url'], no_hostname = True) + f_url, external_buffer = http.request(self.site, param['url'], + no_hostname = True, + back_response = True) + headers = f_url.headers # same like 'f_url.info()' + #if param['zip']: + if ('text/' not in headers['content-type']): + pywikibot.output(u'Source is of non-text content-type, using raw data instead.') + external_buffer = f_url.read() + del f_url # free some memory (no need to keep copy)
+ for h in ['content-length', 'date', 'last-modified', 'expires']: + if h in headers: + metadata['url-%s' % h] = headers[h] + # some intermediate processing (unzip, xlsx2csv, ...) - if param['zip']: + if param['zip']: # 'application/zip', ... fileno = 0 if (param['zip'] == True) else (param['zip']-1) external_buffer = self.unzip(external_buffer, fileno) - if param['xlsx']: + if param['xlsx']: # 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' external_buffer = self.xlsx2csv(external_buffer, param['xlsx']) - if param['ods']: + if param['ods']: # 'application/vnd.oasis.opendocument.spreadsheet' external_buffer = self.ods2csv(external_buffer, param['ods'])
if not ast.literal_eval(param['beautifulsoup']): # DRTRIGON-88 @@ -404,11 +444,8 @@ logging.getLogger('subster').debug( external_data )
# 5.) subst content - prev_content = content - var_regex = self.get_var_regex(value) - content = var_regex.sub((self._var_regex_str%{'var':value,'cont':external_data}), content, int(param['count'])) - if (content != prev_content): - substed_tags.append(value) + (content, tags) = self.subTag(content, value, external_data, int(param['count'])) + substed_tags += tags else: # DRTRIGON-105: Support for multiple BS template configurations value = param['value'] @@ -431,6 +468,24 @@ if (content != prev_content): substed_tags.append(value+'BS')
+ metadata['bot-timestamp'] = pywikibot.Timestamp.now().isoformat(' ') + + return (content, substed_tags, metadata) + + def subTag(self, content, value, external_data=u'~~~~', count=1): + """Substitute one single tag (of a template) in content. + + Can also be (ab)used to check for presence of a tag. + """ + substed_tags = [] + + # 5.) subst content + prev_content = content + var_regex = self.get_var_regex(value) + content = var_regex.sub((self._var_regex_str%{'var':value,'cont':external_data}), content, count) + if (content != prev_content): + substed_tags.append(value) + return (content, substed_tags)
def outputContentDiff(self, content, substed_content): @@ -513,7 +568,7 @@ # a redirect) (key, value) = map(string.strip, item.split('=')) for linked in outpage.searchentities(key): - outpage = pywikibot.wikidataPage(self.site, linked[u'id']) + outpage = pywikibot.DataPage(self.site, linked[u'id']) #attr = outpage.getentities() attr = linked if (u'aliases' in attr) and (key in attr[u'aliases']): @@ -664,8 +719,8 @@ content = []
for i, message in enumerate(self): - sender = message['from'] # Could possibly be None. - subject = message['subject'] # Could possibly be None. + sender = message['from'] # Could possibly be None. + subject = message['subject'] # Could possibly be None. timestmp = message['date'] # Could possibly be None.
if sender and url[1] in sender: