Revision: 4665
Author:   filnik
Date:     2007-12-09 15:01:53 +0000 (Sun, 09 Dec 2007)

Log Message:
-----------
Adding comments, adding an example, adding link to botwiki if you need help

Modified Paths:
--------------
    trunk/pywikipedia/add_text.py
Modified: trunk/pywikipedia/add_text.py
===================================================================
--- trunk/pywikipedia/add_text.py	2007-12-09 14:46:49 UTC (rev 4664)
+++ trunk/pywikipedia/add_text.py	2007-12-09 15:01:53 UTC (rev 4665)
@@ -32,6 +32,17 @@
 -untagged         Add text to the images that don't have any license template
 -always           If used, the bot won't ask whether it should add the specified text
 -up               If used, put the text above and not below
+
+--- Example ---
+
+python add_text.py -start:! -summary:"Bot: Adding a template" -text:"{{Something}}" -except:"{{(?:[Tt]emplate:|)[Ss]omething" -up
+
+--- Credits and Help ---
+This script has been written by Botwiki's staff. If you want to help us,
+or if you need help with this script, you can find us here:
+
+* http://botwiki.sno.cc
+
 """
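For clarity, a minimal standalone sketch (not part of the script) of how the -except regex from the example above decides whether a page gets skipped; the sample page source is made up:

    import re

    regexSkip = r"{{(?:[Tt]emplate:|)[Ss]omething"
    # Hypothetical page source, used only for this demonstration.
    pageSource = u"Some text...\n{{Something}}\n[[Category:Example]]"

    if re.findall(regexSkip, pageSource):
        print "Template already there: the page would be skipped."
    else:
        print "The bot would add {{Something}} to the page."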
 #
@@ -51,88 +62,63 @@
 class NothingFound(wikipedia.Error):
     """ An exception indicating that a regex has returned [] instead of results. """
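As a hedged sketch of how NothingFound is meant to be consumed (the generator that raises it is defined just below; the project string is illustrative):

    try:
        for page in untaggedGenerator('it.wikipedia'):
            wikipedia.output(u'Untagged image: %s' % page.title())
    except NothingFound, error:
        wikipedia.output(u'The tool returned no results: %s' % error)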
+# Useful for the untagged function
 def pageText(url):
-    try:
-        request = urllib2.Request(url)
-        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
-        request.add_header("User-Agent", user_agent)
-        response = urllib2.urlopen(request)
-        text = response.read()
-        response.close()
-    # When you load too many pages, urllib2 can give this error.
-    except urllib2.HTTPError:
-        wikipedia.output(u"Server error. Pausing for 10 seconds... " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()))
-        time.sleep(10)
-        request = urllib2.Request(url)
-        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
-        request.add_header("User-Agent", user_agent)
-        response = urllib2.urlopen(request)
-        text = response.read()
-        response.close()
-    return text
+    """ Function to load the HTML text of a URL """
+    try:
+        request = urllib2.Request(url)
+        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
+        request.add_header("User-Agent", user_agent)
+        response = urllib2.urlopen(request)
+        text = response.read()
+        response.close()
+    # When you load too many pages, urllib2 can give this error.
+    except urllib2.HTTPError:
+        wikipedia.output(u"Server error. Pausing for 10 seconds... " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()))
+        time.sleep(10)
+        request = urllib2.Request(url)
+        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
+        request.add_header("User-Agent", user_agent)
+        response = urllib2.urlopen(request)
+        text = response.read()
+        response.close()
+    return text
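A short usage sketch for pageText(); the URL is a placeholder, and note that if the retry inside the except branch fails again, the second HTTPError simply propagates to the caller:

    url = 'http://example.org/some-report'  # placeholder, not a URL the script uses
    html = pageText(url)                     # retries once after an HTTPError
    wikipedia.output(u'Fetched %d bytes' % len(html))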
 def untaggedGenerator(untaggedProject, limit = 500):
-    lang = untaggedProject.split('.', 1)[0]
-    project = '.' + untaggedProject.split('.', 1)[1]
-    if lang == 'commons':
-        link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikifam=commo...'
-    else:
-        link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikilang=' + lang + '&wikifam=' + project + '&order=img_timestamp&max=' + str(limit) + '&ofs=0&max=' + str(limit)
-    text = pageText(link)
-    #print text
-    regexp = r"""<td valign='top' title='Name'><a href='http://.*?\..*?\.org/w/index\.php\?title=(.*?)'>.*?</a></td>"""
-    results = re.findall(regexp, text)
-    if results == []:
-        print link
-        raise NothingFound('Nothing found! Try to use the tool by yourself to be sure that it works!')
-    else:
-        for result in results:
-            yield wikipedia.Page(self.site, result)
+    """ Function to get the pages returned by this tool: http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php """
+    lang = untaggedProject.split('.', 1)[0]
+    project = '.' + untaggedProject.split('.', 1)[1]
+    if lang == 'commons':
+        link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikifam=commo...'
+    else:
+        link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikilang=' + lang + '&wikifam=' + project + '&order=img_timestamp&max=' + str(limit) + '&ofs=0&max=' + str(limit)
+    text = pageText(link)
+    #print text
+    regexp = r"""<td valign='top' title='Name'><a href='http://.*?\.org/w/index\.php\?title=(.*?)'>.*?</a></td>"""
+    results = re.findall(regexp, text)
+    if results == []:
+        print link
+        raise NothingFound('Nothing found! Try to use the tool by yourself to be sure that it works!')
+    else:
+        for result in results:
+            yield wikipedia.Page(self.site, result)
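To make the scraping step concrete, a sketch of the new regexp run against a made-up table row in the format the tool emits (the image name is invented):

    import re

    sample = "<td valign='top' title='Name'><a href='http://en.wikipedia.org/w/index.php?title=Image:Example.jpg'>Image:Example.jpg</a></td>"
    regexp = r"""<td valign='top' title='Name'><a href='http://.*?\.org/w/index\.php\?title=(.*?)'>.*?</a></td>"""
    print re.findall(regexp, sample)  # prints ['Image:Example.jpg']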
-def newImages(limit):
-    # Search regular expression to find links like this (and the class attribute is optional too)
-    # class="new" title="Immagine:Soldatino2.jpg">Immagine:Soldatino2.jpg</a>" <span class="comment">
-    url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % int(limit)
-    site = wikipedia.getSite()
-    textrun = site.getUrl(url)
-    image_namespace = site.image_namespace() + ":"
-    regexp = r'(class="new" |)title="' + image_namespace + '(.*?).(\w\w\w|jpeg)">.*?</a>".*?<span class="comment">'
-    pos = 0
-    done = list()
-    ext_list = list()
-    r = re.compile(regexp, re.UNICODE)
-    while 1:
-        m = r.search(textrun, pos)
-        if m == None:
-            wikipedia.output(u"\t\t>> All images checked. <<")
-            break
-        pos = m.end()
-        new = m.group(1)
-        im = m.group(2)
-        ext = m.group(3)
-        # This prevents pages with strange characters. They will be loaded without problems.
-        image = im + "." + ext
-        if new != '':
-            wikipedia.output(u"Skipping %s because it has been deleted." % image)
-            done.append(image)
-        if image not in done:
-            done.append(image)
-            yield wikipedia.Page(site, 'Image:%s' % image)
-
 def main():
+    # When a page is tagged as "really well written" it has a star in the interwiki links.
+    # This is a list of all the templates used (in regex format) to make the stars appear.
     starsList = ['link[ _]fa', 'link[ _]adq', 'enllaç[ _]ad', 'link[ _]ua', 'legătură[ _]af', 'destacado', 'ua', 'liên k[ _]t[ _]chọn[ _]lọc']
-    summary = None
-    addText = None
-    regexSkip = None
-    generator = None
-    always = False
-    exceptUrl = False
+    # If None, the variable is set only for checking purposes.
+    summary = None; addText = None; regexSkip = None
+    generator = None; always = False; exceptUrl = False
+    # Load a lot of default generators
     genFactory = pagegenerators.GeneratorFactory()
     errorCount = 0
+    # Put the text above or below the existing text?
     up = False
-
+
+    # Loading the arguments
     for arg in wikipedia.handleArgs():
         if arg.startswith('-text'):
             if len(arg) == 5:
@@ -173,20 +159,23 @@
                 limit = wikipedia.input(u'How many images do you want to check?')
             else:
                 limit = arg[11:]
-            generator = newImages(limit)
+            generator = pagegenerators.newImages(limit, wikipedia.getSite())
         elif arg == '-always':
             always = True
         else:
             generator = genFactory.handleArg(arg)
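To illustrate the generator swap above, a hedged sketch of the new wiring (the limit is arbitrary; the call signature is the one this revision uses):

    site = wikipedia.getSite()
    # The local newImages() is gone; the framework generator replaces it.
    generator = pagegenerators.newImages(100, site)
    for page in generator:
        wikipedia.output(u'New image: %s' % page.title())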
     site = wikipedia.getSite()
+    # /wiki/ is not always the right path in non-wiki projects
     pathWiki = site.family.nicepath(site.lang)
+    # Check that the minimal settings are there
     if not generator:
         raise NoEnoughData('You have to specify the generator you want to use for the script!')
     if not addText:
         raise NoEnoughData('You have to specify what text you want to add!')
     if not summary:
         summary = 'Bot: Adding %s' % addText
+    # Main Loop
     for page in generator:
         wikipedia.output(u'Loading %s...' % page.title())
         try:
@@ -197,6 +186,7 @@
         except wikipedia.IsRedirectPage:
             wikipedia.output(u"%s is a redirect, skip!" % page.title())
             continue
+        # Understand whether the bot has to skip the page or not
         if regexSkip and exceptUrl:
             url = '%s%s' % (pathWiki, page.urlname())
             result = re.findall(regexSkip, site.getUrl(url))
@@ -205,23 +195,30 @@
         else:
             result = []
         if result != []:
-            wikipedia.output(u'Exception! regex (or word) use with -except, is in the page. Skip!')
+            wikipedia.output(u'Exception! The regex (or word) used with -except is in the page. Skip!')
             continue
+        # If not up, the text is put below
         if not up:
             newtext = text
             categoryNamespace = site.namespace(14)
+            # Getting the categories
             regexpCat = re.compile(r'\[\[((?:category|%s):.*?)\]\]' % categoryNamespace.lower(), re.I)
             categorieInside = regexpCat.findall(text)
+            # Deleting the categories
             newtext = wikipedia.removeCategoryLinks(newtext, site)
+            # Getting the interwiki
             interwikiInside = page.interwiki()
             interwikiList = list()
             for paginetta in interwikiInside:
                 nome = str(paginetta).split('[[')[1].split(']]')[0]
                 interwikiList.append(nome)
                 lang = nome.split(':')[0]
+            # Removing the interwiki
             newtext = wikipedia.removeLanguageLinks(newtext, site)
+            # Sorting the interwiki
             interwikiList.sort()
             newtext += "\n%s" % addText
+            # Putting the categories back
             for paginetta in categorieInside:
                 try:
                     newtext += '\n[[%s]]' % paginetta.decode('utf-8')
@@ -231,6 +228,7 @@
                 except UnicodeEncodeError:
                     newtext += '\n[[%s]]' % paginetta
             newtext += '\n'
+            # Dealing with the stars' issue
             starsListInPage = list()
             for star in starsList:
                 regex = re.compile('({{(?:template:|)%s\|.*?}}\n)' % star, re.I)
@@ -239,6 +237,7 @@
                 newtext = regex.sub('', newtext)
                 for element in risultato:
                     newtext += '\n%s' % element
+            # Adding the interwiki
             for paginetta in interwikiList:
                 try:
                     newtext += '\n[[%s]]' % paginetta.decode('utf-8')
@@ -247,11 +246,13 @@
                     newtext += '\n[[%s]]' % paginetta.decode('Latin-1')
                 except UnicodeEncodeError:
                     newtext += '\n[[%s]]' % paginetta
+        # If instead the text must be added above...
         else:
             newtext = addText + '\n' + text
         wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
         wikipedia.showDiff(text, newtext)
         choice = ''
+        # Let's apply the changes.
         while 1:
             if not always:
                 choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
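The "not up" branch above enforces a bottom-of-page layout: body text, then the added text, then categories, star templates and interwiki links. A condensed sketch of that ordering, assuming the stock pywikipedia helpers getCategoryLinks/getLanguageLinks and their replace counterparts behave as in wikipedia.py of this era (the script itself does the same job by hand with regexes):

    text = page.get()
    cats = wikipedia.getCategoryLinks(text, site)  # remember the categories
    iw = wikipedia.getLanguageLinks(text)          # remember the interwiki
    body = wikipedia.removeCategoryLinks(text, site)
    body = wikipedia.removeLanguageLinks(body, site)
    newtext = body + '\n' + addText                # the new text goes below the body
    newtext = wikipedia.replaceCategoryLinks(newtext, cats, site)
    newtext = wikipedia.replaceLanguageLinks(newtext, iw, site)
    page.put(newtext, comment = summary)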