Revision: 5449 Author: nicdumz Date: 2008-05-28 10:37:23 +0000 (Wed, 28 May 2008)
Log Message: ----------- * Now skipping autocreated users (SUL)
* CODE CLEANUP !!!!! : ** don't compile a static regex several times inside a while/for loop; compile it once before the loop. Similarly, don't call string.lower() repeatedly in several 'for' loops; just define lower_str = string.lower() once.
** replacing 'if test: var = True; break; else: var = False; break' with 'var = test; break'
** when you *know* that a boolean var has been assigned, avoid 'if var == True:' : 'if var:' is enough
** using re.finditer instead of 'pos = 0;while 1: x = re.search(...); pos = x.end(); ...'
** We were yielding None entries in a generator, and then, when calling the generator, doing: 'for x in gen(): if x==None: continue; else: ...' .... Just don't yield None entries; it works the same and is simpler.
** When you know that some limits are induced by your code to your script, commenting for the devs "#won't work for v>50" is fine, but warning the end-user, in the documentation, or with some output, is even better.
** Adding several FIXMEs where I found weird lines without being able to find a proper fix. Please take a look.
Modified Paths: -------------- trunk/pywikipedia/welcome.py
Modified: trunk/pywikipedia/welcome.py =================================================================== --- trunk/pywikipedia/welcome.py 2008-05-28 08:24:37 UTC (rev 5448) +++ trunk/pywikipedia/welcome.py 2008-05-28 10:37:23 UTC (rev 5449) @@ -45,7 +45,7 @@ This script understands the following command-line arguments:
-edit[:#] Define how many edits a new user needs to be welcomed - (default: 1) + (default: 1, max: 50)
-time[:#] Define how many seconds the bot sleeps before restart (default: 3600) @@ -374,48 +374,39 @@
def load_word_function(wsite, raw): """ This is a function used to load the badword and the whitelist.""" - list_loaded = list() - pos = 0 - # I search with a regex how many user have not the talk page - # and i put them in a list (i find it more easy and secure). - while 1: - regl = r"("|')(.*?)("|')(, |))" - page = re.compile(regl, re.UNICODE) - xl = page.search(raw, pos) - if xl == None: - if len(list_loaded) >= 1: - wikipedia.output(u'\nReal-time list loaded.') - return list_loaded - break - elif len(done) == 0: - wikipedia.output(u'There was no input on the real-time page.') - load_2 = False - continue - pos = xl.end() - badword = xl.group(2) - if badword not in list_loaded: - list_loaded.append(badword) + regl = r"(?:"|')(.*?)(?:"|')(?:, |))" + page = re.compile(regl, re.UNICODE)
+ list_loaded = page.findall(raw) + + if len(list_loaded) == 0: + wikipedia.output(u'There was no input on the real-time page.') + else: + wikipedia.output(u'\nReal-time list loaded.') + return list_loaded + def parselog(wsite, raw, talk, number): """ The function to load the users (only users who have a certain number of edits) """ + #FIXME : Why is there a need for this 'done' list ? done = list() - pos = 0 + + autocreated = wikipedia.mediawiki_message('newuserlog-autocreate-entry') + # I search with a regex how many user have not the talk page # and i put them in a list (i find it more easy and secure). - while 1: - # FIXME: That's the regex, if there are problems, take a look here. - - reg = r'(<a href="' + re.escape(wsite.path()) + r'?title=%s(?P<user>.*?)&(?:amp;|)action=(?:edit|editredlink|edit&redlink=1)"' % talk - p = re.compile(reg, re.UNICODE) - x = p.search(raw, pos) - if x == None: - if len(done) >= 1: - wikipedia.output(u'\nLoaded all users...') - break - elif len(done) == 0: - wikipedia.output(u'There is nobody to be welcomed...') - break - pos = x.end() + + # XXX: That's the regex, if there are problems, take a look here. + + reg = u'(<a href="' + re.escape(wsite.path()) + + u'?title=%s(?P<user>.*?)&(?:amp;|)action=(?:edit|editredlink|edit&redlink=1)"' % talk + + u'.*?) (?P<reason>.*?) </li>' + p = re.compile(reg, re.UNICODE) + + for x in p.finditer(raw): + #skip autocreated users (SUL) + if autocreated in x.group('reason'): + wikipedia.output(u'%s has been created automatically, skipping...') + continue username = x.group('user') if username not in done: done.append(username) @@ -425,7 +416,11 @@ con = '%sSpecial:Contributions/%s' % (pathWiki, userpage.urlname()) # Getting the contribs... contribs = wsite.getUrl(con) - contribnum = contribs.count('<li>') # It counts the first 50 edits but it shouldn't be a problem. 
+ + #FIXME: It counts the first 50 edits + # if number > 50, it won't work + contribnum = contribs.count('<li>') + if contribnum >= number: wikipedia.output(u'%s has enough edits to be welcomed' % userpage.titleWithoutNamespace() ) # The user must be welcomed, return his data. @@ -433,15 +428,19 @@ elif contribnum < number: if contribnum == 0: wikipedia.output(u'%s has no contributions.' % userpage.titleWithoutNamespace() ) - # That user mustn't be welcomed, return None. - yield None else: wikipedia.output(u'%s has only %s contributions.' % (userpage.titleWithoutNamespace(), str(contribnum)) ) - # That user mustn't be welcomed, return None. - yield None + # That user mustn't be welcomed. + continue
+ if len(done) == 0: + wikipedia.output(u'There is nobody to be welcomed...') + else: + wikipedia.output(u'\nLoaded all users...') + def report(wsite, rep_page, username, com, rep): """ The function to report the username to a wiki-page. """ + another_page = wikipedia.Page(wsite, rep_page) if another_page.exists(): text_get = another_page.get() @@ -482,7 +481,7 @@ """ Function to load the random signatures. """ reg = r"^* ?(.*?)$" creg = re.compile(reg, re.M) - if fileOption == False: + if not fileOption: signPage = wikipedia.Page(wsite, signPageTitle) signText = signPage.get() else: @@ -696,7 +695,7 @@ welcomer = u'{{subst:Benvenuto}} %s'
welcomed_users = list() - if savedata == True and os.path.exists( + if savedata and os.path.exists( wikipedia.config.datafilepath(filename)): f = file(filename) number_user = cPickle.load(f) @@ -707,7 +706,7 @@
# Here there is the main loop. while True: - if filter_wp == True: + if filter_wp: # A standard list of bad username components (you can change/delate it in your project...). # [ I divided the list into three to make it smaller...] elencoaf = [' ano', ' anus', 'anal ', 'babies', 'baldracca', 'balle', 'bastardo', @@ -756,7 +755,7 @@ elencovarie = list() # Joining the three lists.. elenco = elencoaf + elencogz + elencovarie - if filter_wp == True: + if filter_wp: # That is the default whitelist (it contains few name because it has been improved in the latest days..). whitelist_default = ['emiliano'] if wtlpg != None: @@ -791,7 +790,7 @@ log = wsite.getUrl(URL) wikipedia.output(u'Loading latest %s new users from %s...\n' % (limit, wsite.hostname())) # Determine which signature to use - if random == True: + if random: try: wikipedia.output(u'Loading random signatures...') signList = defineSign(wsite, signPageTitle, fileSignName, fileOption) @@ -799,10 +798,8 @@ wikipedia.output(u'The list with signatures is not available... Using default signature...') random = False for found_result in parselog(wsite, log, talk, number): - if found_result == None: - continue # Compiling the signature to be used. - if random == True: + if random: if number_user + 1 > len(signList): number_user = 0 yield number_user @@ -828,10 +825,12 @@ wikipedia.output(u'%s has been blocked! Skipping...' % usertalkpage.titleWithoutNamespace()) continue # Understand if the user has a bad-username. + username = str(username).encode(config.console_encoding) + lower_uname = username.lower() for word in elenco: - username = str(username).encode(config.console_encoding) - if word.lower() in username.lower(): + if word.lower() in lower_uname: baduser = True + # What's this ? Docu please. if wsite.lang == 'it': final_rep = "%s%s}}" % (rep_text, word) break @@ -839,40 +838,46 @@ final_rep = rep_text break # Checking in the whitelist... 
+ + # FIXME I believe this is broken for xy in whitelist: - if xy.lower() in username.lower(): - username.replace(xy, '') + if xy.lower() in lower_uname: + # the next line does *not* change username + # besides replacing xy is useless if only xy.lower() + # is in username + lower_uname.replace(xy, '') for word in elenco: - if word.lower() in username.lower(): - baduser = True - break - else: - baduser = False - break + baduser = word.lower() in lower_uname: + break # He has a badusername, trying to report him... if baduser: + # FIXME : while 1 ? What the... ? Do we need a while here ? Documentation please. while 1: - if ask == True: + if ask: wikipedia.output(u'%s may have an unwanted username, what shall I do?' % usertalkpage.titleWithoutNamespace()) + + # FIXME : consider using inputChoice here + answer = wikipedia.input(u'[B]lock or [W]elcome?') for w in block: if w in answer: if not usertalkpage.exists(): # Check if the user has been already blocked (second check). - ki = blocked(wsite, username) - if ki == True: + if blocked(wsite, username): wikipedia.output(u'%s has been blocked! Skipping him...' % usertalkpage.titleWithoutNamespace()) + # FIXME: that continue will continue on "for w in block:". Do we really want to do this ? continue report(wsite, rep_page, username, com, final_rep) break else: wikipedia.output(u'The discussion page of the bad-user already exists...') running = False + #FIXME : Why don't we break here ? for w in say_hi: if w in answer: baduser = False break - elif ask == False: + else: wikipedia.output(u'%s is possibly an unwanted username. He will be reported.' % usertalkpage.titleWithoutNamespace()) if not usertalkpage.exists(): report(wsite, rep_page, username, com, final_rep) @@ -898,7 +903,7 @@ wikipedia.output(u'%s has been already welcomed when i was loading all the users... 
skipping' % usertalkpage.titleWithoutNamespace()) continue # That's the log - if log_variable == True and logg: + if log_variable and logg: if len(welcomed_users) == 1: wikipedia.output(u'One user has been welcomed.') elif len(welcomed_users) == 0: @@ -916,7 +921,7 @@ # If we haven't to report, do nothing. elif log_variable == False: pass - if log_variable == True and logg and len(welcomed_users) != 0: + if log_variable and logg and len(welcomed_users) != 0: if len(welcomed_users) == 1: wikipedia.output(u'Putting the log of the latest user...') else: @@ -926,7 +931,7 @@ if logresult2 == False: continue # If recursive, don't exit, repeat after one hour. - if recursive == True: + if recursive : waitstr = unicode(time_variable) if locale.getlocale()[1]: strfstr = unicode(time.strftime(u"%d %b %Y %H:%M:%S (UTC)", time.gmtime()), locale.getlocale()[1]) @@ -936,6 +941,7 @@ time.sleep(time_variable) # If not recursive, break. elif recursive == False: + #FIXME : others "yields" yield a single integer. Why are we doing this here ? 'STOP' is not even being retrieved yield [number_user, 'STOP']
if __name__ == "__main__": @@ -949,6 +955,9 @@ random = settingsBot[11] savedata = settingsBot[13] # I need to know what is the number_user, in this way I get it. + #FIXME: Do we need to do this ? + # in other words, why can't main() return a SINGLE value, + # an integer, number_user ? for x in main(settingsBot): try: number_user = x[0] @@ -957,11 +966,12 @@ else: break except wikipedia.BadTitle: + #FIXME : This kind of error should be catched earlier. wikipedia.output(u"Wikidown or server's problem. Quit.") wikipedia.stopme() finally: # If there is the savedata, the script must save the number_user. - if random == True and savedata == True and number_user != None: + if random and savedata and number_user != None: f = file(filename, 'w') cPickle.dump(number_user, f) f.close()