Revision: 4194 Author: yurik Date: 2007-09-04 03:54:13 +0000 (Tue, 04 Sep 2007)
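Log Message: ----------- Fixed raise expressions (string exceptions replaced with ValueError); added a default wikilog file (wikilog.txt), now opened in __init__ after argument parsing.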
Modified Paths: -------------- trunk/pywikipedia/casechecker.py
Modified: trunk/pywikipedia/casechecker.py =================================================================== --- trunk/pywikipedia/casechecker.py 2007-09-03 23:53:48 UTC (rev 4193) +++ trunk/pywikipedia/casechecker.py 2007-09-04 03:54:13 UTC (rev 4194) @@ -3,7 +3,6 @@ """ Script to enumerate all pages on the wiki and find all titles with mixed latin and cyrilic alphabets. """ -__version__ = '$Id$'
# # Permutations code was taken from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/190465 @@ -18,6 +17,8 @@ yield [items[i]]+cc # End of permutation code
+__version__ = '$Id$' + # # Windows Concose colors # This code makes this script Windows ONLY!!! Feel free to adapt it to another platform @@ -38,7 +39,7 @@ if color == FOREGROUND_BLUE: print '(b:' if color == FOREGROUND_GREEN: print '(g:' if color == FOREGROUND_RED: print '(r:' - + # end of console code
@@ -71,14 +72,14 @@ 'latinsuspects': u'AaBEeIiKkMHOoPpCcTyXx', }, } - + knownWords = set([u'Zемфира', u'KoЯn', u'Deadушки', u'ENTERМУЗЫКА', u'Юz', u'Lюк', u'Яndex', u'КариZма', u'Стогoff', u'UltraВожык', u'Hardcoreманія', u'БМАgroup', u'Tviй', u'Undergroўnd', u'recordц', u'Bэzu']) latLtr = u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - + lclClrFnt = u'<font color=green>' latClrFnt = u'<font color=brown>' suffixClr = u'</font>' - + wordBreaker = re.compile(u'[ _-/|#[]()]')
titles = True @@ -90,11 +91,12 @@ stopAfter = 0 verbose = False wikilog = None + wikilogfile = 'wikilog.txt' autonomous = False namespaces = [] - + def __init__(self): - + for arg in wikipedia.handleArgs(): if arg.startswith('-from'): if arg.startswith('-from:'): @@ -119,10 +121,7 @@ elif arg.startswith('-ns:'): self.namespaces.append( int(arg[4:]) ) elif arg.startswith('-wikilog:'): - try: - self.wikilog = codecs.open(arg[9:], 'a', 'utf-8') - except IOError: - self.wikilog = codecs.open(arg[9:], 'w', 'utf-8') + self.wikilogfile = arg[9:] else: wikipedia.output(u'Unknown argument %s.' % arg) wikipedia.showHelp() @@ -141,28 +140,33 @@
if self.links: self.params['what'] += '|links|categories'; - + self.site = wikipedia.getSite() - + if self.site.lang in self.langs: l = self.langs[self.site.lang] self.localSuspects = l['localsuspects'] self.latinSuspects = l['latinsuspects'] self.localLtr = l['alphabet'] else: - raise u'Unsupported site ' + self.site.lang - + raise ValueError(u'Unsupported site ' + self.site.lang) + if len(self.localSuspects) != len(self.latinSuspects): - raise u'Suspects must be the same size' + raise ValueError(u'Suspects must be the same size')
+ try: + self.wikilog = codecs.open(self.wikilogfile, 'a', 'utf-8') + except IOError: + self.wikilog = codecs.open(self.wikilogfile, 'w', 'utf-8') + self.lclToLatDict = dict([(ord(self.localSuspects[i]), self.latinSuspects[i]) for i in range(len(self.localSuspects))]) self.latToLclDict = dict([(ord(self.latinSuspects[i]), self.localSuspects[i]) for i in range(len(self.localSuspects))]) - + badPtrnStr = u'([%s][%s]|[%s][%s])' % (self.latLtr, self.localLtr, self.localLtr, self.latLtr) self.badPtrn = re.compile(badPtrnStr) self.badWordPtrn = re.compile(u'[%s%s]*%s[%s%s]*' % (self.latLtr, self.localLtr, badPtrnStr, self.latLtr, self.localLtr) ) - - + + def Run(self): try: count = 0 @@ -170,8 +174,8 @@ for namespace in self.namespaces: self.params['apnamespace'] = namespace title = None - - while True: + + while True: # Get data self.params['apfrom'] = self.apfrom data = query.GetData(self.site.lang, self.params, self.verbose) @@ -179,7 +183,7 @@ self.apfrom = data['query']['allpages']['next'] except: self.apfrom = None - + # Process received data if 'pages' in data: firstItem = True @@ -204,11 +208,11 @@ src = wikipedia.Page(self.site, title) src.move( newTitle, u'mixed case rename') changed = True - + if not changed: self.WikiLog(u"* " + err[0]) printed = True - + if self.links: allLinks = None if 'links' in page: @@ -218,12 +222,12 @@ allLinks = allLinks + page['categories'] else: allLinks = page['categories'] - + if allLinks: pageObj = None pageTxt = None msg = [] - + for l in allLinks: ltxt = l['*'] err = self.ProcessTitle(ltxt) @@ -239,26 +243,26 @@ # pageTxt = pageTxt.replace(ltxt, newTitle) # pageTxt = pageTxt.replace(ltxt[0].lower() + ltxt[1:], newTitle[0].lower() + newTitle[1:]) # pageTxt = pageTxt.replace(ltxt.replace(u' ', '_'), newTitle) - + frmParts = self.wordBreaker.split(ltxt) toParts = self.wordBreaker.split(newTitle) if len(frmParts) != len(toParts): - raise u'Splitting parts do not match counts' + raise ValueError(u'Splitting parts do not match 
counts') for i in range(0, len(frmParts)): if len(frmParts[i]) != len(toParts[i]): - raise u'Splitting parts do not match word length' + raise ValueError(u'Splitting parts do not match word length') if len(frmParts[i]) > 0: pageTxt = pageTxt.replace(frmParts[i], toParts[i]) pageTxt = pageTxt.replace(frmParts[i][0].lower() + frmParts[i][1:], toParts[i][0].lower() + toParts[i][1:]) - + if not newTitle: if not printed: self.WikiLog(u"* [[:%s]]: link to %s" % (title, err[0])) printed = True else: self.WikiLog(u"** link to %s" % err[0]) - - + + if pageObj is not None: coloredMsg = u', '.join([self.ColorCodeWord(m) for m in msg]) if pageObj.get() == pageTxt: @@ -271,19 +275,19 @@ raise except: self.WikiLog(u"* Error: Could not save updated page [[:%s]] (%s)" % (title, coloredMsg)) - - + + count += 1 if self.stopAfter > 0 and count == self.stopAfter: raise "Stopping because we are done" - + if self.apfrom is None: break - + self.apfrom = u'' # Restart apfrom for other namespaces
print "***************************** Done" - + except: if self.apfrom is not None: wikipedia.output(u'Exception at Title = %s, Next = %s' % (title, self.apfrom)) @@ -292,15 +296,14 @@
def WikiLog(self, text): wikipedia.output(text) - if self.wikilog: - self.wikilog.write(text + u'\n') - self.wikilog.flush() - + self.wikilog.write(text + u'\n') + self.wikilog.flush() + def ProcessTitle(self, title): - + found = False for m in self.badWordPtrn.finditer(title): - + badWord = title[m.span()[0] : m.span()[1]] if badWord in self.knownWords: continue @@ -316,7 +319,7 @@ mapLcl = {} mapLat = {} found = True - + # See if it would make sense to treat the whole word as either cyrilic or latin mightBeLat = mightBeLcl = True for l in badWord: @@ -327,7 +330,7 @@ if mightBeLcl and l not in self.latinSuspects: mightBeLcl = False if l not in self.latLtr: raise "Assert failed" - + if mightBeLcl: mapLcl[badWord] = badWord.translate(self.latToLclDict) if mightBeLat: @@ -339,10 +342,10 @@
if not found: return None - + infoText = self.MakeLink(title) possibleAlternatives = [] - + if len(mapLcl) + len(mapLat) - ambigBadWordsCount < count: # We cannot auto-translate - offer a list of suggested words suggestions = mapLcl.values() + mapLat.values() @@ -351,7 +354,7 @@ else: infoText += u", no suggestions" else: - + # Replace all unambiguous bad words for k,v in mapLat.items() + mapLcl.items(): if k not in ambigBadWords: @@ -382,19 +385,19 @@ infoText += u", no suggestions"
return (infoText, possibleAlternatives) - + def PickTarget(self, isLink, original, candidates): if len(candidates) == 0: return None - + if isLink: if len(candidates) == 1: return candidates[0] - + pagesDontExist = [] pagesRedir = {} pagesExist = [] - + for newTitle in candidates: dst = wikipedia.Page(self.site, newTitle) if not dst.exists(): @@ -403,7 +406,7 @@ pagesRedir[newTitle] = dst.getRedirectTarget() else: pagesExist.append(newTitle) - + if len(pagesExist) == 1: return pagesExist[0] elif len(pagesExist) == 0 and len(pagesRedir) > 0: @@ -419,7 +422,7 @@ # all redirects point to the same target # pick the first one, doesn't matter what it is return pagesRedir.keys()[0] - + if not self.autonomous: wikipedia.output(u'Could not auto-decide. Which should be chosen?') wikipedia.output(u'Original title: ', newline=False) @@ -431,7 +434,7 @@ else: msg = u'page exists' self.ColorCodeWord(u' %d: %s (%s)\n' % (count, t, msg), True) count += 1 - + answers = [str(i) for i in range(0, count)] choice = int(wikipedia.inputChoice(u'Which link to choose? (0 to skip)', answers, [a[0] for a in answers])) if choice > 0: @@ -444,11 +447,11 @@ if not dst.exists(): # choice = wikipedia.inputChoice(u'Move %s to %s?' % (title, newTitle), ['Yes', 'No'], ['y', 'n']) return newTitle - + return None
def ColorCodeWord(self, word, toScreen = False): - + if not toScreen: res = u"<b>" lastIsCyr = word[0] in self.localLtr if lastIsCyr: @@ -471,14 +474,14 @@ lastIsCyr = False if toScreen: wikipedia.output(l, newline=False) else: res += l - + if toScreen: SetColor(FOREGROUND_WHITE) else: return res + self.suffixClr + u"</b>" -
+ def MakeLink(self, title): return u"[[:%s|««« %s »»»]]" % (title, self.ColorCodeWord(title)) - + if __name__ == "__main__": try: bot = CaseChecker()