Revision: 4630 Author: yurik Date: 2007-12-03 14:59:19 +0000 (Mon, 03 Dec 2007)
Log Message: ----------- Updated query and casechecker
Modified Paths: -------------- trunk/pywikipedia/casechecker.py trunk/pywikipedia/query.py
Modified: trunk/pywikipedia/casechecker.py =================================================================== --- trunk/pywikipedia/casechecker.py 2007-12-03 14:27:54 UTC (rev 4629) +++ trunk/pywikipedia/casechecker.py 2007-12-03 14:59:19 UTC (rev 4630) @@ -72,30 +72,28 @@ 'ru': u'[[ВП:КЛ]]', }
- langs = { - 'ru': { - 'alphabet' : u'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯяІі', - 'localsuspects': u'АаВЕеКкМНОоРрСсТуХхІі', - 'latinsuspects': u'AaBEeKkMHOoPpCcTyXxIi', - }, - 'uk': { - 'alphabet' : u'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяЬь', - 'localsuspects': u'АаВЕеІіКкМНОоРрСсТУуХх', - 'latinsuspects': u'AaBEeIiKkMHOoPpCcTYyXx', - }, - 'bg': { - 'alphabet' : u'АаБбВвГгДдЕеЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя', - 'localsuspects': u'АаВЕеКкМНОоРрСсТуХх', - 'latinsuspects': u'AaBEeKkMHOoPpCcTyXx', - }, - 'be': { - 'alphabet' : u'АаБбВвГ㥴ДдЖжЗзЕеЁёЖжЗзІіЙйКкЛлМмНнОоПпРрСсТтУуЎўФфХхЦцЧчШшЫыЬьЭэЮюЯя', - 'localsuspects': u'АаВЕеІіКкМНОоРрСсТуХх', - 'latinsuspects': u'AaBEeIiKkMHOoPpCcTyXx', - }, + # These words are always in one language, even though they could be typed in both + alwaysInLocal = [ u'СССР', u'Как', u'как' ] + alwaysInLatin = [ u'II', u'III' ] + + localUpperLtr = u'ЁІЇЎАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯҐ' + localLowerLtr = u'ёіїўабвгдежзийклмнопрстуфхцчшщъыьэюяґ' + localLtr = localUpperLtr + localLowerLtr + + localSuspects = u'АВЕКМНОРСТХІЁЇаеорсухіёї' + latinSuspects = u'ABEKMHOPCTXIËÏaeopcyxiëï' + + localKeyboard = u'йцукенгшщзфывапролдячсмить' # possibly try to fix one character mistypes in an alternative keyboard layout + latinKeyboard = u'qwertyuiopasdfghjklzxcvbnm' + + romanNumChars = u'IVXLMC' + romannumSuffixes = localLowerLtr # all letters that may be used as suffixes after roman numbers: "Iый" + romanNumSfxPtrn = re.compile(u'^[' + romanNumChars + ']+[' + localLowerLtr + ']+$') + + whitelists = { + 'ru': u'ВП:КЛ/Whitelist' } - - knownWords = set([u'Zемфира', u'KoЯn', u'Deadушки', u'ENTERМУЗЫКА', u'Юz', u'Lюк', u'Яndex', u'КариZма', u'Стогoff', u'UltraВожык', u'Hardcoreманія', u'БМАgroup', u'Tviй', u'Undergroўnd', u'recordц', u'Bэzu']) + latLtr = u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
lclClrFnt = u'<font color=green>' @@ -163,16 +161,10 @@
self.site = wikipedia.getSite()
- if self.site.lang in self.langs: - l = self.langs[self.site.lang] - self.localSuspects = l['localsuspects'] - self.latinSuspects = l['latinsuspects'] - self.localLtr = l['alphabet'] - else: - raise ValueError(u'Unsupported site ' + self.site.lang) - if len(self.localSuspects) != len(self.latinSuspects): raise ValueError(u'Suspects must be the same size') + if len(self.localKeyboard) != len(self.latinKeyboard): + raise ValueError(u'Keyboard info must be the same size')
if not os.path.isabs(self.wikilogfile): self.wikilogfile = wikipedia.config.datafilepath(self.wikilogfile) @@ -188,10 +180,44 @@ self.localSuspects[i]) for i in range(len(self.localSuspects))])
+ if self.localKeyboard is not None: + self.lclToLatKeybDict = dict([(ord(self.localKeyboard[i]), + self.latinKeyboard[i]) + for i in range(len(self.localKeyboard))]) + self.latToLclKeybDict = dict([(ord(self.latinKeyboard[i]), + self.localKeyboard[i]) + for i in range(len(self.localKeyboard))]) + else: + self.lclToLatKeybDict = {} + self.latToLclKeybDict = {} + badPtrnStr = u'([%s][%s]|[%s][%s])' % (self.latLtr, self.localLtr, self.localLtr, self.latLtr) - self.badPtrn = re.compile(badPtrnStr) self.badWordPtrn = re.compile(u'[%s%s]*%s[%s%s]*' % (self.latLtr, self.localLtr, badPtrnStr, self.latLtr, self.localLtr) ) + + # Get whitelist + if self.site.lang in self.whitelists: + wlpage = self.whitelists[self.site.lang] + wikipedia.output(u'Loading whitelist from %s' % wlpage) + wlparams = { + 'action' : 'query', + 'prop' : 'links', + 'titles' : wlpage, + 'redirects' : '', + 'indexpageids' : '', + }
+ data = query.GetData(self.site.lang, wlparams, wikipedia.verbose, useAPI=True, encodeTitle=False) + if len(data['query']['pageids']) == 1: + pageid = data['query']['pageids'][0] + links = data['query']['pages'][pageid]['links'] + self.knownWords = set( [n['title'] for n in links] ) + else: + raise ValueError(u'The number of pageids is not 1') + wikipedia.output(u'Loaded whitelist with %i items' % len(self.knownWords)) + if wikipedia.verbose and len(self.knownWords) > 0: + wikipedia.output(u'Whitelist: [[%s]]' % u']], [['.join(self.knownWords)) + else: + wikipedia.output(u'Whitelist is not known for language %s' % self.site.lang)
def Run(self): try: @@ -341,6 +367,10 @@ if badWord in self.knownWords: continue
+ # Allow any roman numerals with local suffixes + if self.romanNumSfxPtrn.match(badWord) is not None: + continue + if not found: # lazy-initialization of the local variables possibleWords = [] @@ -364,6 +394,13 @@ mightBeLcl = False if l not in self.latLtr: raise "Assert failed"
+ # Some words are well known and frequently mixed-typed + if mightBeLcl and mightBeLat: + if badWord in self.alwaysInLocal: + mightBeLat = False + elif badWord in self.alwaysInLatin: + mightBeLcl = False + + if mightBeLcl: + mapLcl[badWord] = badWord.translate(self.latToLclDict) + if mightBeLat:
Modified: trunk/pywikipedia/query.py =================================================================== --- trunk/pywikipedia/query.py 2007-12-03 14:27:54 UTC (rev 4629) +++ trunk/pywikipedia/query.py 2007-12-03 14:59:19 UTC (rev 4630) @@ -6,7 +6,7 @@ import urllib import time
-def GetData( lang, params, verbose = False, useAPI = False, retryCount = 5 ): +def GetData( lang, params, verbose = False, useAPI = False, retryCount = 5, encodeTitle = True ): """Get data from the query api, and convert it into a data object """ site = wikipedia.getSite( lang ) @@ -25,13 +25,13 @@ params[k] = ToUtf8(v)
# Titles param might be long, case convert it to post request + data = None + titlecount = 0 if 'titles' in params: - data = urllib.urlencode( {'titles' : params['titles']} ) titlecount = params['titles'].count('|') - del params['titles'] - else: - data = None - titlecount = 0 + if encodeTitle: + data = urllib.urlencode( {'titles' : params['titles']} ) + del params['titles']
if useAPI: path = site.api_address() + urllib.urlencode( params.items() ) @@ -54,7 +54,6 @@ # This will also work, but all unicode strings will need to be converted from \u notation # decodedObj = eval( jsontext ) return simplejson.loads( jsontext ) - break
except ValueError, error: retryCount -= 1