Revision: 4630
Author: yurik
Date: 2007-12-03 14:59:19 +0000 (Mon, 03 Dec 2007)
Log Message:
-----------
Updated query and casechecker
Modified Paths:
--------------
trunk/pywikipedia/casechecker.py
trunk/pywikipedia/query.py
Modified: trunk/pywikipedia/casechecker.py
===================================================================
--- trunk/pywikipedia/casechecker.py 2007-12-03 14:27:54 UTC (rev 4629)
+++ trunk/pywikipedia/casechecker.py 2007-12-03 14:59:19 UTC (rev 4630)
@@ -72,30 +72,28 @@
'ru': u'[[ВП:КЛ]]',
}
- langs = {
- 'ru': {
- 'alphabet' : u'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯяІі',
- 'localsuspects': u'АаВЕеКкМНОоРрСсТуХхІі',
- 'latinsuspects': u'AaBEeKkMHOoPpCcTyXxIi',
- },
- 'uk': {
- 'alphabet' : u'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяЬь',
- 'localsuspects': u'АаВЕеІіКкМНОоРрСсТУуХх',
- 'latinsuspects': u'AaBEeIiKkMHOoPpCcTYyXx',
- },
- 'bg': {
- 'alphabet' : u'АаБбВвГгДдЕеЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЮюЯя',
- 'localsuspects': u'АаВЕеКкМНОоРрСсТуХх',
- 'latinsuspects': u'AaBEeKkMHOoPpCcTyXx',
- },
- 'be': {
- 'alphabet' : u'АаБбВвГ㥴ДдЖжЗзЕеЁёЖжЗзІіЙйКкЛлМмНнОоПпРрСсТтУуЎўФфХхЦцЧчШшЫыЬьЭэЮюЯя',
- 'localsuspects': u'АаВЕеІіКкМНОоРрСсТуХх',
- 'latinsuspects': u'AaBEeIiKkMHOoPpCcTyXx',
- },
+ # These words are always in one language, even though they could be typed in both
+ alwaysInLocal = [ u'СССР', u'Как', u'как' ]
+ alwaysInLatin = [ u'II', u'III' ]
+
+ localUpperLtr = u'ЁІЇЎАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯҐ'
+ localLowerLtr = u'ёіїўабвгдежзийклмнопрстуфхцчшщъыьэюяґ'
+ localLtr = localUpperLtr + localLowerLtr
+
+ localSuspects = u'АВЕКМНОРСТХІЁЇаеорсухіёї'
+ latinSuspects = u'ABEKMHOPCTXIËÏaeopcyxiëï'
+
+ localKeyboard = u'йцукенгшщзфывапролдячсмить' # possibly try to fix one character mistypes in an alternative keyboard layout
+ latinKeyboard = u'qwertyuiopasdfghjklzxcvbnm'
+
+ romanNumChars = u'IVXLMC'
+ romannumSuffixes = localLowerLtr # all letters that may be used as suffixes after roman numbers: "Iый"
+ romanNumSfxPtrn = re.compile(u'^[' + romanNumChars + ']+[' + localLowerLtr + ']+$')
+
+ whitelists = {
+ 'ru': u'ВП:КЛ/Whitelist'
}
-
- knownWords = set([u'Zемфира', u'KoЯn', u'Deadушки', u'ENTERМУЗЫКА', u'Юz', u'Lюк', u'Яndex', u'КариZма', u'Стогoff', u'UltraВожык', u'Hardcoreманія', u'БМАgroup', u'Tviй', u'Undergroўnd', u'recordц', u'Bэzu'])
+
latLtr = u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
lclClrFnt = u'<font color=green>'
@@ -163,16 +161,10 @@
self.site = wikipedia.getSite()
- if self.site.lang in self.langs:
- l = self.langs[self.site.lang]
- self.localSuspects = l['localsuspects']
- self.latinSuspects = l['latinsuspects']
- self.localLtr = l['alphabet']
- else:
- raise ValueError(u'Unsupported site ' + self.site.lang)
-
if len(self.localSuspects) != len(self.latinSuspects):
raise ValueError(u'Suspects must be the same size')
+ if len(self.localKeyboard) != len(self.latinKeyboard):
+ raise ValueError(u'Keyboard info must be the same size')
if not os.path.isabs(self.wikilogfile):
self.wikilogfile = wikipedia.config.datafilepath(self.wikilogfile)
@@ -188,10 +180,44 @@
self.localSuspects[i])
for i in range(len(self.localSuspects))])
+ if self.localKeyboard is not None:
+ self.lclToLatKeybDict = dict([(ord(self.localKeyboard[i]),
+ self.latinKeyboard[i])
+ for i in range(len(self.localKeyboard))])
+ self.latToLclKeybDict = dict([(ord(self.latinKeyboard[i]),
+ self.localKeyboard[i])
+ for i in range(len(self.localKeyboard))])
+ else:
+ self.lclToLatKeybDict = {}
+ self.latToLclKeybDict = {}
+
badPtrnStr = u'([%s][%s]|[%s][%s])' % (self.latLtr, self.localLtr, self.localLtr, self.latLtr)
- self.badPtrn = re.compile(badPtrnStr)
self.badWordPtrn = re.compile(u'[%s%s]*%s[%s%s]*' % (self.latLtr, self.localLtr, badPtrnStr, self.latLtr, self.localLtr) )
+
+ # Get whitelist
+ if self.site.lang in self.whitelists:
+ wlpage = self.whitelists[self.site.lang]
+ wikipedia.output(u'Loading whitelist from %s' % wlpage)
+ wlparams = {
+ 'action' : 'query',
+ 'prop' : 'links',
+ 'titles' : wlpage,
+ 'redirects' : '',
+ 'indexpageids' : '',
+ }
+ data = query.GetData(self.site.lang, wlparams, wikipedia.verbose, useAPI=True, encodeTitle=False)
+ if len(data['query']['pageids']) == 1:
+ pageid = data['query']['pageids'][0]
+ links = data['query']['pages'][pageid]['links']
+ self.knownWords = set( [n['title'] for n in links] )
+ else:
+ raise "The number of pageids is not 1"
+ wikipedia.output(u'Loaded whitelist with %i items' % len(self.knownWords))
+ if wikipedia.verbose and len(self.knownWords) > 0:
+ wikipedia.output(u'Whitelist: [[%s]]' % u']], [['.join(self.knownWords))
+ else:
+ wikipedia.output(u'Whitelist is not known for language %s' % self.site.lang)
def Run(self):
try:
@@ -341,6 +367,10 @@
if badWord in self.knownWords:
continue
+ # Allow any roman numerals with local suffixes
+ if self.romanNumSfxPtrn.match(badWord) is not None:
+ continue
+
if not found:
# lazy-initialization of the local variables
possibleWords = []
@@ -364,6 +394,13 @@
mightBeLcl = False
if l not in self.latLtr: raise "Assert failed"
+ # Some words are well known and frequently mixed-typed
+ if mightBeLcl and mightBeLat:
+ if badWord in self.alwaysInLocal:
+ mightBeLat = False
+ elif badWord in self.alwaysInLatin:
+ mightBeLoc = False
+
if mightBeLcl:
mapLcl[badWord] = badWord.translate(self.latToLclDict)
if mightBeLat:
Modified: trunk/pywikipedia/query.py
===================================================================
--- trunk/pywikipedia/query.py 2007-12-03 14:27:54 UTC (rev 4629)
+++ trunk/pywikipedia/query.py 2007-12-03 14:59:19 UTC (rev 4630)
@@ -6,7 +6,7 @@
import urllib
import time
-def GetData( lang, params, verbose = False, useAPI = False, retryCount = 5 ):
+def GetData( lang, params, verbose = False, useAPI = False, retryCount = 5, encodeTitle = True ):
"""Get data from the query api, and convert it into a data object
"""
site = wikipedia.getSite( lang )
@@ -25,13 +25,13 @@
params[k] = ToUtf8(v)
# Titles param might be long, case convert it to post request
+ data = None
+ titlecount = 0
if 'titles' in params:
- data = urllib.urlencode( {'titles' : params['titles']} )
titlecount = params['titles'].count('|')
- del params['titles']
- else:
- data = None
- titlecount = 0
+ if encodeTitle:
+ data = urllib.urlencode( {'titles' : params['titles']} )
+ del params['titles']
if useAPI:
path = site.api_address() + urllib.urlencode( params.items() )
@@ -54,7 +54,6 @@
# This will also work, but all unicode strings will need to be converted from \u notation
# decodedObj = eval( jsontext )
return simplejson.loads( jsontext )
- break
except ValueError, error:
retryCount -= 1