Xqt has submitted this change and it was merged.
Change subject: wiktionary: pep8 changes, small code improvements
......................................................................
wiktionary: pep8 changes, small code improvements
Change-Id: I1bb2f0ee00a881a31e35a1204dc07b893b9aaf3c
---
M wiktionary/entry.py
M wiktionary/header.py
M wiktionary/headertest.py
M wiktionary/meaning.py
M wiktionary/meaningtest.py
M wiktionary/sortonlanguagename.py
M wiktionary/structs.py
M wiktionary/term.py
M wiktionary/termtest.py
M wiktionary/wiktionarypage.py
M wiktionary/wiktionarypagetest.py
11 files changed, 1,328 insertions(+), 993 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
diff --git a/wiktionary/entry.py b/wiktionary/entry.py index a0392e8..b9d7c48 100644 --- a/wiktionary/entry.py +++ b/wiktionary/entry.py @@ -7,84 +7,111 @@ import meaning import structs
+ class Entry: """ This class contains the entries that belong together on one page. - On Wiktionaries that are still on first character capitalization, this means both [[Kind]] and [[kind]]. - Terms in different languages can be described. Usually there is one entry for each language. + On Wiktionaries that are still on first character capitalization, this + means both [[Kind]] and [[kind]]. + Terms in different languages can be described. Usually there is one entry + for each language. + """
- def __init__(self,entrylang,meaning=""): + def __init__(self, entrylang, meaning=""): """ Constructor - Called with one parameter: - - the language of this entry + Called with one parameter: + - the language of this entry and can optionally be initialized with a first meaning + """ - self.entrylang=entrylang - self.meanings = {} # a dictionary containing the meanings for this term grouped by part of speech - if meaning: - self.addMeaning(meaning) - self.posorder = [] # we don't want to shuffle the order of the parts of speech, so we keep a list to keep the order in which they were encountered + self.entrylang = entrylang + # a dictionary containing the meanings for this term grouped by part of + # speech: + self.meanings = {}
- def addMeaning(self,meaning): + if meaning: + self.addMeaning(meaning) + # we don't want to shuffle the order of the parts of speech, so we keep + # a list to keep the order in which they were encountered: + self.posorder = [] + + def addMeaning(self, meaning): """ Lets you add another meaning to this entry """ - term = meaning.term # fetch the term, in order to be able to determine its part of speech in the next step + # fetch the term, in order to be able to determine its part of speech + # in the next step + term = meaning.term
- self.meanings.setdefault( term.pos, [] ).append(meaning) - if not term.pos in self.posorder: # we only need each part of speech once in our list where we keep track of the order + self.meanings.setdefault(term.pos, []).append(meaning) + # we only need each part of speech once in our list where we keep track + # of the order + if not term.pos in self.posorder: self.posorder.append(term.pos)
def getMeanings(self): - """ Returns a dictionary containing all the meaning objects for this entry + """ Returns a dictionary containing all the meaning objects for this + entry + """ return self.meanings
- def wikiWrap(self,wikilang): + def wikiWrap(self, wikilang): """ Returns a string for this entry in a format ready for Wiktionary + """ - entry = structs.wiktionaryformats[wikilang]['langheader'].replace('%%langname%%',langnames[wikilang][self.entrylang]).replace('%%ISOLangcode%%',self.entrylang) + '\n' + entry = structs.wiktionaryformats[wikilang]['langheader'].replace( + '%%langname%%', langnames[wikilang][self.entrylang]).replace( + '%%ISOLangcode%%', self.entrylang) + '\n'
for pos in self.posorder: meanings = self.meanings[pos] - entry += structs.wiktionaryformats[wikilang]['posheader'][pos] - entry +='\n' - if wikilang=='en': - entry = entry + meanings[0].term.wikiWrapAsExample(wikilang) + '\n\n' + entry += '\n' + if wikilang == 'en': + entry += meanings[0].term.wikiWrapAsExample(wikilang) + '\n\n' for meaning in meanings: - entry = entry + '#' + meaning.getLabel() + ' ' + meaning.definition + '\n' - entry = entry + meaning.wikiWrapExamples() - entry +='\n' + entry += '#%s %s\n' % (meaning.getLabel(), + meaning.definition) + entry += meaning.wikiWrapExamples() + entry += '\n'
- if wikilang=='nl': + if wikilang == 'nl': for meaning in meanings: - term=meaning.term - entry = entry + meaning.getLabel() + term.wikiWrapAsExample(wikilang) + '; ' + meaning.definition + '\n' - entry = entry + meaning.wikiWrapExamples() - entry +='\n' + term = meaning.term + entry += meaning.getLabel() + term.wikiWrapAsExample( + wikilang) + '; %s\n' % meaning.definition + entry += meaning.wikiWrapExamples() + entry += '\n'
if meaning.hasSynonyms(): - entry = entry + structs.wiktionaryformats[wikilang]['synonymsheader'] + '\n' + entry += '%s\n' % ( + structs.wiktionaryformats[wikilang]['synonymsheader']) for meaning in meanings: - entry = entry + '*' + meaning.getLabel() + "'''" + meaning.getConciseDef() + "''': " + meaning.wikiWrapSynonyms(wikilang) - entry +='\n' + entry += "*%s'''%s''': %s" % (meaning.getLabel(), + meaning.getConciseDef(), + meaning.wikiWrapSynonyms( + wikilang)) + entry += '\n'
if meaning.hasTranslations(): - entry = entry + structs.wiktionaryformats[wikilang]['translationsheader'] + '\n' + entry += '%s\n' % ( + structs.wiktionaryformats[wikilang]['translationsheader']) for meaning in meanings: - entry = entry + meaning.getLabel() + "'''" + meaning.getConciseDef() + "'''" + '\n' + meaning.wikiWrapTranslations(wikilang,self.entrylang) + '\n\n' - entry +='\n' + entry += "%s'''%s'''\n%s\n\n" % ( + meaning.getLabel(), meaning.getConciseDef(), + meaning.wikiWrapTranslations(wikilang, self.entrylang)) + entry += '\n' return entry
- def showContents(self,indentation): + def showContents(self, indentation): """ Prints the contents of all the subobjects contained in this entry. - Every subobject is indented a little further on the screen. - The primary purpose is to help keep your sanity while debugging. - """ - print ' ' * indentation + 'entrylang = %s'% self.entrylang + Every subobject is indented a little further on the screen. + The primary purpose is to help keep your sanity while debugging.
+ """ + print ' ' * indentation + 'entrylang = %s' % self.entrylang print ' ' * indentation + 'posorder:' + repr(self.posorder)
meaningkeys = self.meanings.keys() for meaningkey in meaningkeys: for meaning in self.meanings[meaningkey]: - meaning.showContents(indentation+2) + meaning.showContents(indentation + 2) diff --git a/wiktionary/header.py b/wiktionary/header.py index 9a941b3..6e5d8f5 100644 --- a/wiktionary/header.py +++ b/wiktionary/header.py @@ -3,69 +3,75 @@
from structs import *
+ class Header(object): - def __init__(self,line=None,contents=None,header=None,level=None,type=None): + def __init__(self, line=None, contents=None, header=None, level=None, + type=None): """ Constructor - Generally called with one parameter: - - The line read from a Wiktonary page - after determining it's probably a header + Generally called with one parameter: + - The line read from a Wiktonary page + after determining it's probably a header + """
- # sane defaults for self - self.contents=None - self.header=None - self.level=None - self.type=None + self.contents = None + self.header = None + self.level = None + self.type = None
- # settings for self - if line!=None: self.parseLine(line) - if contents!=None: self.contents=contents - if header!=None: self.header=header - if level!=None: self.level=level - if type!=None: self.type=type + if line is not None: + self.parseLine(line) + if contents is not None: + self.contents = contents + if header is not None: + self.header = header + if level is not None: + self.level = level + if type is not None: + self.type = type
- def __eq__(x,y): + def __eq__(x, y): """x.__eq__(y) <==> x==y"""
- return hasattr(x,"__dict__") and hasattr(y,"__dict__") and x.__dict__==y.__dict__ + return hasattr(x, "__dict__") and hasattr(y, "__dict__") and \ + x.__dict__ == y.__dict__
- def __ne__(x,y): + def __ne__(x, y): """x.__ne__(y) <==> x!=y"""
- return (not hasattr(x,"__eq__")) and (not x.__eq__(y)) + return (not hasattr(x, "__eq__")) and (not x.__eq__(y))
- def parseLine(self,line): - self.level=None - self.type='' # The type of header, i.e. lang, pos, other - self.contents='' # If lang, which lang? If pos, which pos? + def parseLine(self, line): + self.level = None + self.type = '' # The type of header, i.e. lang, pos, other + self.contents = '' # If lang, which lang? If pos, which pos?
self.header = '' - if line.count('=')>1: - self.level = line.count('=') // 2 # integer floor division without fractional part - self.header = line.replace('=','') + if line.count('=') > 1: + # integer floor division without fractional part + self.level = line.count('=') // 2 + self.header = line.replace('=', '') elif '{{' in line: - self.header = line.replace('{{-','').replace('-}}','') + self.header = line.replace('{{-', '').replace('-}}', '')
- self.header = self.header.replace('{{','').replace('}}','').strip().lower() + self.header = self.header.replace('{{', + '').replace('}}', '').strip().lower()
- # Now we know the content of the header, let's try to find out what it means: + # Now we know the content of the header, let's try to find out what it + # means: if self.header in pos: - self.type=u'pos' - self.contents=pos[self.header] + self.type = u'pos' + self.contents = pos[self.header] if self.header in langnames: - self.type=u'lang' - self.contents=self.header + self.type = u'lang' + self.contents = self.header if self.header in invertedlangnames: - self.type=u'lang' - self.contents=invertedlangnames[self.header] + self.type = u'lang' + self.contents = invertedlangnames[self.header] if self.header in otherheaders: - self.type=u'other' - self.contents=otherheaders[self.header] + self.type = u'other' + self.contents = otherheaders[self.header]
def __repr__(self): - return self.__module__+".Header("+\ - "contents='"+self.contents+\ - "', header='"+self.header+\ - "', level="+str(self.level)+\ - ", type='"+self.type+\ - "')" + return "%s.Header(contents='%s', header='%s', level=%d, type='%s')" % ( + self.__module__, self.contents, self.header, self.level, self.type) diff --git a/wiktionary/headertest.py b/wiktionary/headertest.py index db01f90..81278b5 100644 --- a/wiktionary/headertest.py +++ b/wiktionary/headertest.py @@ -6,24 +6,27 @@ import header import unittest
+ class KnownValues(unittest.TestCase): knownValues = ( - ('==English==', 'en', 2, 'lang'), - ('=={{en}}==', 'en', 2, 'lang'), - ('{{-en-}}', 'en', None, 'lang'), - ('===Noun===', 'noun', 3, 'pos'), - ('==={{noun}}===', 'noun', 3, 'pos'), - ('{{-noun-}}', 'noun', None, 'pos'), - ('===Verb===', 'verb', 3, 'pos'), - ('==={{verb}}===', 'verb', 3, 'pos'), - ('{{-verb-}}', 'verb', None, 'pos'), - ('====Translations====', 'trans', 4, 'other'), - ('===={{trans}}====', 'trans', 4, 'other'), - ('{{-trans-}}', 'trans', None, 'other'), - ) + ('==English==', 'en', 2, 'lang'), + ('=={{en}}==', 'en', 2, 'lang'), + ('{{-en-}}', 'en', None, 'lang'), + ('===Noun===', 'noun', 3, 'pos'), + ('==={{noun}}===', 'noun', 3, 'pos'), + ('{{-noun-}}', 'noun', None, 'pos'), + ('===Verb===', 'verb', 3, 'pos'), + ('==={{verb}}===', 'verb', 3, 'pos'), + ('{{-verb-}}', 'verb', None, 'pos'), + ('====Translations====', 'trans', 4, 'other'), + ('===={{trans}}====', 'trans', 4, 'other'), + ('{{-trans-}}', 'trans', None, 'other'), + )
def testHeaderInitKnownValuesContents(self): - """Header parsing comparing known result with known input for contents""" + """Header parsing comparing known result with known input for contents + + """ for wikiline, contents, level, type in self.knownValues: result = header.Header(wikiline).contents self.assertEqual(contents, result) @@ -43,10 +46,10 @@ def testReprSanity(self): """Header __repr__, __eq__, __ne__ should give sane results""" for stuff in self.knownValues: - wikiline=stuff[0] - h=header.Header(wikiline) - self.assertEqual(h, eval(repr(h)) ) - self.assertNotEqual(h,header.Header()) + wikiline = stuff[0] + h = header.Header(wikiline) + self.assertEqual(h, eval(repr(h))) + self.assertNotEqual(h, header.Header())
if __name__ == "__main__": unittest.main() diff --git a/wiktionary/meaning.py b/wiktionary/meaning.py index a7019b5..e74b3ff 100644 --- a/wiktionary/meaning.py +++ b/wiktionary/meaning.py @@ -5,148 +5,170 @@ import structs import re
+ class Meaning: """ This class contains one meaning for a word or an expression. """ - def __init__(self,term,definition='',etymology='',synonyms={'remark': '', 'synonyms': [{'remark': '', 'synonym': ''}]},translations={},label='',concisedef='',examples=[]): + def __init__(self, term, definition='', etymology='', + synonyms={'remark': '', + 'synonyms': [{'remark': '', 'synonym': ''}]}, + translations=None, label='', concisedef='', examples=[]): """ Constructor - Generally called with one parameter: - - The Term object we are describing + Generally called with one parameter: + - The Term object we are describing
- - definition (string) for this term is optional - - etymology (string) is optional - - synonyms (optional) - - translations (dictionary of Term objects, ISO639 is the key) is optional + - definition (string) for this term is optional + - etymology (string) is optional + - synonyms (optional) + - translations (dictionary of Term objects, ISO639 is the key) is + optional + """ - self.term=term - self.definition=definition - self.concisedef=concisedef - self.etymology=etymology - self.synonyms=synonyms + self.term = term + self.definition = definition + self.concisedef = concisedef + self.etymology = etymology + self.synonyms = synonyms # A structure, possibly containing the following items: # {'remark' : 'this remark concerns all the synonyms for this meaning', # 'synonyms' : [ - # {'remark': 'this remark concerns this particular synonym', + # {'remark': 'concerns this particular synonym', # 'synonym': Term object containing the synonym # }, # ] - self.examples=examples - self.label=label - - if translations: # Why this has to be done explicitly is beyond me, but it doesn't work correctly otherwise - self.translations=translations + self.examples = examples + self.label = label + if translations: + self.translations = translations else: - self.translations={} # a dictionary containing lists with translations to the different languages. Each translation is again a dictionary as follows: {'remark': '', 'trans': Term object} - self.translationsremark='' # a remark applying to all the translations for this meaning - self.translationsremarks={} # a dictionary containing remarks applying to a specific language - self.label=label + # a dictionary containing lists with translations to the different + # languages. 
Each translation is again a dictionary as follows: + # {'remark': '', 'trans': Term object} + self.translations = {} + # a remark applying to all the translations for this meaning + self.translationsremark = '' + # a dictionary containing remarks applying to a specific language + self.translationsremarks = {} + self.label = label
- def setDefinition(self,definition): + def setDefinition(self, definition): """ Provide a definition """ - self.definition=definition + self.definition = definition
def getDefinition(self): """ Returns the definition """ return self.definition
- def setEtymology(self,etymology): + def setEtymology(self, etymology): """ Provide the etymology """ - self.etymology=etymology + self.etymology = etymology
def getEtymology(self): """ Returns the etymology """ return self.etymology
- def setSynonyms(self,synonyms): + def setSynonyms(self, synonyms): """ Provide the synonyms """ - self.synonyms=synonyms + self.synonyms = synonyms
def getSynonyms(self): """ Returns the list of synonym Term objects """ return self.synonyms
- def parseSynonyms(self,synonymswikiline): + def parseSynonyms(self, synonymswikiline): synsremark = '' synonyms = [] - openparenthesis=synonymswikiline.lower().find('(see') - if openparenthesis!=-1: - closeparenthesis=synonymswikiline.find(')',openparenthesis) - synsremark=synonymswikiline[openparenthesis:closeparenthesis+1] - synonymswikiline=synonymswikiline[:openparenthesis-1] + synonymswikiline[closeparenthesis+1:] + openparenthesis = synonymswikiline.lower().find('(see') + if openparenthesis != -1: + closeparenthesis = synonymswikiline.find(')', openparenthesis) + synsremark = synonymswikiline[openparenthesis:closeparenthesis + 1] + synonymswikiline = synonymswikiline[:openparenthesis - 1] + \ + synonymswikiline[closeparenthesis + 1:] for synonym in synonymswikiline.split(','): synremark = '' - openparenthesis=synonym.lower().find('(') - if openparenthesis!=-1: - closeparenthesis=synonym.find(')',openparenthesis) - synremark=synonym[openparenthesis:closeparenthesis+1] - synonym=synonym[:openparenthesis-1] + synonym[closeparenthesis+2:] - synonym=synonym.replace(',','').replace("[",'').replace(']','').strip() + openparenthesis = synonym.lower().find('(') + if openparenthesis != -1: + closeparenthesis = synonym.find(')', openparenthesis) + synremark = synonym[openparenthesis:closeparenthesis + 1] + synonym = synonym[:openparenthesis - 1] + \ + synonym[closeparenthesis + 2:] + synonym = synonym.replace( + ',', '').replace("[", '').replace(']', '').strip() synonyms.append({'synonym': synonym, 'remark': synremark}) - self.synonyms={'remark': synsremark, 'synonyms': synonyms} + self.synonyms = {'remark': synsremark, 'synonyms': synonyms}
- def parseTranslations(self,translationswikiline): + def parseTranslations(self, translationswikiline): ''' This function will parse one line in wiki format Typically this is the translation towards one language. ''' - # There can be many translations for a language, each one can have remark - # a gender and a number. - # There can also be a remark for the group of translations for a given language - # And there can be a remark applying to all the translations (That has to be detected and stored on a higher level though. - # It is also possible that the translation for a given language is not parseable - # In that case the entire line should go into the remark. + # There can be many translations for a language, each one can have + # remark a gender and a number. + # There can also be a remark for the group of translations for a given + # language. And there can be a remark applying to all the translations + # (That has to be detected and stored on a higher level though. + # It is also possible that the translation for a given language is not + # parseable. In that case the entire line should go into the remark. 
translationsremark = translationremark = '' - translations = [] # a list of translations for a given language - colon=translationswikiline.find(':') - if colon!=-1: - # Split in lang and the rest of the line which should be a list of translations - lang = translationswikiline[:colon].replace('*','').replace('[','').replace(']','').replace('{','').replace('}','').strip().lower() - trans = translationswikiline[colon+1:] + translations = [] # a list of translations for a given language + colon = translationswikiline.find(':') + if colon != -1: + # Split in lang and the rest of the line which should be a list of + # translations + lang = translationswikiline[:colon].replace( + '*', '').replace('[', '').replace(']', '').replace( + '{', '').replace('}', '').strip().lower() + trans = translationswikiline[colon + 1:] # Look up lang and convert to an ISO abbreviation - isolang='' + isolang = '' if lang in structs.langnames: - isolang=lang + isolang = lang elif lang in structs.invertedlangnames: - isolang=structs.invertedlangnames[lang] + isolang = structs.invertedlangnames[lang]
# We need to prepare the line a bit to make it more easily parseable # All the commas found between '' '' are converted to simple spaces # Also }}, {{ has to be converted to }} {{
- trans="''".join([ [i[1],re.sub(',',' ',i[1])][i[0]%2==1] for i in enumerate(trans.split("''")) ]) + trans = "''".join([[i[1], re.sub(',', ' ', i[1])][i[0] % 2 == 1] + for i in enumerate(trans.split("''"))])
- trans=re.sub(r"(}}.*),(.*{{)",'}} {{',trans) + trans = re.sub(r"(}}.*),(.*{{)", '}} {{', trans)
# Now split up the translations (we got rid of extraneous commas) for translation in trans.split(','): - translation=translation.strip() + translation = translation.strip() # Find what is contained inside parentheses - m= re.search(r'((.*))',translation) + m = re.search(r'((.*))', translation) if m: # Only when the parentheses don't occur # between [[ ]] - if translation[m.end(1)+1:m.end(1)+2]!=']': - translationremark = m.group(1).replace('(','').replace(')','') - translation=translation.replace(m.group(1),'') + if translation[m.end(1) + 1:m.end(1) + 2] != ']': + translationremark = m.group(1).replace( + '(', '').replace(')', '') + translation = translation.replace(m.group(1), '') number = 1 masculine = feminine = neutral = common = diminutive = False partconsumed = False for part in translation.split(' '): - part=part.strip() - colon=part.find(':') - if colon!=-1: - colon2=part.find(':',colon+1) - pipe=part.find('|') - if colon2!=-1 and pipe!=-1: + part = part.strip() + colon = part.find(':') + if colon != -1: + colon2 = part.find(':', colon + 1) + pipe = part.find('|') + if colon2 != -1 and pipe != -1: # We found a link to another language Wiktionary # This contains no interesting information to store - # If the target Wiktionary uses them, we'll create them upon output + # If the target Wiktionary uses them, we'll create + # them upon output pass else: - translationremark = part.replace("'",'').replace('(','').replace(')','').replace(':','') + translationremark = part.replace( + "'", '').replace('(', '').replace( + ')', '').replace(':', '') partconsumed = True - cleanpart=part.replace("'",'').lower() - delim='' + cleanpart = part.replace("'", '').lower() + delim = '' # XXX The following 3 tests look wrong: # find() returns either -1 if the substring is not found, # or the position of the substring in the string. @@ -155,120 +177,149 @@ # # the test "',' in cleanpart" might be the one to use. 
if cleanpart.find(','): - delim=',' + delim = ',' if cleanpart.find(';'): - delim=';' + delim = ';' if cleanpart.find('/'): - delim='/' + delim = '/' if 0 <= part.find("'") <= 2 or '{' in part: - if delim=='': - delim='|' - cleanpart=cleanpart+'|' + if delim == '': + delim = '|' + cleanpart += '|' for maybegender in cleanpart.split(delim): - maybegender=maybegender.strip() - if maybegender=='m' or maybegender=='{{m}}': - masculine=True + maybegender = maybegender.strip() + if maybegender == 'm' or maybegender == '{{m}}': + masculine = True partconsumed = True - if maybegender=='f' or maybegender=='{{f}}': - feminine=True + if maybegender == 'f' or maybegender == '{{f}}': + feminine = True partconsumed = True - if maybegender=='n' or maybegender=='{{n}}': - neutral=True + if maybegender == 'n' or maybegender == '{{n}}': + neutral = True partconsumed = True - if maybegender=='c' or maybegender=='{{c}}': - common=True + if maybegender == 'c' or maybegender == '{{c}}': + common = True partconsumed = True - if maybegender=='p' or maybegender=='pl' or maybegender=='plural' or maybegender=='{{p}}': - number=2 + if maybegender == 'p' or maybegender == 'pl' or \ + maybegender == 'plural' or \ + maybegender == '{{p}}': + number = 2 partconsumed = True - if maybegender[:3]=='dim' or maybegender=='{{dim}}': - diminutive=True + if maybegender[:3] == 'dim' or \ + maybegender == '{{dim}}': + diminutive = True partconsumed = True - # print 'consumed: ', partconsumed +## print 'consumed: ', partconsumed if not partconsumed: # This must be our term - termweareworkingon=part.replace("[",'').replace("]",'').lower() - if '#' in termweareworkingon and '|' in termweareworkingon: - termweareworkingon=termweareworkingon.split('#')[0] + termweareworkingon = part.replace( + "[", '').replace("]", '').lower() + if '#' in termweareworkingon and \ + '|' in termweareworkingon: + termweareworkingon = termweareworkingon.split( + '#')[0] # Now we have enough information to create a term # object for 
this translation and add it to our list - addedflag=False + addedflag = False if masculine: - thistrans = {'remark': translationremark, 'trans': term.Term(isolang,termweareworkingon,gender='m',number=number,diminutive=diminutive,wikiline=translation)} + thistrans = {'remark': translationremark, + 'trans': term.Term(isolang, + termweareworkingon, + gender='m', + number=number, + diminutive=diminutive, + wikiline=translation)} translations.append(thistrans) - addedflag=True + addedflag = True if feminine: - thistrans = {'remark': translationremark, 'trans': term.Term(isolang,termweareworkingon,gender='f',number=number,diminutive=diminutive,wikiline=translation)} + thistrans = {'remark': translationremark, + 'trans': term.Term(isolang, + termweareworkingon, + gender='f', + number=number, + diminutive=diminutive, + wikiline=translation)} translations.append(thistrans) - addedflag=True + addedflag = True if neutral: - thistrans = {'remark': translationremark, 'trans': term.Term(isolang,termweareworkingon,gender='n',number=number,diminutive=diminutive,wikiline=translation)} + thistrans = {'remark': translationremark, + 'trans': term.Term(isolang, + termweareworkingon, + gender='n', + number=number, + diminutive=diminutive, + wikiline=translation)} translations.append(thistrans) - addedflag=True + addedflag = True if common: - thistrans = {'remark': translationremark, 'trans': term.Term(isolang,termweareworkingon,gender='c',number=number,diminutive=diminutive,wikiline=translation)} + thistrans = {'remark': translationremark, + 'trans': term.Term(isolang, + termweareworkingon, + gender='c', + number=number, + diminutive=diminutive, + wikiline=translation)} translations.append(thistrans) - addedflag=True - # if it wasn't added by now, it's a term which has no gender indication + addedflag = True + # if it wasn't added by now, it's a term which has no gender + # indication if not addedflag: - thistrans = {'remark': translationremark, 'trans': 
term.Term(isolang,termweareworkingon,number=number,diminutive=diminutive)} + thistrans = {'remark': translationremark, + 'trans': term.Term(isolang, + termweareworkingon, + number=number, + diminutive=diminutive)} translations.append(thistrans)
if not isolang: - print "Houston, we have a problem. This line doesn't seem to contain an indication of the language:",translationswikiline - self.translations[isolang] = {'remark': translationsremark, - 'alltrans': translations } + print ("This line doesn't seem to contain an indication of the " + "language: %s" % translationswikiline) + self.translations[isolang] = {'remark': translationsremark, + 'alltrans': translations}
def hasSynonyms(self): - """ Returns True if there are synonyms - Returns False if there are no synonyms - """ - if self.synonyms == []: - return False - else: - return True + """ Returns True if there are synonyms else False """ + return bool(self.synonyms)
- def setTranslations(self,translations): + def setTranslations(self, translations): """ Provide the translations """ - self.translations=translations + self.translations = translations
def getTranslations(self): """ Returns the translations dictionary containing translation - Term objects for this meaning + Term objects for this meaning """ return self.translations
- def addTranslation(self,translation): + def addTranslation(self, translation): """ Add a translation Term object to the dictionary for this meaning - The lang property of the Term object will be used as the key of the dictionary - """ - self.translations.setdefault( translation.lang, [] ).append( translation ) + The lang property of the Term object will be used as the key of the + dictionary
- def addTranslations(self,*translations): + """ + self.translations.setdefault(translation.lang, []).append(translation) + + def addTranslations(self, *translations): """ This method calls addTranslation as often as necessary to add - all the translations it receives + all the translations it receives + """ for translation in translations: self.addTranslation(translation)
def hasTranslations(self): - """ Returns True if there are translations - Returns False if there are no translations - """ - if self.translations == {}: - return 0 - else: - return 1 + """ Returns True if there are translations else False """ + return bool(self.translations)
- def setLabel(self,label): - self.label=label.replace('<!--','').replace('-->','') + def setLabel(self, label): + self.label = label.replace('<!--', '').replace('-->', '')
def getLabel(self): if self.label: - return u'<!--' + self.label + u'-->' + return u'<!--%s-->' % self.label
- def setConciseDef(self,concisedef): - self.concisedef=concisedef + def setConciseDef(self, concisedef): + self.concisedef = concisedef
def getConciseDef(self): if self.concisedef: @@ -276,18 +327,22 @@
def getExamples(self): """ Returns the list of example strings for this meaning + """ return self.examples
- def addExample(self,example): + def addExample(self, example): """ Add a translation Term object to the dictionary for this meaning - The lang property of the Term object will be used as the key of the dictionary + The lang property of the Term object will be used as the key of the + dictionary + """ self.examples.append(example)
- def addExamples(self,*examples): + def addExamples(self, *examples): """ This method calls addExample as often as necessary to add - all the examples it receives + all the examples it receives + """ for example in examples: self.addExample(example) @@ -301,94 +356,125 @@ else: return 1
- def wikiWrapSynonyms(self,wikilang): - """ Returns a string with all the synonyms in a format ready for Wiktionary + def wikiWrapSynonyms(self, wikilang): + """ Returns a string with all the synonyms in a format ready for + Wiktionary + """ first = 1 wrappedsynonyms = '' for synonym in self.synonyms: - if first==0: + if first == 0: wrappedsynonyms += ', ' else: first = 0 - wrappedsynonyms = wrappedsynonyms + synonym.wikiWrapForList(wikilang) + wrappedsynonyms += synonym.wikiWrapForList( + wikilang) return wrappedsynonyms + '\n'
- def wikiWrapTranslations(self,wikilang,entrylang): + def wikiWrapTranslations(self, wikilang, entrylang): """ Returns a string with all the translations in a format - ready for Wiktionary - The behavior changes with the circumstances. - For an entry in the same language as the Wiktionary the full list of translations is contained in the output, excluding the local - language itself - - This list of translations has to end up in a table with two columns - - The first column of this table contains languages with names from A to M, the second contains N to Z - - If a column in this list remains empty a html comment is put in that column - For an entry in a foreign language only the translation towards the local language is output. + ready for Wiktionary + The behavior changes with the circumstances. + For an entry in the same language as the Wiktionary the full list of + translations is contained in the output, excluding the local language + itself + - This list of translations has to end up in a table with two columns + - The first column of this table contains languages with names + from A to M, the second contains N to Z + - If a column in this list remains empty a html comment is put in that + column + For an entry in a foreign language only the translation towards the + local language is output. 
""" if wikilang == entrylang: - # When treating an entry of the same lang as the Wiktionary, we want to output the translations in such a way that they end up sorted alphabetically on the language name in the language of the current Wiktionary - alllanguages=self.translations.keys() + # When treating an entry of the same lang as the Wiktionary, we + # want to output the translations in such a way that they end up + # sorted alphabetically on the language name in the language of the + # current Wiktionary + alllanguages = self.translations.keys() alllanguages.sort(sortonname(langnames[wikilang])) - wrappedtranslations = structs.wiktionaryformats[wikilang]['transbefore'] + '\n' + wrappedtranslations = '%s\n' % ( + structs.wiktionaryformats[wikilang]['transbefore']) alreadydone = 0 for language in alllanguages: - if language == wikilang: continue # don't output translation for the wikilang itself + if language == wikilang: + # don't output translation for the wikilang itself + continue # split translations into two column table - if not alreadydone and langnames[wikilang][language][0:1].upper() > 'M': - wrappedtranslations = wrappedtranslations + structs.wiktionaryformats[wikilang]['transinbetween'] + '\n' + if not alreadydone and \ + langnames[wikilang][language][0:1].upper() > 'M': + wrappedtranslations += structs.wiktionaryformats[ + wikilang]['transinbetween'] + '\n' alreadydone = 1 - # Indicating the language according to the wikiformats dictionary - wrappedtranslations = wrappedtranslations + structs.wiktionaryformats[wikilang]['translang'].replace('%%langname%%',langnames[wikilang][language]).replace('%%ISOLangcode%%',language) + ': ' + # Indicating the language according to the wikiformats + # dictionary + wrappedtranslations += structs.wiktionaryformats[ + wikilang]['translang'].replace( + '%%langname%%', + langnames[wikilang][language]).replace( + '%%ISOLangcode%%', language) + ': ' first = 1 for translation in self.translations[language]: - 
termweareworkingon=translation.term - if first==0: + termweareworkingon = translation.term + if first == 0: wrappedtranslations += ', ' else: first = 0 - wrappedtranslations = wrappedtranslations + translation.wikiWrapAsTranslation(wikilang) + wrappedtranslations += translation.wikiWrapAsTranslation( + wikilang) wrappedtranslations += '\n' if not alreadydone: - wrappedtranslations = wrappedtranslations + structs.wiktionaryformats[wikilang]['transinbetween'] + '\n' + structs.wiktionaryformats[wikilang]['transnoNtoZ'] + '\n' + wrappedtranslations += structs.wiktionaryformats[ + wikilang]['transinbetween'] + '\n' + \ + structs.wiktionaryformats[wikilang]['transnoNtoZ'] + '\n' alreadydone = 1 - wrappedtranslations = wrappedtranslations + structs.wiktionaryformats[wikilang]['transafter'] + '\n' + wrappedtranslations += structs.wiktionaryformats[ + wikilang]['transafter'] + '\n' else: - # For the other entries we want to output the translation in the language of the Wiktionary - wrappedtranslations = structs.wiktionaryformats[wikilang]['translang'].replace('%%langname%%',langnames[wikilang][wikilang]).replace('%%ISOLangcode%%',wikilang) + ': ' + # For the other entries we want to output the translation in the + # language of the Wiktionary + wrappedtranslations = structs.wiktionaryformats[ + wikilang]['translang'].replace('%%langname%%', + langnames[ + wikilang][wikilang]).replace( + '%%ISOLangcode%%', + wikilang) + ': ' first = True for translation in self.translations[wikilang]: - termweareworkingon=translation.term - if first==False: + termweareworkingon = translation.term + if not first: wrappedtranslations += ', ' else: first = False - wrappedtranslations = wrappedtranslations + translation.wikiWrapAsTranslation(wikilang) + wrappedtranslations += translation.wikiWrapAsTranslation( + wikilang) return wrappedtranslations
- def showContents(self,indentation): + def showContents(self, indentation): """ Prints the contents of this meaning. - Every subobject is indented a little further on the screen. - The primary purpose is to help keep one's sanity while debugging. + Every subobject is indented a little further on the screen. + The primary purpose is to help keep one's sanity while debugging. """ print ' ' * indentation + 'term: ' - self.term.showContents(indentation+2) - print ' ' * indentation + 'definition = %s'% self.definition - print ' ' * indentation + 'etymology = %s'% self.etymology - + self.term.showContents(indentation + 2) + print ' ' * indentation + 'definition = %s' % self.definition + print ' ' * indentation + 'etymology = %s' % self.etymology print ' ' * indentation + 'Synonyms:' for synonym in self.synonyms: - synonym.showContents(indentation+2) - + synonym.showContents(indentation + 2) print ' ' * indentation + 'Translations:' translationkeys = self.translations.keys() for translationkey in translationkeys: for translation in self.translations[translationkey]: - translation.showContents(indentation+2) + translation.showContents(indentation + 2)
def wikiWrapExamples(self): - """ Returns a string with all the examples in a format ready for Wiktionary + """ Returns a string with all the examples in a format ready for + Wiktionary + """ wrappedexamples = '' for example in self.examples: - wrappedexamples = wrappedexamples + "#:'''" + example + "'''\n" + wrappedexamples += "#:'''%s'''\n" % example return wrappedexamples diff --git a/wiktionary/meaningtest.py b/wiktionary/meaningtest.py index 033db79..8c6d1cc 100644 --- a/wiktionary/meaningtest.py +++ b/wiktionary/meaningtest.py @@ -6,57 +6,65 @@ import meaning import unittest
+ class KnownValues(unittest.TestCase):
knownParserValues = ( - ("*German: [[wichtig]]", - [('de','wichtig','',1,False,'')] - ), - ("*[[Esperanto]]: [[grava]]", - [('eo','grava','',1,False,'')] - ), - ("*{{fr}}: [[importante]] {{f}}", - [('fr','importante','f',1,False,'')] - ), - ("*Dutch: [[voorbeelden]] ''n, pl'', [[instructies]] {{f}}, {{p}}", - [('nl','voorbeelden','n',2,False,''), - ('nl','instructies', 'f',2,False,'')] - ), - ("*Russian: [[шесток]] ''m'' (shestok)", - [('ru','шесток','m',1,False,'shestok')] - ), - ("*Kazakh: сәлем, салам, сәлеметсіздер(respectable)", - [('ka','сәлем','',1,False,''), - ('ka','салам','',1,False,''), - ('ka','сәлеметсіздер','',1,False,'respectable')] - ), - ("*Chinese(Mandarin):[[你好]](ni3 hao3), [[您好]](''formal'' nin2 hao3)", - [('zh','你好','',1,False,'ni3 hao3'), - ('zh','您好','',1,False,"''formal'' nin2 hao3")] - ), - ("*German: [[Lamm]] ''n'' [[:de:Lamm|(de)]]", - [('de','Lamm','n',1,False,'')] - ), - ("*Italian: [[pronto#Italian|pronto]]", - [('it','pronto','',1,False,'')] - ), - ) + ("*German: [[wichtig]]", + [('de', 'wichtig', '', 1, False, '')] + ), + ("*[[Esperanto]]: [[grava]]", + [('eo', 'grava', '', 1, False, '')] + ), + ("*{{fr}}: [[importante]] {{f}}", + [('fr', 'importante', 'f', 1, False, '')] + ), + ("*Dutch: [[voorbeelden]] ''n, pl'', [[instructies]] {{f}}, {{p}}", + [('nl', 'voorbeelden', 'n', 2, False, ''), + ('nl', 'instructies', 'f', 2, False, '')] + ), + ("*Russian: [[шесток]] ''m'' (shestok)", + [('ru', 'шесток', 'm', 1, False, 'shestok')] + ), + ("*Kazakh: сәлем, салам, сәлеметсіздер(respectable)", + [('ka', 'сәлем', '', 1, False, ''), + ('ka', 'салам', '', 1, False, ''), + ('ka', 'сәлеметсіздер', '', 1, False, 'respectable')] + ), + ("*Chinese(Mandarin):[[你好]](ni3 hao3), [[您好]](''formal'' nin2 hao3)", + [('zh', '你好', '', 1, False, 'ni3 hao3'), + ('zh', '您好', '', 1, False, "''formal'' nin2 hao3")] + ), + ("*German: [[Lamm]] ''n'' [[:de:Lamm|(de)]]", + [('de', 'Lamm', 'n', 1, False, '')] + ), + ("*Italian: [[pronto#Italian|pronto]]", + [('it', 
'pronto', '', 1, False, '')] + ), + )
def testParser(self): - '''self.term, self.gender, self.number, self.diminutive and remark parsed correctly from Wiki format''' + '''self.term, self.gender, self.number, self.diminutive and remark + parsed correctly from Wiki format + + ''' for wikiline, results in self.knownParserValues: ameaning = meaning.Meaning('en', 'dummy') ameaning.parseTranslations(wikiline) - i=0 - for termlang, thisterm, termgender, termnumber, termisadiminutive, remark in results: - resultterm = ameaning.translations[termlang]['alltrans'][i]['trans'] + i = 0 + for termlang, thisterm, termgender, termnumber, termisadiminutive, \ + remark in results: + resultterm = ameaning.translations[ + termlang]['alltrans'][i]['trans'] self.assertEqual(resultterm.getTerm(), thisterm) self.assertEqual(resultterm.getGender(), termgender) self.assertEqual(resultterm.getNumber(), termnumber) -# self.assertEqual(resultterm.getIsDiminutive(), termisadiminutive) - self.assertEqual(ameaning.translations[termlang]['alltrans'][i]['remark'], remark) - i+=1 +## self.assertEqual(resultterm.getIsDiminutive(), +## termisadiminutive) + self.assertEqual( + ameaning.translations[termlang]['alltrans'][i]['remark'], + remark) + i += 1
if __name__ == "__main__": unittest.main() - diff --git a/wiktionary/sortonlanguagename.py b/wiktionary/sortonlanguagename.py index 090faec..73d4312 100755 --- a/wiktionary/sortonlanguagename.py +++ b/wiktionary/sortonlanguagename.py @@ -2,13 +2,16 @@ # -*- coding: utf-8 -*-
# A big thanks to Rob Hooft for the following class: -# It may not seem like much, but it magically allows the translations to be sorted on -# the names of the languages. I would never have thought of doing it like this myself. +# It may not seem like much, but it magically allows the translations to be +# sorted on the names of the languages. I would never have thought of doing it +# like this myself. +
class sortonlanguagename: ''' This class sorts translations alphabetically on the name of the language, instead of on the iso abbreviation that is used internally. + ''' def __init__(self, lang): self.lang = lang diff --git a/wiktionary/structs.py b/wiktionary/structs.py index ce19f6c..946a6b8 100644 --- a/wiktionary/structs.py +++ b/wiktionary/structs.py @@ -5,7 +5,21 @@ Basic structures for wiktionary.py '''
-isolangs = ['af','sq','ar','an','hy','ast','tay','ay','az','bam','eu','bn','my','bi','bs','br','bg','sro','ca','zh','chp','rmr','co','dgd','da','de','eml','en','eo','et','fo','fi','fr','cpf','fy','fur','gl','ka','el','gu','hat','haw','he','hi','hu','io','ga','is','gil','id','ia','it','ja','jv','ku','kok','ko','hr','lad','la','lv','ln','li','lt','lb','src','ma','ms','mg','mt','mnc','mi','mr','mh','mas','myn','mn','nah','nap','na','nds','no','ny','oc','uk','oen','grc','pau','pap','pzh','fa','pl','pt','pa','qu','rap','roh','ra','ro','ja-ro','ru','smi','sm','sa','sc','sco','sr','sn','si','sk','sl','so','sov','es','scn','su','sw','tl','tt','th','ti','tox','cs','che','tn','tum','tpn','tr','ts','tvl','ur','vi','vo','wa','cy','be','wo','xh','zu','sv'] +isolangs = ['af', 'an', 'ar', 'ast', 'ay', 'az', 'bam', 'be', 'bg', 'bi', 'bn', + 'br', 'bs', 'ca', 'che', 'chp', 'co', 'cpf', 'cs', 'cy', 'da', 'de', + 'dgd', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', + 'fr', 'fur', 'fy', 'ga', 'gil', 'gl', 'grc', 'gu', 'hat', 'haw', + 'he', 'hi', 'hr', 'hu', 'hy', 'ia', 'id', 'io', 'is', 'it', 'ja', + 'ja-ro', 'jv', 'ka', 'ko', 'kok', 'ku', 'la', 'lad', 'lb', 'li', + 'ln', 'lt', 'lv', 'ma', 'mas', 'mg', 'mh', 'mi', 'mn', 'mnc', 'mr', + 'ms', 'mt', 'my', 'myn', 'na', 'nah', 'nap', 'nds', 'no', 'ny', + 'oc', 'oen', 'pa', 'pap', 'pau', 'pl', 'pt', 'pzh', 'qu', 'ra', + 'rap', 'rmr', 'ro', 'roh', 'ru', 'sa', 'sc', 'scn', 'sco', 'si', + 'sk', 'sl', 'sm', 'smi', 'sn', 'so', 'sov', 'sq', 'sr', 'src', + 'sro', 'su', 'sv', 'sw', 'tay', 'th', 'ti', 'tl', 'tn', 'tox', + 'tpn', 'tr', 'ts', 'tt', 'tum', 'tvl', 'uk', 'ur', 'vi', 'vo', 'wa', + 'wo', 'xh', 'zh', 'zu', + ]
wiktionaryformats = { 'nl': { @@ -15,10 +29,10 @@ 'afterexampleterm': u"'''", 'gender': u"{{%%gender%%}}", 'posheader': { - 'noun': u'{{-noun-}}', - 'adjective': u'{{-adj-}}', - 'verb': u'{{-verb-}}', - }, + 'noun': u'{{-noun-}}', + 'adjective': u'{{-adj-}}', + 'verb': u'{{-verb-}}', + }, 'translationsheader': u"{{-trans-}}", 'transbefore': u'{{top}}', 'transinbetween': u'{{mid}}', @@ -27,7 +41,7 @@ 'transnoNtoZ': u'<!-- Vertalingen van N tot Z komen hier-->', 'synonymsheader': u"{{-syn-}}", 'relatedheader': u'{{-rel-}}', - }, + }, 'en': { 'langheader': u'==%%langname%%==', 'translang': u'*%%langname%%', @@ -35,10 +49,10 @@ 'afterexampleterm': u"'''", 'gender': u"''%%gender%%''", 'posheader': { - 'noun': u'===Noun===', - 'adjective': u'===Adjective===', - 'verb': u'===Verb===', - }, + 'noun': u'===Noun===', + 'adjective': u'===Adjective===', + 'verb': u'===Verb===', + }, 'translationsheader': u"====Translations====", 'transbefore': u'{{top}}', 'transinbetween': u'{{mid}}', @@ -47,7 +61,7 @@ 'transnoNtoZ': u'<!-- Translations from N tot Z go here-->', 'synonymsheader': u"====Synonyms====", 'relatedheader': u'===Related words===', - } + } }
pos = { @@ -76,87 +90,88 @@ }
langnames = { - 'nl': { - 'translingual' : u'Taalonafhankelijk', - 'nl' : u'Nederlands', - 'en' : u'Engels', - 'de' : u'Duits', - 'fr' : u'Frans', - 'it' : u'Italiaans', - 'eo' : u'Esperanto', - 'es' : u'Spaans', - }, - 'de': { - 'translingual' : u'???', - 'nl' : u'Niederländisch', - 'en' : u'Englisch', - 'de' : u'Deutsch', - 'fr' : u'Französisch', - 'it' : u'Italienisch', - 'eo' : u'Esperanto', - 'es' : u'Spanisch', - }, - 'en': { - 'translingual' : u'Translingual', - 'nl' : u'Dutch', - 'en' : u'English', - 'de' : u'German', - 'fr' : u'French', - 'it' : u'Italian', - 'eo' : u'Esperanto', - 'es' : u'Spanish', - }, - 'eo': { - 'translingual' : u'???', - 'nl' : u'Nederlanda', - 'en' : u'Angla', - 'de' : u'Germana', - 'fr' : u'Franca', - 'it' : u'Italiana', - 'eo' : u'Esperanto', - 'es' : u'Hispana', - }, - 'ia': { - 'translingual' : u'translingual', - 'nl' : u'nederlandese', - 'en' : u'anglese', - 'de' : u'germano', - 'fr' : u'francese', - 'it' : u'italiano', - 'eo' : u'esperanto', - 'es' : u'espaniol', - }, - 'it': { - 'translingual' : u'???', - 'nl' : u'olandese', - 'en' : u'inglese', - 'de' : u'tedesco', - 'fr' : u'francese', - 'it' : u'italiano', - 'eo' : u'esperanto', - 'es' : u'spagnuolo', - }, - 'fr': { - 'translingual' : u'???', - 'nl' : u'néerlandais', - 'en' : u'anglais', - 'de' : u'allemand', - 'fr' : u'français', - 'it' : u'italien', - 'eo' : u'espéranto', - 'es' : u'espagnol', - }, - 'es': { - 'translingual' : u'???', - 'nl' : u'olandés', - 'en' : u'inglés', - 'de' : u'alemán', - 'fr' : u'francés', - 'it' : u'italiano', - 'eo' : u'esperanto', - 'es' : u'español', - }, + 'nl': { + 'translingual': u'Taalonafhankelijk', + 'nl': u'Nederlands', + 'en': u'Engels', + 'de': u'Duits', + 'fr': u'Frans', + 'it': u'Italiaans', + 'eo': u'Esperanto', + 'es': u'Spaans', + }, + 'de': { + 'translingual': u'???', + 'nl': u'Niederländisch', + 'en': u'Englisch', + 'de': u'Deutsch', + 'fr': u'Französisch', + 'it': u'Italienisch', + 'eo': u'Esperanto', + 'es': u'Spanisch', + 
}, + 'en': { + 'translingual': u'Translingual', + 'nl': u'Dutch', + 'en': u'English', + 'de': u'German', + 'fr': u'French', + 'it': u'Italian', + 'eo': u'Esperanto', + 'es': u'Spanish', + }, + 'eo': { + 'translingual': u'???', + 'nl': u'Nederlanda', + 'en': u'Angla', + 'de': u'Germana', + 'fr': u'Franca', + 'it': u'Italiana', + 'eo': u'Esperanto', + 'es': u'Hispana', + }, + 'ia': { + 'translingual': u'translingual', + 'nl': u'nederlandese', + 'en': u'anglese', + 'de': u'germano', + 'fr': u'francese', + 'it': u'italiano', + 'eo': u'esperanto', + 'es': u'espaniol', + }, + 'it': { + 'translingual': u'???', + 'nl': u'olandese', + 'en': u'inglese', + 'de': u'tedesco', + 'fr': u'francese', + 'it': u'italiano', + 'eo': u'esperanto', + 'es': u'spagnuolo', + }, + 'fr': { + 'translingual': u'???', + 'nl': u'néerlandais', + 'en': u'anglais', + 'de': u'allemand', + 'fr': u'français', + 'it': u'italien', + 'eo': u'espéranto', + 'es': u'espagnol', + }, + 'es': { + 'translingual': u'???', + 'nl': u'olandés', + 'en': u'inglés', + 'de': u'alemán', + 'fr': u'francés', + 'it': u'italiano', + 'eo': u'esperanto', + 'es': u'español', + }, } +
def invertlangnames(): ''' @@ -164,58 +179,83 @@ parsing we need a dictionary to efficiently convert these back to iso abbreviations. The dictionary that gets created also contains common misspellings + ''' invertedlangnames = {} for ISOKey in langnames.keys(): for ISOKey2 in langnames[ISOKey].keys(): - lowercaselangname=langnames[ISOKey][ISOKey2].lower() - #Put in the names of the languages so we can easily do a reverse lookup lang name -> iso abbreviation + lowercaselangname = langnames[ISOKey][ISOKey2].lower() + # Put in the names of the languages so we can easily do a reverse + # lookup lang name -> iso abbreviation invertedlangnames.setdefault(lowercaselangname, ISOKey2) - # Now all the correct forms are in, but we also want to be able to find them when there are typos in them - for index in range(1,len(lowercaselangname)): + # Now all the correct forms are in, but we also want to be able to + # find them when there are typos in them + for index in range(1, len(lowercaselangname)): # So first we create all the possibilities with one letter gone - invertedlangnames.setdefault(lowercaselangname[:index]+lowercaselangname[index+1:], ISOKey2) + invertedlangnames.setdefault( + lowercaselangname[:index] + lowercaselangname[index + 1:], + ISOKey2) # Then we switch two consecutive letters - invertedlangnames.setdefault(lowercaselangname[:index-1]+lowercaselangname[index]+lowercaselangname[index-1]+lowercaselangname[index+1:], ISOKey2) - # There are of course other typos possible, but this caters for a lot of possibilities already - # TODO One other treatment that would make sense is to filter out the accents. + invertedlangnames.setdefault( + lowercaselangname[:index - 1] + + lowercaselangname[index] + + lowercaselangname[index - 1] + + lowercaselangname[index + 1:], + ISOKey2) + # There are of course other typos possible, but this caters for + # a lot of possibilities already + # TODO One other treatment that would make sense is to filter + # out the accents. 
return invertedlangnames +
def createPOSlookupDict(): ''' The dictionary for looking up parts of speech gets completed with common misspellings + ''' for key in pos.keys(): - lowercasekey=key.lower() - value=pos[key] - for index in range(1,len(lowercasekey)): + lowercasekey = key.lower() + value = pos[key] + for index in range(1, len(lowercasekey)): # So first we create all the possibilities with one letter gone - pos.setdefault(lowercasekey[:index]+lowercasekey[index+1:], value) + pos.setdefault(lowercasekey[:index] + lowercasekey[index + 1:], + value) # Then we switch two consecutive letters - pos.setdefault(lowercasekey[:index-1]+lowercasekey[index]+lowercasekey[index-1]+lowercasekey[index+1:], value) - # There are of course other typos possible, but this caters for a lot of possibilities already + pos.setdefault(lowercasekey[:index - 1] + lowercasekey[index] + + lowercasekey[index - 1] + lowercasekey[index + 1:], + value) + # There are of course other typos possible, but this caters for a + # lot of possibilities already return pos +
def createOtherHeaderslookupDict(): ''' The dictionary for looking up names of other headers gets completed with common misspellings + ''' for key in otherheaders.keys(): - lowercasekey=key.lower() - value=otherheaders[key] - for index in range(1,len(lowercasekey)): + lowercasekey = key.lower() + value = otherheaders[key] + for index in range(1, len(lowercasekey)): # So first we create all the possibilities with one letter gone - otherheaders.setdefault(lowercasekey[:index]+lowercasekey[index+1:], value) + otherheaders.setdefault(lowercasekey[:index] + + lowercasekey[index + 1:], value) # Then we switch two consecutive letters - otherheaders.setdefault(lowercasekey[:index-1]+lowercasekey[index]+lowercasekey[index-1]+lowercasekey[index+1:], value) - # There are of course other typos possible, but this caters for a lot of possibilities already + otherheaders.setdefault(lowercasekey[:index - 1] + + lowercasekey[index] + + lowercasekey[index - 1] + + lowercasekey[index + 1:], value) + # There are of course other typos possible, but this caters for a + # lot of possibilities already return otherheaders
# Execute the functions that will take care of setting up and completing # lookup dictionaries for stuff that can appear in headers. -invertedlangnames=invertlangnames() +invertedlangnames = invertlangnames() createPOSlookupDict() createOtherHeaderslookupDict() diff --git a/wiktionary/term.py b/wiktionary/term.py index edef58e..a874a1a 100644 --- a/wiktionary/term.py +++ b/wiktionary/term.py @@ -3,185 +3,238 @@
import structs
+ class Term: """ This is a superclass for terms. """ - def __init__(self,lang,term,relatedwords=[],gender='',number=1,diminutive=False,wikiline=u''): - """ Constructor - Generally called with two parameters: - - The language of the term - - The term (string)
- - relatedwords (list of Term objects) is optional + def __init__(self, lang, term, relatedwords=None, gender='', number=1, + diminutive=False, wikiline=u''): + """ Constructor + Generally called with two parameters: + - The language of the term + - The term (string) + + - relatedwords (list of Term objects) is optional """ - self.lang=lang - self.term=term - self.relatedwords=relatedwords - self.gender=gender # m: masculine, f: feminine, n: neutral, c: common - self.number=number # 1: singular, 2: plural - self.diminutive=diminutive # True: diminutive, False: not a diminutive + self.lang = lang + self.term = term + if relatedwords is None: + self.relatedwords = [] + else: + self.relatedwords = relatedwords + self.gender = gender # m: masculine, f: feminine, n: neutral, c: common + self.number = number # 1: singular, 2: plural + self.diminutive = diminutive
if wikiline: - pos=wikiline.find("''") - if pos==-1: - pos=wikiline.find("{{") - if pos==-1: - pos=len(wikiline) - maybegender=wikiline[pos:].replace("'",'').replace('{','').replace('}','').strip() - self.term=wikiline[:pos].replace("[",'').replace(']','').strip() + pos = wikiline.find("''") + if pos == -1: + pos = wikiline.find("{{") + if pos == -1: + pos = len(wikiline) + maybegender = wikiline[pos:].replace("'", '').replace( + '{', '').replace('}', '').strip() + self.term = wikiline[:pos].replace("[", '').replace(']', '').strip() if 'm' in maybegender: - self.gender='m' + self.gender = 'm' if 'f' in maybegender: - self.gender='f' + self.gender = 'f' if 'n' in maybegender: - self.gender='n' + self.gender = 'n' if 'c' in maybegender: - self.gender='c' + self.gender = 'c' if 'p' in maybegender: - self.number=2 + self.number = 2 if 'dim' in maybegender: - self.diminutive=True + self.diminutive = True
def __getitem__(self): """ Documenting as an afterthought is a bad idea - I don't know anymore why I added this, but I'm pretty sure it was in response to an error message + I don't know anymore why I added this, but I'm pretty sure it was in + response to an error message + """ return self
- def setTerm(self,term): - self.term=term + def setTerm(self, term): + self.term = term
def getTerm(self): return self.term
- def setLang(self,lang): - self.lang=lang + def setLang(self, lang): + self.lang = lang
def getLang(self): return self.lang
- def setGender(self,gender): - self.gender=gender + def setGender(self, gender): + self.gender = gender
def getGender(self): - return(self.gender) + return self.gender
- def setNumber(self,number): - self.number=number + def setNumber(self, number): + self.number = number
def getNumber(self): - return(self.number) + return self.number
-# def setLabel(self,label): -# self.label=label.replace('<!--','').replace('-->','') +## def setLabel(self,label): +## self.label = label.replace('<!--', '').replace('-->', '')
-# def getLabel(self): -# if self.label: -# return '<!--' + self.label + '-->' +## def getLabel(self): +## if self.label: +## return '<!--%s-->' % self.label
- def wikiWrapGender(self,wikilang): - """ Returns a string with the gender in a format ready for Wiktionary, if it is applicable + def wikiWrapGender(self, wikilang): + """ Returns a string with the gender in a format ready for Wiktionary, + if it is applicable + """ if self.gender: - return ' ' + structs.wiktionaryformats[wikilang]['gender'].replace('%%gender%%',self.gender) + return ' %s' % ( + structs.wiktionaryformats[wikilang]['gender'].replace( + '%%gender%%', self.gender)) else: return ''
- def wikiWrapAsExample(self,wikilang): - """ Returns a string with the gender in a format ready for Wiktionary, if it exists - """ - return structs.wiktionaryformats[wikilang]['beforeexampleterm'] + self.term + structs.wiktionaryformats[wikilang]['afterexampleterm'] + def wikiWrapAsExample(self, wikilang): + """ Returns a string with the gender in a format ready for Wiktionary, + if it exists
- def wikiWrapForList(self,wikilang): - """ Returns a string with this term as a link followed by the gender in a format ready for Wiktionary """ - return '[[' + self.term + ']]' + return structs.wiktionaryformats[wikilang][ + 'beforeexampleterm'] + self.term + structs.wiktionaryformats[ + wikilang]['afterexampleterm']
- def wikiWrapAsTranslation(self,wikilang): - """ Returns a string with this term as a link followed by the gender in a format ready for Wiktionary + def wikiWrapForList(self, wikilang): + """ Returns a string with this term as a link followed by the gender + in a format ready for Wiktionary + """ - return '[[' + self.term + ']]' + return '[[%s]]' % self.term
- def showContents(self,indentation): + def wikiWrapAsTranslation(self, wikilang): + """ Returns a string with this term as a link followed by the gender + in a format ready for Wiktionary + + """ + return '[[%s]]' % self.term + + def showContents(self, indentation): """ Prints the contents of this Term. - Every subobject is indented a little further on the screen. - The primary purpose is to help keep one's sanity while debugging. + Every subobject is indented a little further on the screen. + The primary purpose is to help keep one's sanity while debugging. + """ - print ' ' * indentation + 'lang = %s'% self.lang - print ' ' * indentation + 'pos = %s'% self.pos - print ' ' * indentation + 'term = %s'% self.term - print ' ' * indentation + 'relatedwords = %s'% self.relatedwords + print ' ' * indentation + 'lang = %s' % self.lang + print ' ' * indentation + 'pos = %s' % self.pos + print ' ' * indentation + 'term = %s' % self.term + print ' ' * indentation + 'relatedwords = %s' % self.relatedwords +
class Noun(Term): """ This class inherits from Term. - It adds properties and methods specific to nouns + It adds properties and methods specific to nouns + """ - def __init__(self,lang,term,gender='',number=1,diminutive=False): + def __init__(self, lang, term, gender='', number=1, diminutive=False): """ Constructor - Generally called with two parameters: - - The language of the term - - The term (string) + Generally called with two parameters: + - The language of the term + - The term (string)
- - gender is optional + - gender is optional + """ - self.pos='noun' # part of speech - Term.__init__(self,lang,term,gender=gender,number=number,diminutive=diminutive) + self.pos = 'noun' # part of speech + super(Noun, self).__init__(self, lang, term, gender=gender, + number=number, diminutive=diminutive)
- def showContents(self,indentation): - Term.showContents(self,indentation) - print ' ' * indentation + 'gender = %s'% self.gender + def showContents(self, indentation): + Term.showContents(self, indentation) + print ' ' * indentation + 'gender = %s' % self.gender
- def wikiWrapAsExample(self,wikilang): - """ Returns a string with the gender in a format ready for Wiktionary, if it exists + def wikiWrapAsExample(self, wikilang): + """ Returns a string with the gender in a format ready for Wiktionary, + if it exists + """ - return Term.wikiWrapAsExample(self, wikilang) + Term.wikiWrapGender(self,wikilang) + return Term.wikiWrapAsExample( + self, wikilang) + Term.wikiWrapGender(self, wikilang)
- def wikiWrapForList(self,wikilang): - """ Returns a string with this term as a link followed by the gender in a format ready for Wiktionary - """ - return Term.wikiWrapForList(self, wikilang) + Term.wikiWrapGender(self, wikilang) + def wikiWrapForList(self, wikilang): + """ Returns a string with this term as a link followed by the gender in + a format ready for Wiktionary
- def wikiWrapAsTranslation(self,wikilang): - """ Returns a string with this term as a link followed by the gender in a format ready for Wiktionary """ - return Term.wikiWrapAsTranslation(self, wikilang) + Term.wikiWrapGender(self, wikilang) + return Term.wikiWrapForList( + self, wikilang) + Term.wikiWrapGender(self, wikilang) + + def wikiWrapAsTranslation(self, wikilang): + """ Returns a string with this term as a link followed by the gender + in a format ready for Wiktionary + + """ + return Term.wikiWrapAsTranslation( + self, wikilang) + Term.wikiWrapGender(self, wikilang) +
class Adjective(Term): - def __init__(self,lang,term,gender='',number=1): - self.pos='adjective' # part of speech - Term.__init__(self,lang,term,gender=gender,number=number)
- def wikiWrapAsExample(self,wikilang): - """ Returns a string with the gender in a format ready for Wiktionary, if it exists - """ - return Term.wikiWrapAsExample(self, wikilang) + Term.wikiWrapGender(self,wikilang) + def __init__(self, lang, term, gender='', number=1): + self.pos = 'adjective' # part of speech + super(Adjective, self).__init__(self, lang, term, gender=gender, + number=number)
- def wikiWrapForList(self,wikilang): - """ Returns a string with this term as a link followed by the gender in a format ready for Wiktionary - """ - return Term.wikiWrapForList(self, wikilang) + Term.wikiWrapGender(self, wikilang) + def wikiWrapAsExample(self, wikilang): + """ Returns a string with the gender in a format ready for Wiktionary, + if it exists
- def wikiWrapAsTranslation(self,wikilang): - """ Returns a string with this term as a link followed by the gender in a format ready for Wiktionary """ - return Term.wikiWrapAsTranslation(self, wikilang) + Term.wikiWrapGender(self, wikilang) + return Term.wikiWrapAsExample( + self, wikilang) + Term.wikiWrapGender(self, wikilang) + + def wikiWrapForList(self, wikilang): + """ Returns a string with this term as a link followed by the gender in + a format ready for Wiktionary + + """ + return Term.wikiWrapForList( + self, wikilang) + Term.wikiWrapGender(self, wikilang) + + def wikiWrapAsTranslation(self, wikilang): + """ Returns a string with this term as a link followed by the gender + in a format ready for Wiktionary + + """ + return Term.wikiWrapAsTranslation( + self, wikilang) + Term.wikiWrapGender(self, wikilang) +
class Verb(Term): - def __init__(self,lang,term): - self.pos='verb' # part of speech - Term.__init__(self,lang,term)
- def showContents(self,indentation): - Term.showContents(self,indentation) + def __init__(self, lang, term): + self.pos = 'verb' # part of speech + super(Verb, self).__init__(self, lang, term)
- def wikiWrapForList(self,wikilang): - """ Returns a string with this term as a link in a format ready for Wiktionary + def showContents(self, indentation): + Term.showContents(self, indentation) + + def wikiWrapForList(self, wikilang): + """ Returns a string with this term as a link in a format ready for + Wiktionary + """ - if wikilang=='en': + if wikilang == 'en': if self.term.lower().startswith('to '): - return 'to [[' + self.term[3:] + ']]' + return 'to [[%s]]' % self.term[3:] return Term.wikiWrapForList(self, wikilang)
- def wikiWrapAsTranslation(self,wikilang): - """ Returns a string with this term as a link in a format ready for Wiktionary + def wikiWrapAsTranslation(self, wikilang): + """ Returns a string with this term as a link in a format ready for + Wiktionary + """ return Verb.wikiWrapForList(self, wikilang) diff --git a/wiktionary/termtest.py b/wiktionary/termtest.py index 1d71db3..7f4bf2e 100755 --- a/wiktionary/termtest.py +++ b/wiktionary/termtest.py @@ -6,59 +6,69 @@ import term import unittest
+ class KnownValues(unittest.TestCase): knownValues = ( - ('en','noun','en','example','', "'''example'''", '[[example]]'), - ('en','noun','nl','voorbeeld','n', "'''voorbeeld''' ''n''", "[[voorbeeld]] ''n''"), - ('nl','noun','nl','voorbeeld','n', "'''voorbeeld''' {{n}}", "[[voorbeeld]] {{n}}"), - ('en','verb','en','to show','', "'''to show'''", 'to [[show]]'), - ('en','verb','nl','tonen','', "'''tonen'''", "[[tonen]]"), - ('nl','verb','nl','tonen','', "'''tonen'''", "[[tonen]]"), - ) + ('en', 'noun', 'en', 'example', '', "'''example'''", '[[example]]'), + ('en', 'noun', 'nl', 'voorbeeld', 'n', "'''voorbeeld''' ''n''", + "[[voorbeeld]] ''n''"), + ('nl', 'noun', 'nl', 'voorbeeld', 'n', "'''voorbeeld''' {{n}}", + "[[voorbeeld]] {{n}}"), + ('en', 'verb', 'en', 'to show', '', "'''to show'''", 'to [[show]]'), + ('en', 'verb', 'nl', 'tonen', '', "'''tonen'''", "[[tonen]]"), + ('nl', 'verb', 'nl', 'tonen', '', "'''tonen'''", "[[tonen]]"), + )
def testTermKnownValuesWikiWrapAsExample(self): """WikiWrap output correct for a term used as an example""" - for wikilang, pos, termlang, thisterm, termgender, asexample, forlist in self.knownValues: - if pos=='noun': + for wikilang, pos, termlang, thisterm, termgender, asexample, \ + forlist in self.knownValues: + if pos == 'noun': aterm = term.Noun(termlang, thisterm, gender=termgender) - if pos=='verb': + if pos == 'verb': aterm = term.Verb(termlang, thisterm) result = aterm.wikiWrapAsExample(wikilang) self.assertEqual(asexample, result)
def testTermKnownValuesWikiWrapForList(self): """WikiWrap output correct for a term when used in a list""" - for wikilang, pos, termlang, thisterm, termgender, asexample, forlist in self.knownValues: - if pos=='noun': + for wikilang, pos, termlang, thisterm, termgender, asexample, \ + forlist in self.knownValues: + if pos == 'noun': aterm = term.Noun(termlang, thisterm, gender=termgender) - if pos=='verb': + if pos == 'verb': aterm = term.Verb(termlang, thisterm) result = aterm.wikiWrapForList(wikilang) self.assertEqual(forlist, result)
def testTermKnownValuesWikiWrapAsTranslation(self): """WikiWrap output correct for a term when used as a translation""" - for wikilang, pos, termlang, thisterm, termgender, asexample, forlist in self.knownValues: - if pos=='noun': + for wikilang, pos, termlang, thisterm, termgender, asexample, \ + forlist in self.knownValues: + if pos == 'noun': aterm = term.Noun(termlang, thisterm, gender=termgender) - if pos=='verb': + if pos == 'verb': aterm = term.Verb(termlang, thisterm) result = aterm.wikiWrapAsTranslation(wikilang) self.assertEqual(forlist, result)
knownParserValues = ( - ("[[example]] ",'en','example','',1), - ("[[voorbeeld]] ''n''",'nl','voorbeeld','n',1), - ("[[voorbeeld]] {{n}}",'nl','voorbeeld','n',1), - ("[[voorbeelden]] ''n, pl''",'nl','voorbeelden','n',2), - ("[[voorbeelden]] {{n}},{{p}}",'nl','voorbeelden','n',2), -# ("to [[show]]",'en','to show','',1), - ("[[tonen]]",'nl','tonen','',1), - ) + ("[[example]] ", 'en', 'example', '', 1), + ("[[voorbeeld]] ''n''", 'nl', 'voorbeeld', 'n', 1), + ("[[voorbeeld]] {{n}}", 'nl', 'voorbeeld', 'n', 1), + ("[[voorbeelden]] ''n, pl''", 'nl', 'voorbeelden', 'n', 2), + ("[[voorbeelden]] {{n}},{{p}}", 'nl', 'voorbeelden', 'n', 2), +## ("to [[show]]", 'en', 'to show', '', 1), + ("[[tonen]]", 'nl', 'tonen', '', 1), + )
def testParser(self): - '''self.term, self.gender and self.number parsed correctly from Wiki format''' - for wikiline, termlang, thisterm, termgender, termnumber in self.knownParserValues: + '''self.term, self.gender and self.number parsed correctly from Wiki + format + + ''' + for wikiline, termlang, thisterm, termgender, termnumber in \ + self.knownParserValues: aterm = term.Term(termlang, '', wikiline=wikiline) self.assertEqual(aterm.getTerm(), thisterm) self.assertEqual(aterm.getGender(), termgender) @@ -66,4 +76,3 @@
if __name__ == "__main__": unittest.main() - diff --git a/wiktionary/wiktionarypage.py b/wiktionary/wiktionarypage.py index da1506c..77efae1 100644 --- a/wiktionary/wiktionarypage.py +++ b/wiktionary/wiktionarypage.py @@ -3,11 +3,17 @@
''' This module contains code to store Wiktionary content in Python objects. -The objects can output the content again in Wiktionary format by means of the wikiWrap methods +The objects can output the content again in Wiktionary format by means of the +wikiWrap methods
-I'm currently working on a parser that can read the textual version in the various Wiktionary formats and store what it finds in the Python objects. +I'm currently working on a parser that can read the textual version in the +various Wiktionary formats and store what it finds in the Python objects.
-The code is still very much alpha level and the scope of what it can do is still rather limited, only 3 parts of speech, only 2 different Wiktionary output formats, only langnames matrix for about 8 languages. One of the things on the todo list is to harvest the content of this matrix dictionary from the various Wiktionary projects. GerardM put them all on line in templates already. +The code is still very much alpha level and the scope of what it can do is +still rather limited, only 3 parts of speech, only 2 different Wiktionary +output formats, only langnames matrix for about 8 languages. One of the things +on the todo list is to harvest the content of this matrix dictionary from the +various Wiktionary projects. GerardM put them all on line in templates already. '''
import entry @@ -18,30 +24,33 @@ import meaning import term
+ class WiktionaryPage: """ This class contains all that can appear on one Wiktionary page """
- def __init__(self,wikilang,term): # wikilang here refers to the language of the Wiktionary this page belongs to + def __init__(self, wikilang, term): """ Constructor - Called with two parameters: - - the language of the Wiktionary the page belongs to - - the term that is described on this page + Called with two parameters: + - the language of the Wiktionary the page belongs to + - the term that is described on this page + """ - self.wikilang=wikilang - self.term=term - self.entries = {} # entries is a dictionary of entry objects indexed by entrylang + self.wikilang = wikilang + self.term = term + # entries is a dictionary of entry objects indexed by entrylang + self.entries = {} self.sortedentries = [] self.interwikilinks = [] self.categories = []
- def setWikilang(self,wikilang): + def setWikilang(self, wikilang): """ This method allows to switch the language on the fly """ - self.wikilang=wikilang + self.wikilang = wikilang
- def addEntry(self,entry): + def addEntry(self, entry): """ Add an entry object to this page object """ -# self.entries.setdefault(entry.entrylang, []).append(entry) - self.entries[entry.entrylang]=entry +## self.entries.setdefault(entry.entrylang, []).append(entry) + self.entries[entry.entrylang] = entry
def listEntries(self): """ Returns a dictionary of entry objects for this entry """ @@ -54,136 +63,146 @@
if not self.entries == {}: self.sortedentries = self.entries.keys() - self.sortedentries.sort(sortonlanguagename.sortonlanguagename(structs.langnames[self.wikilang])) + self.sortedentries.sort(sortonlanguagename.sortonlanguagename( + structs.langnames[self.wikilang]))
try: - samelangentrypos=self.sortedentries.index(self.wikilang) - except (ValueError): + samelangentrypos = self.sortedentries.index(self.wikilang) + except ValueError: # wikilang isn't in the list, do nothing pass else: - samelangentry=self.sortedentries[samelangentrypos] + samelangentry = self.sortedentries[samelangentrypos] self.sortedentries.remove(self.wikilang) - self.sortedentries.insert(0,samelangentry) + self.sortedentries.insert(0, samelangentry)
try: - translingualentrypos=self.sortedentries.index(u'translingual') - except (ValueError): + translingualentrypos = self.sortedentries.index(u'translingual') + except ValueError: # translingual isn't in the list, do nothing pass else: - translingualentry=self.sortedentries[translingualentrypos] + translingualentry = self.sortedentries[translingualentrypos] self.sortedentries.remove(u'translingual') - self.sortedentries.insert(0,translingualentry) + self.sortedentries.insert(0, translingualentry)
- def addLink(self,link): + def addLink(self, link): """ Add a link to another wikimedia project """ - link=link.replace('[','').replace(']','') - pos=link.find(':') - if pos!=1: - link=link[:pos] + link = link.replace('[', '').replace(']', '') + pos = link.find(':') + if pos != 1: + link = link[:pos] self.interwikilinks.append(link) - # print self.interwikilinks +## print self.interwikilinks
- def addCategory(self,category): + def addCategory(self, category): """ Add a link to another wikimedia project """ self.categories.append(category)
- def parseWikiPage(self,content): + def parseWikiPage(self, content): '''This function will parse the content of a Wiktionary page - and read it into our object structure. - It returns a list of dictionaries. Each dictionary contains a header object - and the textual content found under that header. Only relevant content is stored. - Empty lines and lines to create tables for presentation to the user are taken out.''' + and read it into our object structure. + It returns a list of dictionaries. Each dictionary contains a header + object and the textual content found under that header. Only relevant + content is stored. Empty lines and lines to create tables for + presentation to the user are taken out. + + '''
templist = [] context = {} aheader = '' - splitcontent=[] - content=content.split('\n') + splitcontent = [] + content = content.split('\n') for line in content: - # print line +## print line # Let's get rid of line breaks and extraneous white space - line=line.replace('\n','').strip() - # Let's start by looking for general stuff, that provides information which is - # interesting to store at the page level + line = line.replace('\n', '').strip() + # Let's start by looking for general stuff, that provides + # information which is interesting to store at the page level if '{wikipedia}' in line.lower(): self.addLink('wikipedia') continue if '[[category:' in line.lower(): - category=line.split(':')[1].replace(']','') + category = line.split(':')[1].replace(']', '') self.addCategory(category) -# print 'category: ', category +## print 'category: ', category continue if '|' not in line: - bracketspos=line.find('[[') - colonpos=line.find(':') - if bracketspos!=-1 and colonpos!=-1 and bracketspos < colonpos: + bracketspos = line.find('[[') + colonpos = line.find(':') + if bracketspos != -1 and colonpos != -1 and \ + bracketspos < colonpos: # This seems to be an interwikilink # If there is a pipe in it, it's not a simple interwikilink - linkparts=line.replace(']','').replace('[','').split(':') - lang=linkparts[0] - linkto=linkparts[1] - if len(lang)>1 and len(lang)<4: - self.addLink(lang+':'+linkto) + linkparts = line.replace(']', + '').replace('[', '').split(':') + lang = linkparts[0] + linkto = linkparts[1] + if len(lang) > 1 and len(lang) < 4: + self.addLink(lang + ':' + linkto) continue - # store empty lines literally, this is necessary for the blocks we don't parse - # and will return literally - if len(line) <2: + # store empty lines literally, this is necessary for the blocks we + # don't parse and will return literally + if len(line) < 2: templist.append(line) continue -# print 'line0:',line[0], 'line-2:',line[-2],'|','stripped line-2',line.rstrip()[-2] - if 
line.strip()[0]=='='and line.rstrip()[-2]=='=' or '{{-' in line and '-}}' in line: - # When a new header is encountered, it is necessary to store the information - # encountered under the previous header. +## print 'line0:', line[0], 'line-2:', line[-2],'|', +## print 'stripped line-2', print line.rstrip()[-2] + if line.strip()[0] == '=' and line.rstrip()[-2] == '=' or \ + '{{-' in line and '-}}' in line: + # When a new header is encountered, it is necessary to store + # the information encountered under the previous header. if templist and aheader: - tempdictstructure={'text': templist, - 'header': aheader, - 'context': copy.copy(context), - } - templist=[] + tempdictstructure = {'text': templist, + 'header': aheader, + 'context': copy.copy(context), + } + templist = [] splitcontent.append(tempdictstructure) -# print "splitcontent: ",splitcontent,"\n\n" - aheader=header.Header(line) -# print "Header parsed:",aheader.level, aheader.header, aheader.type, aheader.contents - if aheader.type==u'lang': - context['lang']=aheader.contents - if aheader.type==u'pos': +## print "splitcontent: ",splitcontent,"\n\n" + aheader = header.Header(line) +## print "Header parsed:", aheader.level, aheader.header, +## print aheader.type, aheader.contents + if aheader.type == u'lang': + context['lang'] = aheader.contents + if aheader.type == u'pos': if not 'lang' in context: # This entry lacks a language indicator, - # so we assume it is the same language as the Wiktionary we're working on - context['lang']=self.wikilang - context['pos']=aheader.contents + # so we assume it is the same language as the + # Wiktionary we're working on + context['lang'] = self.wikilang + context['pos'] = aheader.contents
else: # It's not a header line, so we add it to a temporary list # containing content lines - if aheader.contents==u'trans': - # Under the translations header there is quite a bit of stuff - # that's only needed for formatting, we can just skip that - # and go on processing the next line + if aheader.contents == u'trans': + # Under the translations header there is quite a bit of + # stuff that's only needed for formatting, we can just skip + # that and go on processing the next line lower = line.lower() - if '{top}' in lower: continue - if '{mid}' in lower: continue - if '{bottom}' in lower: continue - if '|-' in line: continue - if '{|' in line: continue - if '|}' in line: continue - if 'here-->' in lower: continue - if 'width=' in lower: continue - if '<!--left column' in lower: continue - if '<!--right column' in lower: continue + if ('{top}' in lower or + '{mid}' in lower or + '{bottom}' in lower or + '|-' in line or + '{|' in line or + '|}' in line or + 'here-->' in lower or + 'width=' in lower or + '<!--left column' in lower or + '<!--right column' in lower): + continue
templist.append(line)
# Let's not forget the last block that was encountered if templist: - tempdictstructure={'text': templist, - 'header': aheader, - 'context': copy.copy(context), - } + tempdictstructure = {'text': templist, + 'header': aheader, + 'context': copy.copy(context), + } splitcontent.append(tempdictstructure) -
# make sure variables are defined before they are used gender = sample = plural = diminutive = label = definition = '' @@ -191,127 +210,154 @@ diminutive = False examples = [] for contentblock in splitcontent: - headercontent=contentblock['header'].contents + headercontent = contentblock['header'].contents
-# print "contentblock:",contentblock -# print contentblock['header'] +## print "contentblock:",contentblock +## print contentblock['header'] # Now we parse the text blocks. - # Let's start by describing what to do with content found under the POS header - if contentblock['header'].type==u'pos': - flag=False + # Let's start by describing what to do with content found under + # the POS header + if contentblock['header'].type == u'pos': + flag = False for line in contentblock['text']: -# print line +## print line if line[:3] == "'''": # This seems to be an ''inflection line'' # It can be built up like this: '''sample''' - # Or more elaborately like this: '''staal''' ''n'' (Plural: [[stalen]], diminutive: [[staaltje]]) + # Or more elaborately like this: + # '''staal''' ''n'' (Plural: [[stalen]], + # diminutive: [[staaltje]]) # Or like this: {{en-infl-reg-other-e|ic|e}} # Let's first get rid of parentheses and brackets: - line=line.replace('(','').replace(')','').replace('[','').replace(']','') + line = line.replace('(', '').replace(')', '').replace( + '[', '').replace(']', '') # Then we can split it on the spaces for part in line.split(' '): -# print part[:3], "Flag:", flag - if flag==False and part[:3] == "'''": - sample=part.replace("'",'').strip() -# print 'Sample:', sample - # OK, so this should be an example of the term we are describing - # maybe it is necessary to compare it to the title of the page +## print part[:3], "Flag:", flag + if not flag and part[:3] == "'''": + sample = part.replace("'", '').strip() +## print 'Sample:', sample + # OK, so this should be an example of the term + # we are describing. 
Maybe it is necessary to + # compare it to the title of the page if sample: for subpart in line.split(' '): - maybegender=part.replace("'",'').replace("}",'').replace("{",'').lower() - if maybegender=='m': - gender='m' - if maybegender=='f': - gender='f' - if maybegender=='n': - gender='n' - if maybegender=='c': - gender='c' - if maybegender[:1]=='p': - number=2 - if maybegender[:3]=='dim': - diminutive=True -# print 'Gender: ',gender - if part.replace("'",'')[:2].lower()=='pl': - flag='plural' - if part.replace("'",'')[:3].lower()=='dim': - flag='diminutive' - if flag=='plural': - plural=part.replace(',','').replace("'",'').strip() -# print 'Plural: ',plural - if flag=='diminutive': - diminutive=part.replace(',','').replace("'",'').strip() -# print 'Diminutive: ',diminutive + maybegender = part.replace( + "'", '').replace("}", '').replace( + "{", '').lower() + if maybegender == 'm': + gender = 'm' + if maybegender == 'f': + gender = 'f' + if maybegender == 'n': + gender = 'n' + if maybegender == 'c': + gender = 'c' + if maybegender[:1] == 'p': + number = 2 + if maybegender[:3] == 'dim': + diminutive = True +## print 'Gender: ',gender + if part.replace("'", '')[:2].lower() == 'pl': + flag = 'plural' + if part.replace("'", '')[:3].lower() == 'dim': + flag = 'diminutive' + if flag == 'plural': + plural = part.replace(',', '').replace( + "'", '').strip() +## print 'Plural: ',plural + if flag == 'diminutive': + diminutive = part.replace( + ',', '').replace("'", '').strip() +## print 'Diminutive: ',diminutive if line[:2] == "{{": # Let's get rid of accolades: - line=line.replace('{','').replace('}','') + line = line.replace('{', '').replace('}', '') # Then we can split it on the dashes - parts=line.split('-') - lang=parts[0] - what=parts[1] - mode=parts[2] - other=parts[3] - infl=parts[4].split('|') + parts = line.split('-') + lang = parts[0] + what = parts[1] + mode = parts[2] + other = parts[3] + infl = parts[4].split('|') if sample: # We can create a Term object # 
TODO which term object depends on the POS -# print "contentblock['context'].['lang']", contentblock['context']['lang'] - if headercontent=='noun': - theterm=term.Noun(lang=contentblock['context']['lang'], term=sample, gender=gender, number=number, diminutive=diminutive) - if headercontent=='verb': - theterm=term.Verb(lang=contentblock['context']['lang'], term=sample) - sample='' -# raw_input("") +## print "contentblock['context'].['lang']", +## print contentblock['context']['lang'] + if headercontent == 'noun': + theterm = term.Noun( + lang=contentblock['context']['lang'], + term=sample, gender=gender, number=number, + diminutive=diminutive) + if headercontent == 'verb': + theterm = term.Verb( + lang=contentblock['context']['lang'], + term=sample) + sample = '' +## raw_input("") if line[:1].isdigit(): - # Somebody didn't like automatic numbering and added static numbers - # of their own. Let's get rid of them + # Somebody didn't like automatic numbering and added + # static numbers of their own. Let's get rid of them while line[:1].isdigit(): - line=line[1:] - # and replace them with a hash, so the following if block picks it up + line = line[1:] + # and replace them with a hash, so the following if + # block picks it up line = '#' + line if line[:1] == "#": # This probably is a definition - # If we already had a definition we need to store that one's data - # in a Meaning object and make that Meaning object part of the Page object + # If we already had a definition we need to store that + # one's data in a Meaning object and make that Meaning + # object part of the Page object if definition: - ameaning = meaning.Meaning(term=theterm,definition=definition, label=label, examples=examples) + ameaning = meaning.Meaning(term=theterm, + definition=definition, + label=label, + examples=examples)
# sample # plural and diminutive belong with the Noun object - # comparative and superlative belong with the Adjective object - # conjugations belong with the Verb object + # comparative and superlative belong with the + # Adjective object conjugations belong with the + # Verb object
# Reset everything for the next round - sample = plural = diminutive = label = definition = '' + sample = plural = diminutive = label = '' + definition = '' examples = []
- if not contentblock['context']['lang'] in self.entries: - # If no entry for this language has been foreseen yet - # let's create one - anentry = entry.Entry(contentblock['context']['lang']) + if not contentblock[ + 'context']['lang'] in self.entries: + # If no entry for this language has been + # foreseen yet. Let's create one + anentry = entry.Entry( + contentblock['context']['lang']) # and add it to our page object self.addEntry(anentry) # Then we can easily add this meaning to it. anentry.addMeaning(ameaning)
- pos=line.find('<!--') - if pos!=-1 and pos < 4: - # A html comment at the beginning of the line means this entry already has disambiguation labels, great - pos2=line.find('-->') - label=line[pos+4:pos2] - definition=line[pos2+1:] -# print 'label:',label + pos = line.find('<!--') + if pos != -1 and pos < 4: + # A html comment at the beginning of the line + # means this entry already has disambiguation + # labels, great + pos2 = line.find('-->') + label = line[pos + 4:pos2] + definition = line[pos2 + 1:] +## print 'label:',label else: - definition=line[1:].strip() -# print "Definition: ", definition + definition = line[1:].strip() +## print "Definition: ", definition if line[:2] == "#:": # This is an example for the preceding definition - example=line[2:] -# print "Example:", example + example = line[2:] +## print "Example:", example examples.add(example) # Make sure we store the last definition if definition: - ameaning = meaning.Meaning(term=theterm, definition=definition, label=label, examples=examples) + ameaning = meaning.Meaning(term=theterm, definition=definition, + label=label, examples=examples) if not contentblock['context']['lang'] in self.entries: # If no entry for this language has been foreseen yet # let's create one @@ -321,22 +367,32 @@ # Then we can easily add this meaning to it. anentry.addMeaning(ameaning)
- winner = False # This is going to contain the Meaning object which has the Definition which matches the Concisedef of the entry we are working on right now - if headercontent=='trans' or headercontent=='syn' or headercontent=='ant': - # On the English Wiktionary we will find concisedefs here to link definitions to the content of these sections, but only if there is more than one definition. - print "number of meanings:",len(anentry.meanings.keys()) - concisedefclean='' + # This is going to contain the Meaning object which has the + # Definition which matches the Concisedef of the entry we are + # working on right now: + winner = False + if headercontent == 'trans' or headercontent == 'syn' or \ + headercontent == 'ant': + # On the English Wiktionary we will find concisedefs here to + # link definitions to the content of these sections, but only + # if there is more than one definition. + print "number of meanings:", len(anentry.meanings.keys()) + concisedefclean = '' for line in contentblock['text']: if line[:3] == "'''": # This seems to be a line containing a concisedef - concisedef=line.replace("'''",'').strip() - concisedefclean=concisedef.replace("(",'').replace(")",'').replace("'",'').replace(":",'').replace(".",'').lower() + concisedef = line.replace("'''", '').strip() + concisedefclean = concisedef.replace( + "(", '').replace(")", '').replace("'", '').replace( + ":", '').replace(".", '').lower() if line[:2] == "*(": # This seems to be a line containing a concisedef - pos=line.find(')') - concisedef=line[2:pos].strip() - concisedefclean=concisedef.replace("(",'').replace(")",'').replace("'",'').replace(":",'').replace(".",'').lower() - restofline=line[pos+2:].strip() + pos = line.find(')') + concisedef = line[2:pos].strip() + concisedefclean = concisedef.replace("(", '').replace( + ")", '').replace("'", '').replace(":", '').replace( + ".", '').lower() + restofline = line[pos + 2:].strip() # Now we have this concisedef, it's worthless if it can't # be 
matched to a definition in order to know to what # meaning the following content belongs to @@ -344,39 +400,52 @@ # Let's start by creating a list of meanings for the entry # we're working on if concisedefclean: - highest=0 - winner=anentry.meanings[contentblock['context']['pos']][0] - for anothermeaning in anentry.meanings[contentblock['context']['pos']]: - score=0 + highest = 0 + winner = anentry.meanings[ + contentblock['context']['pos']][0] + for anothermeaning in anentry.meanings[ + contentblock['context']['pos']]: + score = 0 for word in concisedefclean.split(): - definition=anothermeaning.definition.replace("(",'').replace(")",'').replace("'",'').replace(":",'').replace(".",'').replace("#",'').lower() - if len(word)>1 and ' '+word+' ' in definition: - score+=1 - if len(word)>2 and word in definition: - score+=1 - if score>highest: - highest=score - winner=anothermeaning -# print 'winner:',winner.definition, 'score:',highest + definition = anothermeaning.definition.replace( + "(", '').replace(")", '').replace( + "'", '').replace(":", '').replace( + ".", '').replace("#", '').lower() + if len(word) > 1 and \ + ' %s ' % word in definition: + score += 1 + if len(word) > 2 and word in definition: + score += 1 + if score > highest: + highest = score + winner = anothermeaning +## print 'winner:', winner.definition, 'score:', highest winner.setConciseDef(concisedef) - if headercontent=='trans': - """ - We have to find a way to read the rest of the lines until the next ConciseDef into a structure, that can be processed later on. In contrast to a list of synonyms where the synonyms are on the rest of the lines, translations are on the following lines. - It's also possible that there is no concisedef and that the translation's block simpy starts... or that there are numbers instead of concisedefs. 
- """ + if headercontent == 'trans': + # We have to find a way to read the rest of the + # lines until the next ConciseDef into a structure, + # that can be processed later on. In contrast to a + # list of synonyms where the synonyms are on the + # rest of the lines, translations are on the + # following lines. It's also possible that there + # is no concisedef and that the translation's block + # simpy starts... or that there are numbers instead + # of concisedefs. + pass
- if headercontent=='syn': -# print 'syn',restofline + if headercontent == 'syn': +## print 'syn', restofline winner.parseSynonyms(restofline) - if headercontent=='trans': -# print 'trans',restofline + if headercontent == 'trans': +## print 'trans', restofline winner.parseTranslations(line)
# raw_input("")
def wikiWrap(self): """ Returns a string that is ready to be submitted to Wiktionary for - this page + this page + """ page = '' self.sortEntries() @@ -387,21 +456,22 @@ print "Entries:", self.entries[index] entry = self.entries[index] print entry - if first == False: - page = page + '\n----\n' + if not first: + page += '\n----\n' else: first = False page = page + entry.wikiWrap(self.wikilang) # Add interwiktionary links at bottom of page for link in self.interwikilinks: - page = page + '[' + link + ':' + self.term + ']\n' + page += '[' + link + ':' + self.term + ']\n'
return page
def showContents(self): """ Prints the contents of all the subobjects contained in this page. - Every subobject is indented a little further on the screen. - The primary purpose is to help keep one's sanity while debugging. + Every subobject is indented a little further on the screen. + The primary purpose is to help keep one's sanity while debugging. + """ indentation = 0 print ' ' * indentation + 'wikilang = %s' % self.wikilang @@ -411,15 +481,11 @@ entrieskeys = self.entries.keys() for entrieskey in entrieskeys: for entry in self.entries[entrieskey]: - entry.showContents(indentation+2) + entry.showContents(indentation + 2)
if __name__ == '__main__': - temp() - - ofn = 'wiktionaryentry.txt' content = open(ofn).readlines() - - apage = WiktionaryPage(wikilang,pagetopic) + apage = WiktionaryPage(wikilang, pagetopic) apage.parseWikiPage(content) diff --git a/wiktionary/wiktionarypagetest.py b/wiktionary/wiktionarypagetest.py index 2e6566b..1c65466 100644 --- a/wiktionary/wiktionarypagetest.py +++ b/wiktionary/wiktionarypagetest.py @@ -45,9 +45,9 @@
""" knownvalues = ( -{'wikilang': 'en', - 'term': 'nut', - 'wikiformat': u"""==English== + {'wikilang': 'en', + 'term': 'nut', + 'wikiformat': u"""==English== ===Etymology=== From Middle English [[nute]], from Old English [[hnutu]]. <!-- Is Latin [[nux]], nuc- a cognate? --> ===Pronunciation=== @@ -179,210 +179,242 @@ [[Category:Trees]] [[category:Foods]] """, - 'internalrep': - ( - [u'1000 English basic words', u'Colors', u'Browns', u'Trees', u'Foods'], - [u'io', 'la'], - {u'en': - [u'nut', None, u'nuts', - [{'definition': u'A hard-shelled seed.', - 'concisedef': u'seed', + 'internalrep': + ( + [u'1000 English basic words', u'Colors', u'Browns', u'Trees', + u'Foods'], + [u'io', 'la'], + {u'en': + [u'nut', None, u'nuts', + [{'definition': u'A hard-shelled seed.', + 'concisedef': u'seed', + 'trans': {'remark': '', + 'alltrans': { + 'nl': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"noot", 'f', 1)} + ] + }, + 'de': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"Nuss", 'f', 1)} + ] + }, + 'it': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"noce", 'f', 1)} + ] + }, + 'la': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"nux", '', 1)} + ] + }, + } + } + }, + {'definition': + u"A piece of metal, often [[hexagonal]], with a hole through it with internal threading intended to fit on to a bolt.", + 'concisedef': u'that fits on a bolt', + 'trans': {'remark': '', + 'alltrans': { + 'nl': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"moer", 'f', 1)} + ] + }, + 'fr': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"écrou", 'm', 1)} + ] + }, + 'de': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"Mutter", 'f', 1)} + ] + }, + 'it': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"dado", 'm', 1)} + ] + } + } + } + }, + {'definition': u"(''informal'') An insane person.", + 'concisedef': 
u"informal: insane person", + 'syns': {'remark': '', + 'synonyms': [{'remark': '', + 'synonym': u"loony"}, + {'remark': '', + 'synonym': u"nutcase"}, + {'remark': '', + 'synonym': u"nutter"} + ] + }, 'trans': {'remark': '', 'alltrans': { - 'nl': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"noot", 'f', 1)} - ] - }, -# 'fr': u"""''no generic translation exists''; [[noix]] ''f'' ''is often used, but this actually means "[[walnut]]"''""", - 'de': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"Nuss", 'f', 1)} - ] - }, - 'it': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"noce", 'f', 1)} - ] - }, - 'la': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"nux", '', 1)} - ] - }, - } - } - }, - {'definition': u"A piece of metal, often [[hexagonal]], with a hole through it with internal threading intended to fit on to a bolt.", - 'concisedef': u'that fits on a bolt', - 'trans': {'remark': '', - 'alltrans': { - 'nl': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"moer", 'f', 1)} - ] - }, - 'fr': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"écrou", 'm', 1)} - ] - }, - 'de': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"Mutter", 'f', 1)} - ] - }, - 'it': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"dado", 'm', 1)} - ] - } - } - } - }, - {'definition': u"(''informal'') An insane person.", - 'concisedef': u"informal: insane person", - 'syns': {'remark': '', - 'synonyms': [{'remark': '', - 'synonym': u"loony"}, - {'remark': '', - 'synonym': u"nutcase"}, - {'remark': '', - 'synonym': u"nutter"} - ] - }, - 'trans': {'remark': '', - 'alltrans': { - 'nl': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"gek", 'm', 1)}, - {'remark': '', - 'translation': (u"gekkin", 'f', 1)}, - {'remark': '', - 'translation': (u"zot", 'm', 1)}, - {'remark': '', - 'translation': (u"zottin", 'f', 1)} - ] - }, 
- 'fr': {'remark': '', - 'translations': [{'remark': '', - 'translation': ("fou", 'm', 1)}, - {'remark': '', - 'translation': ("folle", 'f', 1)} - ] - }, - 'de': {'remark': '', - 'translations': [{'remark': '', - 'translation': ("Irre", 'mf', 1)}, - {'remark': '', - 'translation': ("Irrer", 'm indef.', 1)} - ] - } - } - } - }, - {'definition': u"(''slang'') The head.", - 'concisedef': u"slang: the head", - 'syns': {'remark': '(See further synonyms under [[head]])', - 'synonyms': [{'remark': '', - 'synonym': u"bonce"}, - {'remark': '', - 'synonym': u"noddle"}]}, - 'trans': {'remark': '', - 'alltrans': { - 'de': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"Birne", 'f', 1)}, - {'remark': '', - 'translation': ("Rübe", 'f', 1)}, - {'remark': '', - 'translation': ("Dötz", 'm', 1)} - ] - } - } + 'nl': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"gek", 'm', 1)}, + {'remark': '', + 'translation': + (u"gekkin", 'f', 1)}, + {'remark': '', + 'translation': + (u"zot", 'm', 1)}, + {'remark': '', + 'translation': + (u"zottin", 'f', 1)} + ] + }, + 'fr': {'remark': '', + 'translations': [{'remark': '', + 'translation': ("fou", 'm', 1)}, + {'remark': '', + 'translation': + ("folle", 'f', 1)} + ] + }, + 'de': {'remark': '', + 'translations': [{'remark': '', + 'translation': + ("Irre", 'mf', 1)}, + {'remark': '', + 'translation': + ("Irrer", 'm indef.', 1)} + ] + } } - }, - {'definition': u"(''slang; rarely used in the singular'') A testicle.", - 'concisedef': u"slang: testicle", - 'syns': {'remark': '', - 'synonyms': [{'remark': '', - 'synonym': u"ball"}, - {'remark': "(''taboo slang'')", - 'synonym': u"bollock"}, - {'remark': '', - 'synonym': u"nad"}]}, - 'trans': {'remark': '', - 'alltrans': {'nl': {'remark': '<!--Never heard this before-->', - 'translations': [{'remark': '', - 'translation': (u"noten", 'm', 2)}, - {'remark': '', - 'translation': ("bal", 'm', 1)}, - {'remark': '', - 'translation': ("teelbal", 'm', 1)} - ] - }, - 
'fr': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"couille", 'f', 1)} - ] - }, - 'de': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"Ei", 'n', 1)}, - {'remark': u"''lately:''", - 'translation': (u"Nuss", 'f', 1)} - ] - }, - 'es': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"cojone", '', 1)}, - {'remark': '', - 'translation': (u"huevo", '', 1)} - ] - } - } - }, - } - ], - ], - u'nl': - [u'nut', 'n', None, - [{'definition': u'[[use]], [[benefit]]', 'concisedef': u''}] - ], - } - ) - }, -{'wikilang': 'nl', - 'term': 'dummy', - 'wikiformat': u""" + } + }, + {'definition': u"(''slang'') The head.", + 'concisedef': u"slang: the head", + 'syns': {'remark': '(See further synonyms under [[head]])', + 'synonyms': [{'remark': '', + 'synonym': u"bonce"}, + {'remark': '', + 'synonym': u"noddle"}]}, + 'trans': {'remark': '', + 'alltrans': { + 'de': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"Birne", 'f', 1)}, + {'remark': '', + 'translation': + ("Rübe", 'f', 1)}, + {'remark': '', + 'translation': + ("Dötz", 'm', 1)} + ] + } + } + } + }, + {'definition': + u"(''slang; rarely used in the singular'') A testicle.", + 'concisedef': u"slang: testicle", + 'syns': {'remark': '', + 'synonyms': [{'remark': '', + 'synonym': u"ball"}, + {'remark': "(''taboo slang'')", + 'synonym': u"bollock"}, + {'remark': '', + 'synonym': u"nad"}]}, + 'trans': {'remark': '', + 'alltrans': { + 'nl': {'remark': + '<!--Never heard this before-->', + 'translations': [{'remark': '', + 'translation': + (u"noten", 'm', 2)}, + {'remark': '', + 'translation': + ("bal", 'm', 1)}, + {'remark': '', + 'translation': + ("teelbal", 'm', 1)} + ] + }, + 'fr': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"couille", 'f', 1)} + ] + }, + 'de': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"Ei", 'n', 1)}, + {'remark': + u"''lately:''", + 'translation': + (u"Nuss", 'f', 1)} + ] + }, + 
'es': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"cojone", '', 1)}, + {'remark': '', + 'translation': + (u"huevo", '', 1)} + ] + } + } + }, + } + ], + ], + u'nl': + [u'nut', 'n', None, + [{'definition': u'[[use]], [[benefit]]', 'concisedef': u''}] + ], + } + ) + }, + {'wikilang': 'nl', + 'term': 'dummy', + 'wikiformat': u""" {{-nl-}} {{-noun-}} '''dummy''' {{m}} """, - 'internalrep': - ( - [u''], - [u''], - {u'nl': - [u'dummy', 'm', u"dummy's", - [{'definition': u'', - 'concisedef': u'', - 'trans': {'remark': '', - 'alltrans': { - 'nl': {'remark': '', - 'translations': [{'remark': '', - 'translation': (u"", '', 1)} - ] - }, - } - } + 'internalrep': + ( + [u''], + [u''], + {u'nl': + [u'dummy', 'm', u"dummy's", + [{'definition': u'', + 'concisedef': u'', + 'trans': {'remark': '', + 'alltrans': { + 'nl': {'remark': '', + 'translations': [{'remark': '', + 'translation': + (u"", '', 1)} + ] + }, + } + } + } + ], + ], + } + ) } - ], - ], - } ) - } - ) # def testWhetherCategoriesAreParsedProperly(self): # """Test whether Categories are parsed properly""" # for value in self.knownvalues: @@ -467,6 +499,7 @@ # if concisedef!='' and refsyns.has_key(concisedef) and resultsyns.has_key(concisedef): # self.assertEqual(resultsyns[concisedef], refsyns[concisedef]) # + def testWhetherTranslationsAreParsedProperly(self): """Test whether translations are parsed properly""" for value in self.knownvalues: @@ -475,8 +508,8 @@ value['term']) apage.parseWikiPage(value['wikiformat']) for entrylang in internalrepresentation.keys(): - definitions=internalrepresentation[entrylang][3] - reftrans={} + definitions = internalrepresentation[entrylang][3] + reftrans = {} for definition in definitions: if 'trans' in definition and definition['trans']: reftrans[definition['concisedef']] = definition['trans'] @@ -487,7 +520,8 @@ for resultmeaning in apage.entries[entrylang].meanings[key]: print resultmeaning.concisedef print 'Translations: ', resultmeaning.getTranslations() 
- resulttrans[resultmeaning.concisedef] = resultmeaning.getTranslations() + resulttrans[ + resultmeaning.concisedef] = resultmeaning.getTranslations()
for concisedef in resulttrans.keys(): if concisedef != '' and concisedef in reftrans and \
pywikibot-commits@lists.wikimedia.org