http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9873
Revision: 9873 Author: binbot Date: 2012-02-07 20:40:12 +0000 (Tue, 07 Feb 2012) Log Message: ----------- Rollback to previous, sorry about that (capitalize will decapitalize letters other than the first)
Modified Paths: -------------- trunk/pywikipedia/solve_disambiguation.py
Modified: trunk/pywikipedia/solve_disambiguation.py =================================================================== --- trunk/pywikipedia/solve_disambiguation.py 2012-02-07 13:19:24 UTC (rev 9872) +++ trunk/pywikipedia/solve_disambiguation.py 2012-02-07 20:40:12 UTC (rev 9873) @@ -73,7 +73,7 @@ # (C) Daniel Herding, 2004 # (C) Andre Engels, 2003-2004 # (C) WikiWichtel, 2004 -# (C) Pywikipedia team, 2003-2012 +# (C) Pywikipedia team, 2003-2009 # __version__='$Id$' # @@ -351,6 +351,9 @@ }, }
+def firstcap(string): + return string[0].upper()+string[1:] + def correctcap(link, text): # If text links to a page with title link uncapitalized, uncapitalize link, # otherwise capitalize it @@ -361,6 +364,21 @@ else: return linkupper
+def firstlinks(page): + #Returns a list of first links of every line beginning with * + #When a disambpage is full of unnecessary links, this may be useful + #to sort out the relevant links. E.g. from line + #*[[Jim Smith (smith)|Jim Smith]] ([[1832]]-[[1932]]) [[English]] [[smith]] + #it returns only Jim Smith (smith) + #No check for page existence, it has already been done. + list = [] + reg = re.compile(r'\*.*?\[\[(.*?)(\||\]\])') + for line in page.get().splitlines(): + found = reg.match(line) + if found: + list.append(found.group(1)) + return list + class ReferringPageGeneratorWithIgnore: def __init__(self, disambPage, primary=False, minimum = 0): self.disambPage = disambPage @@ -520,6 +538,20 @@ # note that the definition of 'letter' varies from language to language. self.linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
+ def firstize(self, page, links): + #duma + #check param + titles = [t.capitalize() for t in firstlinks(page)] + pywikibot.output('\t'.join(titles)) + print len (titles), len(links) + pywikibot.output('\t'.join(l.title() for l in links)) + for l in links[:]: + pywikibot.output(l.title()) + if l.title() not in titles: + links.remove(l) + print 'meghalt' + return links + def treat(self, refPage, disambPage): """ Parameters: @@ -789,8 +821,8 @@ # check if we can create a link with trailing characters # instead of a pipelink elif len(new_page_title) <= len(link_text) \ - and link_text[:len(new_page_title)].capitalize() \ - == new_page_title.capitalize() \ + and firstcap(link_text[:len(new_page_title)]) \ + == firstcap(new_page_title) \ and re.sub(self.trailR, '', link_text[len(new_page_title):]) == '' \ and not section: @@ -871,6 +903,7 @@ primary_topic_format[self.mylang] % disambPage.title()) links = disambPage2.linkedPages() + links = self.firstize(disambPage2, links) links = [correctcap(l, disambPage2.get()) for l in links] except pywikibot.NoPage:
pywikipedia-svn@lists.wikimedia.org