Revision: 7920 Author: xqt Date: 2010-02-09 14:34:01 +0000 (Tue, 09 Feb 2010)
Log Message: ----------- family/wikipedia: read redirect tags via API (dict removed); cc: add fixArabicLetters; solve_disambiguation: update exception list; pywikibot: bugfixes
Modified Paths: -------------- trunk/pywikipedia/cosmetic_changes.py trunk/pywikipedia/family.py trunk/pywikipedia/pywikibot/__init__.py trunk/pywikipedia/pywikibot/textlib.py trunk/pywikipedia/solve_disambiguation.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/cosmetic_changes.py =================================================================== --- trunk/pywikipedia/cosmetic_changes.py 2010-02-08 15:37:02 UTC (rev 7919) +++ trunk/pywikipedia/cosmetic_changes.py 2010-02-09 14:34:01 UTC (rev 7920) @@ -275,6 +275,7 @@ text = self.fixHtml(text) text = self.fixStyle(text) text = self.fixTypo(text) + text = self.fixArabicLetters(text) try: text = isbn.hyphenateIsbnNumbers(text) except isbn.InvalidIsbnException, error: @@ -679,6 +680,40 @@ text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1', exceptions) return text
+ def fixArabicLetters(self, text): + if self.site.lang=='ckb': + exceptions = [ + 'gallery', + 'hyperlink', + 'interwiki', + 'link', + 'math', + 'pre', + 'template', + 'timeline', + 'ref', + 'source', + 'startspace', + ] + text = pywikibot.replaceExcept(text, u',', u'،', exceptions) + text = pywikibot.replaceExcept(text, ur'ه([.، ])', ur'ە\1', exceptions) + text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions) + text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions) + text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions) + text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions) + # replace persian digits + for i in range(0,10): + text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i], u'٠١٢٣٤٥٦٧٨٩'[i], exceptions) + # do not change digits in class, style and table params + pattern = re.compile(u'=".*?"', re.UNICODE) + exceptions.append(pattern) + # do not change digits inside html-tags + pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE) + exceptions.append(pattern) + for i in range(0,10): + text = pywikibot.replaceExcept(text, str(i), u'٠١٢٣٤٥٦٧٨٩'[i], exceptions) + return text + class CosmeticChangesBot: def __init__(self, generator, acceptall = False, comment=u'Robot: Cosmetic changes'): self.generator = generator
Modified: trunk/pywikipedia/family.py =================================================================== --- trunk/pywikipedia/family.py 2010-02-08 15:37:02 UTC (rev 7919) +++ trunk/pywikipedia/family.py 2010-02-09 14:34:01 UTC (rev 7920) @@ -3449,178 +3449,6 @@ def category_namespaces(self, code): return self.namespace(code, 14, all = True)
- # Localised magic words for language code 'xyz' can be found in - # the MediaWiki source code in the file - # /mediawiki/trunk/phase3/languages/messages/MessagesXyz.php - # in the 'magicwords' array - - # Localised redirect codes - - # Note that redirect codes are case-insensitive, so it is enough - # to enter the code in lowercase here. - - # When creating a redirect page, only the first item is looked for. - # When matching for redirects, default 'redirect' is always inserted - # => if default redirect keyword used for a language is not 'redirect', - # it is not necessary to add 'redirect' at the end of the list - redirect = { - 'ab': [u'перенаправление', u'перенапр', u'redirect'], - 'ace': [u'alih'], - 'af': [u'aanstuur'], - 'aln': [u'ridrejto'], - 'als': [u'weiterleitung'], - 'an': [u'redirección'], - 'ar': [u'تحويل'], - 'arn': [u'redirección'], - 'arz': [u'تحويل'], - 'av': [u'перенаправление', u'перенапр'], - 'ay': [u'redirección'], - 'ba': [u'перенаправление', u'перенапр'], - 'bar': [u'weiterleitung'], - 'bat-smg': [u'peradresavimas'], - 'bcc': [u'تغییرمسیر'], - 'be-tarask': [u'перанакіраваньне'], - 'be-x-old': [u'перанакіраваньне'], - 'bg': [u'виж', u'пренасочване'], - 'bm': [u'redirection'], - 'bqi': [u'تغییرمسیر'], - 'br': [u'adkas'], - 'bug': [u'alih'], - 'bs': [u'preusmjeri'], - 'cbk-zam': [u'redirección'], - 'ce': [u'перенаправление', u'перенапр'], - 'cs': [u'přesměruj'], - 'cu': [u'прѣнаправлєниѥ'], - 'cv': [u'перенаправление', u'перенапр'], - 'cy': [u'ail-cyfeirio', u'ailgyfeirio'], - 'de': [u'weiterleitung'], - 'de-at': [u'weiterleitung'], - 'de-ch': [u'weiterleitung'], - 'de-formal': [u'weiterleitung'], - 'dsb': [u'weiterleitung'], - 'el': [u'ανακατευθυνση'], - 'eml': [u'rinvia', u'rinvio'], - 'eo': [u'alidirektu'], - 'es': [u'redirección'], - 'et': [u'suuna'], - 'eu': [u'birzuzendu'], - 'fa': [u'تغییرمسیر'], - 'ff': [u'redirection'], - 'fi': [u'ohjaus', u'uudelleenohjaus'], - 'fiu-vro': [u'saadaq'], - 'fr': [u'redirection'], - 'frp': 
[u'redirèccion', u'redirection'], - 'fur': [u'rinvia', u'rinvio'], - 'ga': [u'athsheoladh'], - 'gag': [u'yönlendirme'], - 'gl': [u'redirección'], - 'glk': [u'تغییرمسیر'], - 'gn': [u'redirección'], - 'gsw': [u'weiterleitung'], - 'he': [u'הפניה'], - 'hr': [u'preusmjeri'], - 'hsb': [u'weiterleitung'], - 'ht': [u'redirection'], - 'hu': [u'átirányítás'], - 'hy': [u'վերահղում'], - 'id': [u'alih'], - 'inh': [u'перенаправление', u'перенапр'], - 'is': [u'tilvísun'], - 'it': [u'rinvia', u'rinvio'], - 'ja': [u'転送', u'リダイレクト'], - 'jv': [u'alih'], - 'ka': [u'გადამისამართება'], - 'kaa': [u'aýdaw', u'айдау'], - 'kk': [u'айдау'], - 'kk-arab': [u'ايداۋ'], - 'kk-cyrl': [u'АЙДАУ'], - 'kk-latn': [u'aýdaw', u'айдау'], - 'km': [u'\u1794\u1789\u17d2\u1787\u17bc\u1793\u1794\u1793\u17d2\u178f', - u'\u1794\u17d2\u178f\u17bc\u179a\u1791\u17b8\u178f\u17b6\u17c6\u1784', - u'\u1794\u17d2\u178a\u17bc\u179a\u1785\u17c6\u178e\u1784\u1787\u17be\u1784', - u'ប្តូរទីតាំងទៅ'], - 'ko': [u'넘겨주기'], - 'ksh': [u'ömleide op', u'ömleidung'], - 'kv': [u'перенаправление', u'перенапр'], - 'lad': [u'redirección'], - 'lb': [u'weiterleitung'], - 'lbe': [u'перенаправление', u'перенапр'], - 'li': [u'doorverwijzing'], - 'lij': [u'rinvia', u'rinvio'], - 'lld': [u'rinvia', u'rinvio'], - 'lmo': [u'rinvia', u'rinvio'], - 'ln': [u'redirection'], - 'lt': [u'peradresavimas'], - 'map-bms': [u'alih'], - 'mg': [u'redirection'], - 'mhr': [u'перенаправление', u'перенапр'], - 'mk': [u'пренасочување', u'види'], - 'ml': [u'തിരിച്ചുവിടുക', u'തിരിച്ചുവിടല്'], - 'mo': [u'redirecteaza'], - 'mr': [u'पुनर्निर्देशन'], - 'mt': [u'rindirizza'], - 'mwl': [u'ancaminar'], - 'myv': [u'перенаправление', u'перенапр'], - 'mzn': [u'تغییرمسیر'], - 'nah': [u'redirección'], - 'nap': [u'rinvia'], - 'nds': [u'wiederleiden', u'weiterleitung'], - 'nds-nl': [u'deurverwiezing', u'doorverwijzing'], - 'new': [u'पुनर्निर्देश'], - 'nl': [u'doorverwijzing'], - 'nn': [u'omdiriger'], - 'no': [u'omdirigering'], - 'oc': [u'redireccion'], - 'os': [u'рарвыст', 
u'перенаправление', u'перенапр'], - 'pdc': [u'weiterleitung'], - 'pl': [u'patrz', u'przekieruj', u'tam'], - 'pms': [u'rinvia', u'rinvio'], - 'pt': [u'redirecionamento'], - 'pt-br': [u'redirecionamento'], - 'qu': [u'pusapuna', u'redirección'], - 'rmy': [u'redirecteaza'], - 'ro': [u'redirecteaza'], - 'ru': [u'перенаправление', u'перенапр'], - 'sa': [u'पुनर्निदेशन'], - 'sah': [u'перенаправление', u'перенапр'], - 'scn': [u'rinvia', u'rinvio'], - 'sd': [u'چوريو'], - 'sg': [u'redirection'], - 'shi': [u'تحويل'], - 'si': [u'යළියොමුව'], - 'sk': [u'presmeruj'], - 'sl': [u'preusmeritev'], - 'sli': [u'weiterleitung'], - 'sq': [u'ridrejto'], - 'sr': [u'преусмери', u'преусмери'], - 'sr-ec': [u'преусмери'], - 'sr-el': [u'preusmeri'], - 'srn': [u'stir', u'doorverwijzing'], - 'stq': [u'weiterleitung'], - 'su': [u'alih'], - 'sv': [u'omdirigering'], - 'szl': [u'patrz', u'przekieruj', u'tam'], - 'ta': [u'வழிமாற்று'], - 'te': [u'దారిమార్పు'], - 'th': [u'เปลี่ยนทาง'], - 'tr': [u'yönlendirme'], - 'tt': [u'yünältü'], - 'tt-latn': [u'yünältü'], - 'tt-cyrl': [u'перенаправление', u'перенапр'], - 'ty': [u'redirection'], - 'udm': [u'перенаправление', u'перенапр'], - 'uk': [u'перенаправлення', u'перенаправление', u'перенапр'], - 'vec': [u'rinvia', u'rinvio'], - 'vep': [u'suuna'], - 'vi': [u'đổi', u'đổi'], - 'vls': [u'doorverwijzing'], - 'vro': [u'saadaq', u'suuna'], - 'wa': [u'redirection'], - 'wo': [u'redirection'], - 'yi': [u'ווייטערפירן', u'הפניה'], - 'zea': [u'doorverwijzing'] - } - # So can be pagename code pagename = { 'bg': [u'СТРАНИЦА'],
Modified: trunk/pywikipedia/pywikibot/__init__.py =================================================================== --- trunk/pywikipedia/pywikibot/__init__.py 2010-02-08 15:37:02 UTC (rev 7919) +++ trunk/pywikipedia/pywikibot/__init__.py 2010-02-09 14:34:01 UTC (rev 7920) @@ -16,7 +16,9 @@
import wikipedia
+link_regex = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+ def showDiff(oldtext, newtext): """ Output a string showing the differences between oldtext and newtext.
Modified: trunk/pywikipedia/pywikibot/textlib.py =================================================================== --- trunk/pywikipedia/pywikibot/textlib.py 2010-02-08 15:37:02 UTC (rev 7919) +++ trunk/pywikipedia/pywikibot/textlib.py 2010-02-09 14:34:01 UTC (rev 7920) @@ -198,14 +198,19 @@ 'parts' parameter, which defaults to all. """ regexes = { - 'comments' : r'<!--.*?-->', - 'includeonly': r'<includeonly>.*?</includeonly>', - 'nowiki': r'<nowiki>.*?</nowiki>', - 'pre': r'<pre>.*?</pre>', - 'source': r'<source .*?</source>', + 'comments' : r'<!--.*?-->', + 'includeonly': r'<includeonly>.*?</includeonly>', + 'nowiki': r'<nowiki>.*?</nowiki>', + 'pre': r'<pre>.*?</pre>', + 'source': r'<source .*?</source>', + 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>', } if '*' in tags: tags = regexes.keys() + # add alias + tags = set(tags) + if 'source' in tags: + tags.add('syntaxhighlight') toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]), re.IGNORECASE | re.DOTALL) return toRemoveR.sub('', text) @@ -254,9 +259,9 @@ marker = text[firstinseparator:firstinmarker] + marker return marker
- +#------------------------------------------------- # Functions dealing with interwiki language links - +#------------------------------------------------- # Note - MediaWiki supports two kinds of interwiki links; interlanguage and # interproject. These functions only deal with links to a # corresponding page in another language on the same project (e.g., @@ -302,8 +307,8 @@ site = insite.getSite(code = lang) try: result[site] = pywikibot.Page(site, pagetitle, insite = insite) - except InvalidTitle: - output( + except pywikibot.InvalidTitle: + pywikibot.output( u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]." % (lang, pagetitle)) continue @@ -486,8 +491,9 @@ sites = insite.interwiki_putfirst_doubled(sites) + sites return sites
- +#--------------------------------------- # Functions dealing with category links +#---------------------------------------
def getCategoryLinks(text, site): import catlib @@ -665,6 +671,9 @@ #catLinks.sort() return sep.join(catLinks) + '\r\n'
+#--------------------------------------- +# Functions dealing with external links +#---------------------------------------
def compileLinkR(withoutBracketed=False, onlyBracketed=False): """Return a regex that matches external links.""" @@ -695,6 +704,9 @@ linkR = re.compile(regex) return linkR
+#---------------------------------- +# Functions dealing with templates +#----------------------------------
def extract_templates_and_params(text, get_redirect=False): """Return list of template calls found in text. @@ -805,7 +817,9 @@ result.append((name, params)) return result
+#---------------- # I18N functions +#----------------
# Languages to use for comment text after the actual language but before # en:. For example, if for language 'xx', you want the preference of
Modified: trunk/pywikipedia/solve_disambiguation.py =================================================================== --- trunk/pywikipedia/solve_disambiguation.py 2010-02-08 15:37:02 UTC (rev 7919) +++ trunk/pywikipedia/solve_disambiguation.py 2010-02-09 14:34:01 UTC (rev 7920) @@ -266,12 +266,12 @@ u'Benutzer:SrbBot.*', u'Benutzer:PortalBot/.+', u'Benutzer:Xqbot/.+', - u'Benutzer Diskussion:.+', u'Lehnwort', u'Liste griechischer Wortstämme in deutschen Fremdwörtern', u'Liste von Gräzismen', u'Portal:Abkürzungen/.+', u'Portal:Astronomie/Moves', + u'Portal:Astronomie/Index/.+', u'Wikipedia:Administratoren/Anfragen', u'Wikipedia:Archiv/.+', u'Wikipedia:Artikelwünsche/Ding-Liste/[A-Z]',
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2010-02-08 15:37:02 UTC (rev 7919) +++ trunk/pywikipedia/wikipedia.py 2010-02-09 14:34:01 UTC (rev 7920) @@ -7437,10 +7437,12 @@ special redirect tag.
""" - if default: - return self.family.redirect.get(self.lang, [u"REDIRECT"])[0] - else: - return self.family.redirect.get(self.lang, None) + tag = self.siteinfo('magicwords').get('redirect')[0][1:] + if tag: + # remove first "#" letter + return tag[0][1:] + elif default: + return u'REDIRECT'
def redirectRegex(self): """Return a compiled regular expression matching on redirect pages. @@ -7448,24 +7450,23 @@ Group 1 in the regex match object will be the target title.
""" - + #NOTE: this is needed, since the API can give false positives! + default = 'REDIRECT' try: - redirKeywords = [u'redirect'] + self.family.redirect[self.lang] - redirKeywordsR = r'(?:' + '|'.join(redirKeywords) + ')' + keywords = self.siteinfo('magicwords')['redirect'] + pattern = r'(?:' + '|'.join(keywords) + ')' except KeyError: # no localized keyword for redirects - redirKeywordsR = r'redirect' - - # A redirect starts with hash (#), followed by a keyword, then - # arbitrary stuff, then a wikilink. The wikilink may contain - # a label, although this is not useful. - + pattern = r'#%s' % default if self.versionnumber() > 12: # in MW 1.13 (at least) a redirect directive can follow whitespace prefix = r'\s*' else: prefix = r'[\r\n]*' - return re.compile(prefix + '#' + redirKeywordsR + # A redirect starts with hash (#), followed by a keyword, then + # arbitrary stuff, then a wikilink. The wikilink may contain + # a label, although this is not useful. + return re.compile(prefix + pattern + '\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]', re.IGNORECASE | re.UNICODE | re.DOTALL)
pywikipedia-svn@lists.wikimedia.org