SVN: [4464] trunk/pywikipedia/fixes.py - Pywikipedia-l

17 Oct 2007

Revision: 4464
Author:   siebrand
Date:     2007-10-17 20:04:39 +0000 (Wed, 17 Oct 2007)

Log Message:
-----------
* Add fixes for Arabic wikis
* Fix indentation

Modified Paths:
--------------
    trunk/pywikipedia/fixes.py

Modified: trunk/pywikipedia/fixes.py
===================================================================

--- trunk/pywikipedia/fixes.py	2007-10-17 19:27:12 UTC (rev 4463)
+++ trunk/pywikipedia/fixes.py	2007-10-17 20:04:39 UTC (rev 4464)
@@ -16,328 +16,412 @@
 """
 
 fixes = {
-    # These replacements will convert HTML to wiki syntax where possible, and
-    # make remaining tags XHTML compliant.
-    'HTML': {
-        'regex': True,
-        'msg': {
-               'en':u'Robot: converting/fixing HTML',
-               'de':u'Bot: konvertiere/korrigiere HTML',
-               'fr':u'Robot: convertit/fixe HTML',
-               'he':u'רובוט: ממיר/מתקן HTML',
-               'ia':u'Robot: conversion/reparation de HTML',
-               'lt':u'robotas: konvertuojamas/taisomas HTML',
-               'nl':u'Bot: conversie/reparatie HTML',
-               'pl':u'Robot konwertuje/naprawia HTML',
-               'pt':u'Bot: Corrigindo HTML',
-               'sr':u'Бот: Поправка HTML-а'
-              },
-        'replacements': [
-            # Everything case-insensitive (?i)
-            # Keep in mind that MediaWiki automatically converts <br> to <br
/>
-            # when rendering pages, so you might comment the next two lines out
-            # to save some time/edits.
-            #r'(?i)<br>':                      r'<br />',
-            # linebreak with attributes
-            #r'(?i)<br ([^>/]+?)>':            r'<br \1
/>',
-            (r'(?i)<b>(.*?)</b>',             
r"'''\1'''"),
-            (r'(?i)<strong>(.*?)</strong>',   
r"'''\1'''"),
-            (r'(?i)<i>(.*?)</i>',             
r"''\1''"),
-            (r'(?i)<em>(.*?)</em>',           
r"''\1''"),
-            # horizontal line without attributes in a single line
-            (r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2'),
-            # horizontal line without attributes with more text in the same line
-            #(r'(?i) +<hr[ /]*> +',             r'\r\n----\r\n'),
-            # horizontal line with attributes; can't be done with wiki syntax
-            # so we only make it XHTML compliant
-            (r'(?i)<hr ([^>/]+?)>',            r'<hr \1
/>'),
-            # a header where only spaces are in the same line
-            (r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])', 
r"\1= \2 =\3"),
-            (r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])', 
r"\1== \2 ==\3"),
-            (r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])', 
r"\1=== \2 ===\3"),
-            (r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])', 
r"\1==== \2 ====\3"),
-            (r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])', 
r"\1===== \2 =====\3"),
-            (r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])', 
r"\1====== \2 ======\3"),
-            # TODO: maybe we can make the bot replace <p> tags with \r\n's.
-        ],
-        'exceptions': {
-            'inside-tags': [
-                'nowiki',
-                'comment',
-                'math',
-                'pre'
-            ],
-        }
-    },
-    # Grammar fixes for German language
-    'grammar-de': {
-        'regex': True,
-        'msg': {
-               'de':u'Bot: korrigiere Grammatik',
-              },
-        'replacements': [
-            #(u'([Ss]owohl) ([^,\.]+?), als auch',                               
                            r'\1 \2 als auch'),
-            #(u'([Ww]eder) ([^,\.]+?), noch', r'\1 \2 noch'),
-            # 
-            # Vorsicht bei Substantiven, z. B. 3-Jähriger!
-           
(u'(\d+)(minütig|stündig|tägig|wöchig|jährig|minütlich|stündlich|täglich|wöchentlich|jährlich|fach|mal|malig|köpfig|teilig|gliedrig|geteilt|elementig|dimensional|bändig|eckig|farbig|stimmig)',
r'\1-\2'),
-            # zusammengesetztes Wort, Bindestrich wird durchgeschleift
-           
(u'(?<!\w)(\d+|\d+[\.,]\d+)(\$|€|DM|£|¥|mg|g|kg|ml|cl|l|t|ms|min|µm|mm|cm|dm|m|km|°C|kB|MB|TB|W|kW|MW|PS|Nm|eV|J|kcal|mA|mV|kV|Ω|Hz|kHz|MHz|GHz|mol|Pa|Bq|Sv|mSv)([²³]?-[\w\[])',
          r'\1-\2\3'),
-            # Größenangabe ohne Leerzeichen vor Einheit
-            # weggelassen wegen vieler falsch Positiver: s, A, V, C, S, %
-           
(u'(?<!\w)(\d+|\d+[\.,]\d+)(\$|€|DM|£|¥|mg|g|kg|ml|cl|l|t|ms|min|µm|mm|cm|dm|m|km|°C|kB|MB|TB|W|kW|MW|PS|Nm|eV|J|kcal|mA|mV|kV|Ω|Hz|kHz|MHz|GHz|mol|Pa|Bq|Sv|mSv)(?=\W|²|³|$)',
         r'\1 \2'),
-            # Kein Leerzeichen zwischen Tag und Monat
-           
(u'(\d+)\.(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)',
r'\1. \2'),
-            # Keine führende Null beim Datum
-            #(u'0(\d+)\.
(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)',
r'\1. \2'),
-            # Kein Leerzeichen nach Komma
-            (u'([a-z](\]\])?,)((\[\[)?[a-zA-Z])',                                
                                         r'\1 \3'),
-            # Leerzeichen und Komma vertauscht
-            (u'([a-z](\]\])?) ,((\[\[)?[a-zA-Z])',                               
                                          r'\1, \3'),
-            #(u'([a-z]\.)([A-Z])',                                               
                             r'\1 \2'),
-        ],
-        'exceptions': {
-            'inside-tags': [
-                'nowiki',
-                'comment',
-                'math',
-                'pre',           # because of code examples
-                'startspace',    # because of code examples
-                'hyperlink',     # e.g. commas in URLs
-                'gallery',       # because of filenames
-            ],
-            'text-contains': [
-                r'sic!',
-            ],
-            'inside': [
-                r'Ju 52/3m', # Flugzeugbezeichnung
-                r'AH-1W',    # Hubschrauberbezeichnung
-                r'\d+h \d+m', # Schreibweise für Zeiten, vor allem in
Film-Infoboxen. Nicht korrekt, aber dafür schön kurz.
-                r'(?i)\[\[(Bild|Image|Media):.+?\|', # Dateinamen auslassen
-            ],
-            'title': [
-                r'Arsen',  # chemische Formel
-            ],
-        }
-    },
-    # Do NOT run this automatically!
-    # Recommendation: First run syntax-safe automatically, afterwards
-    # run syntax manually, carefully checking that you're not breaking
-    # anything.
-    'syntax': {
-        'regex': True,
-        'msg': {
-               'de':u'Bot: Korrigiere Wiki-Syntax',
-               'en':u'Bot: Fixing wiki syntax',
-               'fr':u'Bot: Corrige wiki-syntaxe',
-               'he':u'בוט: מתקן תחביר ויקי',
-               'ia':u'Robot: Reparation de syntaxe wiki',
-               'lt':u'robotas: Taisoma wiki sintaksė',
-               'nl':u'Bot: reparatie wikisyntaxis',
-               'pl':u'Robot poprawia wiki-składnię',
-               'pt':u'Bot: Corrigindo sintaxe wiki',
-               'sr':u'Бот: Поправка вики синтаксе',
-              },
-        'replacements': [
-            # external link in double brackets
-            (r'\[\[(?P<url>https?://[^\]]+?)\]\]',  
r'[\g<url>]'),
-            # external link starting with double bracket
-            (r'\[\[(?P<url>https?://.+?)\]',  
r'[\g<url>]'),
-            # external link with forgotten closing bracket
-            #(r'\[(?P<url>https?://[^\]\s]+)\r\n', 
r'[\g<url>]\r\n'),
-            # external link ending with double bracket.
-            # do not change weblinks that contain wiki links inside
-            # inside the description
-            (r'\[(?P<url>https?://[^\[\]]+?)\]\](?!\])',  
r'[\g<url>]'),
-            # external link and description separated by a dash.
-            # ATTENTION: while this is a mistake in most cases, there are some
-            # valid URLs that contain dashes!
-            (r'\[(?P<url>https?://[^\|\]\s]+?) *\|
*(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
-            # wiki link closed by single bracket.
-            # ATTENTION: There are some false positives, for example
-            # Brainfuck code examples or MS-DOS parameter instructions.
-            # There are also sometimes better ways to fix it than
-            # just putting an additional ] after the link.
-            (r'\[\[([^\[\]]+?)\](?!\])',  r'[[\1]]'),
-            # wiki link opened by single bracket.
-            # ATTENTION: same as above.
-            (r'(?<!\[)\[([^\[\]]+?)\]\](?!\])',  r'[[\1]]'),
-            # template closed by single bracket
-            # ATTENTION: There are some false positives, especially in
-            # mathematical context or program code.
-            (r'{{([^{}]+?)}(?!})',       r'{{\1}}'),
-        ],
-        'exceptions': {
-            'inside-tags': [
-                'nowiki',
-                'comment',
-                'math',
-                'pre',
-            ],
-            'text-contains': [
-                r'http://.*?object=tx\|',               # regular dash in URL
-                r'http://.*?allmusic\.com',             # regular dash in URL
-                r'http://.*?allmovie\.com',             # regular dash in URL
-                r'http://physics.nist.gov/',            # regular dash in URL
-                r'http://www.forum-seniorenarbeit.de/', # regular dash in URL
-                r'http://kuenstlerdatenbank.ifa.de/',   # regular dash in URL
-                r'&object=med',                         # regular dash in
URL
-                r'\[CDATA\['                            # lots of brackets
-            ],
-        }
-    },
-    # The same as syntax, but restricted to replacements that should
-    # be safe to run automatically.
-    'syntax-safe': {
-        'regex': True,
-        'msg': {
-               'de':u'Bot: Korrigiere Wiki-Syntax',
-               'en':u'Bot: Fixing wiki syntax',
-               'fr':u'Bot: Corrige wiki-syntaxe',
-               'he':u'בוט: מתקן תחביר ויקי',
-               'ia':u'Robot: Reparation de syntaxe wiki',
-               'lt':u'robotas: Taisoma wiki sintaksė',
-               'nl':u'Bot: reparatie wikisyntaxis',
-               'pl':u'Robot poprawia wiki-składnię',
-               'pt':u'Bot: Corrigindo sintaxe wiki',
-               'sr':u'Бот: Поправка вики синтаксе',
-              },
-        'replacements': [
-            # external link in double brackets
-            (r'\[\[(?P<url>https?://[^\]]+?)\]\]',  
r'[\g<url>]'),
-            # external link starting with double bracket
-            (r'\[\[(?P<url>https?://.+?)\]',  
r'[\g<url>]'),
-            # external link with forgotten closing bracket
-            #(r'\[(?P<url>https?://[^\]\s]+)\r\n',  
r'[\g<url>]\r\n'),
-             # external link and description separated by a dash, with
-             # whitespace in front of the dash, so that it is clear that
-             # the dash is not a legitimate part of the URL.
-            (r'\[(?P<url>https?://[^\|\] \r\n]+?) +\|
*(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
-            # dash in external link, where the correct end of the URL can
-            # be detected from the file extension. It is very unlikely that
-            # this will cause mistakes.
-            (r'\[(?P<url>https?://[^\|\]
]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]'),
-        ],
-    },
-    'case-de': { # German upper / lower case issues
-        'regex': True,
-        'msg': {
-               'de':u'Bot: Korrigiere Groß-/Kleinschreibung',
-              },
-        'replacements': [
-            (r'\batlantische(r|n|) Ozean', r'Atlantische\1 Ozean'),
-            (r'\bdeutsche(r|n|) Bundestag\b', r'Deutsche\1 Bundestag'),
-            (r'\bdeutschen Bundestags\b', r'Deutschen Bundestags'), #
Aufpassen, z. B. 'deutsche Bundestagswahl'
-            (r'\bdeutsche(r|n|) Reich\b', r'Deutsche\1 Reich'),
-            (r'\bdeutschen Reichs\b', r'Deutschen Reichs'), # Aufpassen,
z. B. 'deutsche Reichsgrenzen'
-            (r'\bdritte(n|) Welt(?!krieg)', r'Dritte\1 Welt'),
-            (r'\bdreißigjährige(r|n|) Krieg', r'Dreißigjährige\1
Krieg'),
-            (r'\beuropäische(n|) Gemeinschaft', r'Europäische\1
Gemeinschaft'),
-            (r'\beuropäische(n|) Kommission', r'Europäische\1
Kommission'),
-            (r'\beuropäische(n|) Parlament', r'Europäische\1
Parlament'),
-            (r'\beuropäische(n|) Union', r'Europäische\1 Union'),
-            (r'\berste(r|n|) Weltkrieg', r'Erste\1 Weltkrieg'),
-            (r'\bkalte(r|n|) Krieg', r'Kalte\1 Krieg'),
-            (r'\bpazifische(r|n|) Ozean', r'Pazifische\1 Ozean'),
-            (r'Tag der deutschen Einheit', r'Tag der Deutschen
Einheit'),
-            (r'\bzweite(r|n|) Weltkrieg', r'Zweite\1 Weltkrieg'),
-        ],
-        'exceptions': {
-            'inside-tags': [
-                'nowiki',
-                'comment',
-                'math',
-                'pre',
-            ],
-            'text-contains': [
-                r'sic!',
-            ],
-        }
-    },
-    'vonbis': {
-        'regex': True,
-        'msg': {
-            'de':u'Bot: Ersetze Binde-/Gedankenstrich durch
"bis"',
-        },
-        'replacements': [
-            # Bindestrich, Gedankenstrich, Geviertstrich
-            (u'(von \d{3,4}) *(-|&ndash;|–|&mdash;|—) *(\d{3,4})',
r'\1 bis \3'),
-        ],
-    },
-    # some disambiguation stuff for de:
-    # python replace.py -fix:music -subcat:Album
-    'music': {
-        'regex': False,
-        'msg': {
-            'de':u'Bot: korrigiere Links auf Begriffsklärungen',
-        },
-        'replacements': [
-            (u'[[CD]]', u'[[Audio-CD|CD]]'),
-            (u'[[LP]]', u'[[Langspielplatte|LP]]'),
-            (u'[[EP]]', u'[[Extended Play|EP]]'),
-            (u'[[MC]]', u'[[Musikkassette|MC]]'),
-            (u'[[Single]]', u'[[Single (Musik)|Single]]'),
-        ],
-        'exceptions': {
-            'inside-tags': [
-                'hyperlink',
-            ]
-        }
+	# These replacements will convert HTML to wiki syntax where possible, and
+	# make remaining tags XHTML compliant.
+	'HTML': {
+		'regex': True,
+		'msg': {
+			'en':u'Robot: converting/fixing HTML',
+			'de':u'Bot: konvertiere/korrigiere HTML',
+			'fr':u'Robot: convertit/fixe HTML',
+			'he':u'רובוט: ממיר/מתקן HTML',
+			'ia':u'Robot: conversion/reparation de HTML',
+			'lt':u'robotas: konvertuojamas/taisomas HTML',
+			'nl':u'Bot: conversie/reparatie HTML',
+			'pl':u'Robot konwertuje/naprawia HTML',
+			'pt':u'Bot: Corrigindo HTML',
+			'sr':u'Бот: Поправка HTML-а'
+		},
+		'replacements': [
+			# Everything case-insensitive (?i)
+			# Keep in mind that MediaWiki automatically converts <br> to <br />
+			# when rendering pages, so you might comment the next two lines out
+			# to save some time/edits.
+			#r'(?i)<br>':                      r'<br />',
+			# linebreak with attributes
+			#r'(?i)<br ([^>/]+?)>':            r'<br \1 />',
+			(r'(?i)<b>(.*?)</b>',             
r"'''\1'''"),
+			(r'(?i)<strong>(.*?)</strong>',   
r"'''\1'''"),
+			(r'(?i)<i>(.*?)</i>',             
r"''\1''"),
+			(r'(?i)<em>(.*?)</em>',           
r"''\1''"),
+			# horizontal line without attributes in a single line
+			(r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2'),
+			# horizontal line without attributes with more text in the same line
+			#(r'(?i) +<hr[ /]*> +',             r'\r\n----\r\n'),
+			# horizontal line with attributes; can't be done with wiki syntax
+			# so we only make it XHTML compliant
+			(r'(?i)<hr ([^>/]+?)>',            r'<hr \1 />'),
+			# a header where only spaces are in the same line
+			(r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])',  r"\1= \2 =\3"),
+			(r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])',  r"\1== \2 ==\3"),
+			(r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])', 
r"\1=== \2 ===\3"),
+			(r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])', 
r"\1==== \2 ====\3"),
+			(r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])', 
r"\1===== \2 =====\3"),
+			(r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])', 
r"\1====== \2 ======\3"),
+			# TODO: maybe we can make the bot replace <p> tags with \r\n's.
+		],
+		'exceptions': {
+			'inside-tags': [
+				'nowiki',
+				'comment',
+				'math',
+				'pre'
+			],
+		}
+	},
 
-    },
-    # format of dates of birth and death, for de:
-    # python replace.py -fix:datum -ref:Vorlage:Personendaten
-    'datum': {
-        'regex': True,
-        'msg': {
-            'de': u'Bot: Korrigiere Datumsformat',
-        },
-        'replacements': [
-            # space after birth sign w/ year
-            #(u'\(\*(\d{3,4})', u'(* \\1'),
-            ## space after death sign w/ year
-            #(u'†(\d{3,4})', u'† \\1'),
-            #(u'&dagger;(\d{3,4})', u'† \\1'),
-            ## space after birth sign w/ linked date
-            #(u'\(\*\[\[(\d)', u'(* [[\\1'),
-            ## space after death sign w/ linked date
-            #(u'†\[\[(\d)', u'† [[\\1'),
-            #(u'&dagger;\[\[(\d)', u'† [[\\1'),
-            (u'\[\[(\d+\.
(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))
(\d{1,4})\]\]', u'[[\\1]] [[\\2]]'),
-        ],
-        'exceptions': {
-            'inside': [
-                r'[[20. Juli 1944]]',
-                r'[[17. Juni 1953]]',
-                r'[[11. September 2001]]',
-            ],
-        }
-    },
-    'isbn': {
-        'regex': True,
-        'msg': {
-            'de': u'Bot: Korrigiere ISBN-Format',
-            'en': u'Robot: Fixing ISBN format',
-            'es': u'Arreglando formato ISBN'
-        },
-        'replacements': [
-             # colon
-            (r'ISBN: (\d+)', r'ISBN \1'),
-            # Spaces, dashes, or dots instead of hyphens as separators,
-            # or spaces between digits and separators.
-            # Note that these regular expressions also match valid ISBNs, but
-            # these won't be changed.
-            (ur'ISBN (978|979) *[\- –\.] *(\d+) *[\- –\.] *(\d+) *[\- –\.] *(\d+)
*[\- –\.] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN13
-            (r'ISBN (\d+) *[\- –\.] *(\d+) *[\- –\.] *(\d+) *[\- –\.]
*(\d|X|x)(?!\d)', r'ISBN \1-\2-\3-\4'), # ISBN10
-        ],
-        'exceptions': {
-            'inside-tags': [
-                'comment',
-            ],
-            'inside': [
-                r'ISBN (\d(-?)){12}\d',    # matches valid ISBN-13s
-                r'ISBN (\d(-?)){9}[\dXx]', # matches valid ISBN-10s
-            ],
-        }
-    },
+	# Grammar fixes for German language
+	'grammar-de': {
+		'regex': True,
+		'msg': {
+			'de':u'Bot: korrigiere Grammatik',
+		},
+		'replacements': [
+			#(u'([Ss]owohl) ([^,\.]+?), als auch',                                        
                   r'\1 \2 als auch'),
+			#(u'([Ww]eder) ([^,\.]+?), noch', r'\1 \2 noch'),
+			#
+			# Vorsicht bei Substantiven, z. B. 3-Jähriger!
+			(u'(\d+)(minütig|stündig|tägig|wöchig|jährig|minütlich|stündlich|täglich|wöchentlich|jährlich|fach|mal|malig|köpfig|teilig|gliedrig|geteilt|elementig|dimensional|bändig|eckig|farbig|stimmig)',
r'\1-\2'),
+			# zusammengesetztes Wort, Bindestrich wird durchgeschleift
+			(u'(?<!\w)(\d+|\d+[\.,]\d+)(\$|€|DM|£|¥|mg|g|kg|ml|cl|l|t|ms|min|µm|mm|cm|dm|m|km|°C|kB|MB|TB|W|kW|MW|PS|Nm|eV|J|kcal|mA|mV|kV|Ω|Hz|kHz|MHz|GHz|mol|Pa|Bq|Sv|mSv)([²³]?-[\w\[])',
          r'\1-\2\3'),
+			# Größenangabe ohne Leerzeichen vor Einheit
+			# weggelassen wegen vieler falsch Positiver: s, A, V, C, S, %
+			(u'(?<!\w)(\d+|\d+[\.,]\d+)(\$|€|DM|£|¥|mg|g|kg|ml|cl|l|t|ms|min|µm|mm|cm|dm|m|km|°C|kB|MB|TB|W|kW|MW|PS|Nm|eV|J|kcal|mA|mV|kV|Ω|Hz|kHz|MHz|GHz|mol|Pa|Bq|Sv|mSv)(?=\W|²|³|$)',
         r'\1 \2'),
+			# Kein Leerzeichen zwischen Tag und Monat
+			(u'(\d+)\.(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)',
r'\1. \2'),
+			# Keine führende Null beim Datum
+			#(u'0(\d+)\.
(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)',
r'\1. \2'),
+			# Kein Leerzeichen nach Komma
+			(u'([a-z](\]\])?,)((\[\[)?[a-zA-Z])',                                         
                                r'\1 \3'),
+			# Leerzeichen und Komma vertauscht
+			(u'([a-z](\]\])?) ,((\[\[)?[a-zA-Z])',                                        
                                 r'\1, \3'),
+			#(u'([a-z]\.)([A-Z])',                                                        
                    r'\1 \2'),
+		],
+		'exceptions': {
+			'inside-tags': [
+				'nowiki',
+				'comment',
+				'math',
+				'pre',           # because of code examples
+				'startspace',    # because of code examples
+				'hyperlink',     # e.g. commas in URLs
+				'gallery',       # because of filenames
+			],
+			'text-contains': [
+				r'sic!',
+			],
+			'inside': [
+				r'Ju 52/3m', # Flugzeugbezeichnung
+				r'AH-1W',    # Hubschrauberbezeichnung
+				r'\d+h \d+m', # Schreibweise für Zeiten, vor allem in Film-Infoboxen. Nicht korrekt, aber dafür schön kurz.
+				r'(?i)\[\[(Bild|Image|Media):.+?\|', # Dateinamen auslassen
+			],
+			'title': [
+				r'Arsen',  # chemische Formel
+			],
+		}
+	},
+
+	# Do NOT run this automatically!
+	# Recommendation: First run syntax-safe automatically, afterwards
+	# run syntax manually, carefully checking that you're not breaking
+	# anything.
+	'syntax': {
+		'regex': True,
+		'msg': {
+			'de':u'Bot: Korrigiere Wiki-Syntax',
+			'en':u'Bot: Fixing wiki syntax',
+			'fr':u'Bot: Corrige wiki-syntaxe',
+			'he':u'בוט: מתקן תחביר ויקי',
+			'ia':u'Robot: Reparation de syntaxe wiki',
+			'lt':u'robotas: Taisoma wiki sintaksė',
+			'nl':u'Bot: reparatie wikisyntaxis',
+			'pl':u'Robot poprawia wiki-składnię',
+			'pt':u'Bot: Corrigindo sintaxe wiki',
+			'sr':u'Бот: Поправка вики синтаксе',
+		},
+		'replacements': [
+			# external link in double brackets
+			(r'\[\[(?P<url>https?://[^\]]+?)\]\]',   r'[\g<url>]'),
+			# external link starting with double bracket
+			(r'\[\[(?P<url>https?://.+?)\]',   r'[\g<url>]'),
+			# external link with forgotten closing bracket
+			#(r'\[(?P<url>https?://[^\]\s]+)\r\n', 
r'[\g<url>]\r\n'),
+			# external link ending with double bracket.
+			# do not change weblinks that contain wiki links inside
+			# inside the description
+			(r'\[(?P<url>https?://[^\[\]]+?)\]\](?!\])',  
r'[\g<url>]'),
+			# external link and description separated by a dash.
+			# ATTENTION: while this is a mistake in most cases, there are some
+			# valid URLs that contain dashes!
+			(r'\[(?P<url>https?://[^\|\]\s]+?) *\| *(?P<label>[^\|\]]+?)\]',
r'[\g<url> \g<label>]'),
+			# wiki link closed by single bracket.
+			# ATTENTION: There are some false positives, for example
+			# Brainfuck code examples or MS-DOS parameter instructions.
+			# There are also sometimes better ways to fix it than
+			# just putting an additional ] after the link.
+			(r'\[\[([^\[\]]+?)\](?!\])',  r'[[\1]]'),
+			# wiki link opened by single bracket.
+			# ATTENTION: same as above.
+			(r'(?<!\[)\[([^\[\]]+?)\]\](?!\])',  r'[[\1]]'),
+			# template closed by single bracket
+			# ATTENTION: There are some false positives, especially in
+			# mathematical context or program code.
+			(r'{{([^{}]+?)}(?!})',       r'{{\1}}'),
+		],
+		'exceptions': {
+			'inside-tags': [
+				'nowiki',
+				'comment',
+				'math',
+				'pre',
+			],
+			'text-contains': [
+				r'http://.*?object=tx\|',               # regular dash in URL
+				r'http://.*?allmusic\.com',             # regular dash in URL
+				r'http://.*?allmovie\.com',             # regular dash in URL
+				r'http://physics.nist.gov/',            # regular dash in URL
+				r'http://www.forum-seniorenarbeit.de/', # regular dash in URL
+				r'http://kuenstlerdatenbank.ifa.de/',   # regular dash in URL
+				r'&object=med',                         # regular dash in URL
+				r'\[CDATA\['                            # lots of brackets
+			],
+		}
+	},
+
+	# The same as syntax, but restricted to replacements that should
+	# be safe to run automatically.
+	'syntax-safe': {
+		'regex': True,
+		'msg': {
+			'de':u'Bot: Korrigiere Wiki-Syntax',
+			'en':u'Bot: Fixing wiki syntax',
+			'fr':u'Bot: Corrige wiki-syntaxe',
+			'he':u'בוט: מתקן תחביר ויקי',
+			'ia':u'Robot: Reparation de syntaxe wiki',
+			'lt':u'robotas: Taisoma wiki sintaksė',
+			'nl':u'Bot: reparatie wikisyntaxis',
+			'pl':u'Robot poprawia wiki-składnię',
+			'pt':u'Bot: Corrigindo sintaxe wiki',
+			'sr':u'Бот: Поправка вики синтаксе',
+		},
+		'replacements': [
+			# external link in double brackets
+			(r'\[\[(?P<url>https?://[^\]]+?)\]\]',   r'[\g<url>]'),
+			# external link starting with double bracket
+			(r'\[\[(?P<url>https?://.+?)\]',   r'[\g<url>]'),
+			# external link with forgotten closing bracket
+			#(r'\[(?P<url>https?://[^\]\s]+)\r\n',  
r'[\g<url>]\r\n'),
+			# external link and description separated by a dash, with
+			# whitespace in front of the dash, so that it is clear that
+			# the dash is not a legitimate part of the URL.
+			(r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
+			# dash in external link, where the correct end of the URL can
+			# be detected from the file extension. It is very unlikely that
+			# this will cause mistakes.
+			(r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]'),
+		],
+	},
+
+	'case-de': { # German upper / lower case issues
+		'regex': True,
+		'msg': {
+			'de':u'Bot: Korrigiere Groß-/Kleinschreibung',
+		},
+		'replacements': [
+			(r'\batlantische(r|n|) Ozean', r'Atlantische\1 Ozean'),
+			(r'\bdeutsche(r|n|) Bundestag\b', r'Deutsche\1 Bundestag'),
+			(r'\bdeutschen Bundestags\b', r'Deutschen Bundestags'), # Aufpassen, z. B. 'deutsche Bundestagswahl'
+			(r'\bdeutsche(r|n|) Reich\b', r'Deutsche\1 Reich'),
+			(r'\bdeutschen Reichs\b', r'Deutschen Reichs'), # Aufpassen, z. B. 'deutsche Reichsgrenzen'
+			(r'\bdritte(n|) Welt(?!krieg)', r'Dritte\1 Welt'),
+			(r'\bdreißigjährige(r|n|) Krieg', r'Dreißigjährige\1 Krieg'),
+			(r'\beuropäische(n|) Gemeinschaft', r'Europäische\1 Gemeinschaft'),
+			(r'\beuropäische(n|) Kommission', r'Europäische\1 Kommission'),
+			(r'\beuropäische(n|) Parlament', r'Europäische\1 Parlament'),
+			(r'\beuropäische(n|) Union', r'Europäische\1 Union'),
+			(r'\berste(r|n|) Weltkrieg', r'Erste\1 Weltkrieg'),
+			(r'\bkalte(r|n|) Krieg', r'Kalte\1 Krieg'),
+			(r'\bpazifische(r|n|) Ozean', r'Pazifische\1 Ozean'),
+			(r'Tag der deutschen Einheit', r'Tag der Deutschen Einheit'),
+			(r'\bzweite(r|n|) Weltkrieg', r'Zweite\1 Weltkrieg'),
+		],
+		'exceptions': {
+			'inside-tags': [
+				'nowiki',
+				'comment',
+				'math',
+				'pre',
+			],
+			'text-contains': [
+				r'sic!',
+			],
+		}
+	},
+
+	'vonbis': {
+		'regex': True,
+		'msg': {
+			'de':u'Bot: Ersetze Binde-/Gedankenstrich durch "bis"',
+		},
+		'replacements': [
+			# Bindestrich, Gedankenstrich, Geviertstrich
+			(u'(von \d{3,4}) *(-|&ndash;|–|&mdash;|—) *(\d{3,4})', r'\1 bis \3'),
+		],
+	},
+
+	# some disambiguation stuff for de:
+	# python replace.py -fix:music -subcat:Album
+	'music': {
+		'regex': False,
+		'msg': {
+			'de':u'Bot: korrigiere Links auf Begriffsklärungen',
+		},
+		'replacements': [
+			(u'[[CD]]', u'[[Audio-CD|CD]]'),
+			(u'[[LP]]', u'[[Langspielplatte|LP]]'),
+			(u'[[EP]]', u'[[Extended Play|EP]]'),
+			(u'[[MC]]', u'[[Musikkassette|MC]]'),
+			(u'[[Single]]', u'[[Single (Musik)|Single]]'),
+		],
+		'exceptions': {
+			'inside-tags': [
+				'hyperlink',
+			]
+		}
+	},
+
+	# format of dates of birth and death, for de:
+	# python replace.py -fix:datum -ref:Vorlage:Personendaten
+	'datum': {
+		'regex': True,
+		'msg': {
+			'de': u'Bot: Korrigiere Datumsformat',
+		},
+		'replacements': [
+			# space after birth sign w/ year
+			#(u'\(\*(\d{3,4})', u'(* \\1'),
+			## space after death sign w/ year
+			#(u'†(\d{3,4})', u'† \\1'),
+			#(u'&dagger;(\d{3,4})', u'† \\1'),
+			## space after birth sign w/ linked date
+			#(u'\(\*\[\[(\d)', u'(* [[\\1'),
+			## space after death sign w/ linked date
+			#(u'†\[\[(\d)', u'† [[\\1'),
+			#(u'&dagger;\[\[(\d)', u'† [[\\1'),
+			(u'\[\[(\d+\. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)) (\d{1,4})\]\]', u'[[\\1]] [[\\2]]'),
+		],
+		'exceptions': {
+			'inside': [
+				r'[[20. Juli 1944]]',
+				r'[[17. Juni 1953]]',
+				r'[[11. September 2001]]',
+			],
+		}
+	},
+
+	'isbn': {
+		'regex': True,
+		'msg': {
+			'de': u'Bot: Korrigiere ISBN-Format',
+			'en': u'Robot: Fixing ISBN format',
+			'es': u'Arreglando formato ISBN'
+		},
+		'replacements': [
+			# colon
+			(r'ISBN: (\d+)', r'ISBN \1'),
+			# Spaces, dashes, or dots instead of hyphens as separators,
+			# or spaces between digits and separators.
+			# Note that these regular expressions also match valid ISBNs, but
+			# these won't be changed.
+			(ur'ISBN (978|979) *[\- –\.] *(\d+) *[\- –\.] *(\d+) *[\- –\.] *(\d+) *[\- –\.] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN13
+			(r'ISBN (\d+) *[\- –\.] *(\d+) *[\- –\.] *(\d+) *[\- –\.] *(\d|X|x)(?!\d)', r'ISBN \1-\2-\3-\4'), # ISBN10
+		],
+		'exceptions': {
+			'inside-tags': [
+				'comment',
+			],
+			'inside': [
+				r'ISBN (\d(-?)){12}\d',    # matches valid ISBN-13s
+				r'ISBN (\d(-?)){9}[\dXx]', # matches valid ISBN-10s
+			],
+		}
+	},
+
+	#Corrections for Arabic Wikipedia
+	#And any Arabic wiki.
+	#python replace.py -always -start:! -fix:correct-ar
+	'correct-ar': {
+		'regex': False,
+		'msg': {
+			'ar':u'تدقيق إملائي. 64 كلمة مستهدفة حالياً.',
+		},
+		'replacements': [
+			(u' ,', u' ،'),
+			(u' إمرأة ', u' امرأة '),
+			(u' الى ', u' إلى '),
+			(u' إسم ', u' اسم '),
+			(u' الأن ', u' الآن '),
+			(u' اول ', u' أول '),
+			(u' الة ', u' آلة '),
+			(u' فى ', u' في '),
+			(u' اثقل ', u' أثقل '),
+			(u' إبن ', u' ابن '),
+			(u' إبنة ', u' ابنة '),
+			(u' إقتصاد ', u' اقتصاد '),
+			(u' إجتماع ', u' اجتماع '),
+			(u' انجيل ', u' إنجيل '),
+			(u' اجماع ', u' إجماع '),
+			(u' امريكا ', u' أمريكا '),
+			(u' اوروبا ', u' أوروبا '),
+			(u' انجلترا ', u' إنجلترا '),
+			(u' اكتوبر ', u' أكتوبر '),
+			(u' اسرائيل ', u' إسرائيل '),
+			(u' المانيا ', u' ألمانيا '),
+			(u' ايطاليا ', u' إيطاليا '),
+			(u' ايران ', u' إيران '),
+			(u' إستخراج ', u' استخراج '),
+			(u' إستعمال ', u' استعمال '),
+			(u' إستبدال ', u' استبدال '),
+			(u' إشتراك ', u' اشتراك '),
+			(u' إستعادة ', u' استعادة '),
+			(u' إستقلال ', u' استقلال '),
+			(u' إنتقال ', u' انتقال '),
+			(u' إتحاد ', u' اتحاد '),
+			(u' املاء ', u' إملاء '),
+			(u' إستخدام ', u' استخدام '),
+			(u' أحدى ', u' إحدى '),
+			(u' لاكن ', u' لكن '),
+			(u' الاردن ', u' الأردن '),
+			(u' إثنان ', u' اثنان '),
+			(u' شيئ ', u' شيء '),
+			(u' إحتياط ', u' احتياط '),
+			(u' إقتباس ', u' اقتباس '),
+			(u' الامارات ', u' الإمارات '),
+			(u' اكثر ', u' أكثر '),
+			(u' افضل ', u' أفضل '),
+			(u' اكبر ', u' أكبر '),
+			(u' اشهر ', u' أشهر '),
+			(u' ادارة ', u' إدارة '),
+			(u' ابناء ', u' أبناء '),
+			(u' الانصار ', u' الأنصار '),
+			(u' اشارة ', u' إشارة '),
+			(u' إقرأ ', u' اقرأ '),
+			(u' إمتياز ', u' امتياز '),
+			(u' ارق ', u' أرق '),
+			(u' أرثوذوكس ', u' أرثوذكس '),
+			(u' الأرثوذوكس ', u' الأرثوذكس '),
+			(u' أرثوذوكسية ', u' أرثوذكسية '),
+			(u' الأرثوذوكسية ', u' الأرثوذكسية '),
+			(u' الأرثوذوكسي ', u' الأرثوذكسي '),
+			(u' ارثوذوكس ', u' أرثوذكس '),
+			(u' ارثوذوكسي ', u' أرثوذكسي '),
+			(u' ارثوذوكسية ', u' أرثوذكسية '),
+			(u' الارثوذوكسية ', u' الأرثوذكسية '),
+			(u' اللة ', u' الله '),
+			(u' إختبار ', u' اختبار '),
+			(u'== روابط خارجية ==', u'== وصلات خارجية =='),
+			(u'==روابط خارجية==', u'== وصلات خارجية =='),
+		]
+	},
 }