[Gerrit] proposed bug fixing of https://sourceforge.net/p/pywikipedia... - change (pywikibot/compat) - Pywikibot-commits

15 Sep 2013

Xqt has submitted this change and it was merged.
Change subject: proposed bug fixing of https://sourceforge.net/p/pywikipediabot/bugs/1482/ (archivebot.py)
......................................................................
proposed bug fixing of https://sourceforge.net/p/pywikipediabot/bugs/1482/ (archivebot.py)
Change-Id: I20aa6d57c51c230f2adb4408ffc9bcf27f1e6bca
---
M archivebot.py
1 file changed, 22 insertions(+), 14 deletions(-)
Approvals:
  Xqt: Looks good to me, approved

diff --git a/archivebot.py b/archivebot.py
index d6483a5..d0f79ee 100644
--- a/archivebot.py
+++ b/archivebot.py
@@ -81,7 +81,7 @@
 import pagegenerators, query
 Site = pywikibot.getSite()
-import os, re, time, locale, traceback, string, urllib
+import os, re, time, locale, traceback, string, urllib, unicodedata
 try: #Get a constructor for the MD5 hash object
     import hashlib
@@ -234,33 +234,41 @@
         if not TM:
             TM = re.search(r'(\d\d?) (\S+) (\d\d\d\d) (\d\d):(\d\d) (.*?)', line)
         if TM:
-            TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl. %H:%M (%Z)")
+            # Strip away all diacritics in the Mn ('Mark, non-spacing') category.
+            # NFD decomposition splits combined characters (e.g. 'ä', LATIN SMALL
+            # LETTER A WITH DIAERESIS) into two entities: LATIN SMALL LETTER A
+            # and COMBINING DIAERESIS. The latter falls in the Mn category and is
+            # filtered out, resulting in 'a'.
+            _TM = ''.join(c for c in unicodedata.normalize('NFD', TM.group(0))
+                    if unicodedata.category(c) != 'Mn')
+
+            TIME = txt2timestamp(_TM,"%d. %b %Y kl. %H:%M (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%Y. %B %d., %H:%M (%Z)")
+                TIME = txt2timestamp(_TM, "%Y. %B %d., %H:%M (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%d. %b %Y kl.%H:%M (%Z)")
+                TIME = txt2timestamp(_TM, "%d. %b %Y kl.%H:%M (%Z)")
             if not TIME:
-                TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', TM.group(0)),
+                TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', _TM),
                                      "%H:%M, %d %B %Y")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%H:%M, %d %b %Y (%Z)")
+                TIME = txt2timestamp(_TM, "%H:%M, %d %b %Y (%Z)")
             if not TIME:
-                TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', TM.group(0)),
+                TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', _TM),
                                      "%H:%M, %d %b %Y")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%H:%M, %b %d %Y (%Z)")
+                TIME = txt2timestamp(_TM, "%H:%M, %b %d %Y (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%H:%M, %B %d %Y (%Z)")
+                TIME = txt2timestamp(_TM, "%H:%M, %B %d %Y (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%H:%M, %b %d, %Y (%Z)")
+                TIME = txt2timestamp(_TM, "%H:%M, %b %d, %Y (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%H:%M, %B %d, %Y (%Z)")
+                TIME = txt2timestamp(_TM, "%H:%M, %B %d, %Y (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0),"%d. %Bta %Y kello %H.%M (%Z)")
+                TIME = txt2timestamp(_TM,"%d. %Bta %Y kello %H.%M (%Z)")
             if not TIME:
-                TIME = txt2timestamp(TM.group(0), "%d %B %Y %H:%M (%Z)")
+                TIME = txt2timestamp(_TM, "%d %B %Y %H:%M (%Z)")
             if not TIME:
-                TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', TM.group(0)),
+                TIME = txt2timestamp(re.sub(' *([^ ]+) *', '', _TM),
                                      "%H:%M, %d. %b. %Y")
             if TIME:
                 self.timestamp = max(self.timestamp, time.mktime(TIME))
-- 
To view, visit https://gerrit.wikimedia.org/r/84204
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I20aa6d57c51c230f2adb4408ffc9bcf27f1e6bca
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Mpaa mpaa.wiki@gmail.com
Gerrit-Reviewer: Legoktm legoktm.wikipedia@gmail.com
Gerrit-Reviewer: Merlijn van Deen valhallasw@arctus.nl
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: jenkins-bot