[Pywikipedia-l] SVN: [6222] trunk/pywikipedia/archivebot.py

misza13 at svn.wikimedia.org misza13 at svn.wikimedia.org
Mon Jan 5 17:27:25 UTC 2009


Revision: 6222
Author:   misza13
Date:     2009-01-05 17:27:25 +0000 (Mon, 05 Jan 2009)

Log Message:
-----------
Improved timestamp parsing

Modified Paths:
--------------
    trunk/pywikipedia/archivebot.py

Modified: trunk/pywikipedia/archivebot.py
===================================================================
--- trunk/pywikipedia/archivebot.py	2009-01-05 13:49:21 UTC (rev 6221)
+++ trunk/pywikipedia/archivebot.py	2009-01-05 17:27:25 UTC (rev 6222)
@@ -37,7 +37,7 @@
 import wikipedia, pagegenerators
 Site = wikipedia.getSite()
 
-import re, time, locale, traceback, string
+import os, re, time, locale, traceback, string
 
 try: #Get a constructor for the MD5 hash object
     import hashlib
@@ -178,9 +178,14 @@
 def txt2timestamp(txt, format):
     """Attempts to convert the timestamp 'txt' according to given 'format'.
     On success, returns the time tuple; on failure, returns None."""
+    #print txt, format
     try:
         return time.strptime(txt,format)
     except ValueError:
+        try:
+            return time.strptime(txt.encode('utf8'),format)
+        except:
+            pass
         return None
 
 
@@ -212,15 +217,15 @@
 # 16:36, 30 March 2008 (UTC)
 # huwiki
 # 2007. december 8., 13:42 (CET)
-        TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d) \(.*?\)', line)
+        TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\S+) (\d\d\d\d) \(.*?\)', line)
         if not TM:
-            TM = re.search(r'(\d\d):(\d\d), (\w+) (\d\d?), (\d\d\d\d) \(.*?\)', line)
+            TM = re.search(r'(\d\d):(\d\d), (\S+) (\d\d?), (\d\d\d\d) \(.*?\)', line)
         if not TM:
             TM = re.search(r'(\d{4})\. (\S+) (\d\d?)\., (\d\d:\d\d) \(.*?\)', line)
 # 18. apr 2006 kl.18:39 (UTC)
 # 4. nov 2006 kl. 20:46 (CET)
         if not TM:
-	        TM = re.search(r'(\d\d?)\. (\w+) (\d\d\d\d) kl\.\W*(\d\d):(\d\d) \(.*?\)', line)
+	        TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kl\.\W*(\d\d):(\d\d) \(.*?\)', line)
         if TM:
 #            wikipedia.output(TM)
             TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl. %H:%M (%Z)")
@@ -233,6 +238,8 @@
             if not TIME:
                 TIME = txt2timestamp(TM.group(0),"%H:%M, %d %b %Y (%Z)")
             if not TIME:
+                TIME = txt2timestamp(re.sub(' *\([^ ]+\) *','',TM.group(0)),"%H:%M, %d %b %Y")
+            if not TIME:
                 TIME = txt2timestamp(TM.group(0),"%H:%M, %b %d %Y (%Z)")
             if not TIME:
                 TIME = txt2timestamp(TM.group(0),"%H:%M, %b %d, %Y (%Z)")





More information about the Pywikipedia-l mailing list