[Pywikipedia-l] SVN: [6222] trunk/pywikipedia/archivebot.py
misza13 at svn.wikimedia.org
misza13 at svn.wikimedia.org
Mon Jan 5 17:27:25 UTC 2009
Revision: 6222
Author: misza13
Date: 2009-01-05 17:27:25 +0000 (Mon, 05 Jan 2009)
Log Message:
-----------
Improved timestamp parsing
Modified Paths:
--------------
trunk/pywikipedia/archivebot.py
Modified: trunk/pywikipedia/archivebot.py
===================================================================
--- trunk/pywikipedia/archivebot.py 2009-01-05 13:49:21 UTC (rev 6221)
+++ trunk/pywikipedia/archivebot.py 2009-01-05 17:27:25 UTC (rev 6222)
@@ -37,7 +37,7 @@
import wikipedia, pagegenerators
Site = wikipedia.getSite()
-import re, time, locale, traceback, string
+import os, re, time, locale, traceback, string
try: #Get a constructor for the MD5 hash object
import hashlib
@@ -178,9 +178,14 @@
def txt2timestamp(txt, format):
"""Attempts to convert the timestamp 'txt' according to given 'format'.
On success, returns the time tuple; on failure, returns None."""
+ #print txt, format
try:
return time.strptime(txt,format)
except ValueError:
+ try:
+ return time.strptime(txt.encode('utf8'),format)
+ except:
+ pass
return None
@@ -212,15 +217,15 @@
# 16:36, 30 March 2008 (UTC)
# huwiki
# 2007. december 8., 13:42 (CET)
- TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d) \(.*?\)', line)
+ TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\S+) (\d\d\d\d) \(.*?\)', line)
if not TM:
- TM = re.search(r'(\d\d):(\d\d), (\w+) (\d\d?), (\d\d\d\d) \(.*?\)', line)
+ TM = re.search(r'(\d\d):(\d\d), (\S+) (\d\d?), (\d\d\d\d) \(.*?\)', line)
if not TM:
TM = re.search(r'(\d{4})\. (\S+) (\d\d?)\., (\d\d:\d\d) \(.*?\)', line)
# 18. apr 2006 kl.18:39 (UTC)
# 4. nov 2006 kl. 20:46 (CET)
if not TM:
- TM = re.search(r'(\d\d?)\. (\w+) (\d\d\d\d) kl\.\W*(\d\d):(\d\d) \(.*?\)', line)
+ TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kl\.\W*(\d\d):(\d\d) \(.*?\)', line)
if TM:
# wikipedia.output(TM)
TIME = txt2timestamp(TM.group(0),"%d. %b %Y kl. %H:%M (%Z)")
@@ -233,6 +238,8 @@
if not TIME:
TIME = txt2timestamp(TM.group(0),"%H:%M, %d %b %Y (%Z)")
if not TIME:
+ TIME = txt2timestamp(re.sub(' *\([^ ]+\) *','',TM.group(0)),"%H:%M, %d %b %Y")
+ if not TIME:
TIME = txt2timestamp(TM.group(0),"%H:%M, %b %d %Y (%Z)")
if not TIME:
TIME = txt2timestamp(TM.group(0),"%H:%M, %b %d, %Y (%Z)")
More information about the Pywikipedia-l mailing list