http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11433
Revision: 11433
Author:   drtrigon
Date:     2013-04-22 18:40:18 +0000 (Mon, 22 Apr 2013)

Log Message:
-----------
bug fix; wikidata recognition for unchanged data improvement; wikidata template page format further refined
Modified Paths:
--------------
    trunk/pywikipedia/subster.py
Modified: trunk/pywikipedia/subster.py
===================================================================
--- trunk/pywikipedia/subster.py    2013-04-22 18:10:03 UTC (rev 11432)
+++ trunk/pywikipedia/subster.py    2013-04-22 18:40:18 UTC (rev 11433)
@@ -527,24 +527,35 @@
     def data_convertContent(self, substed_content):
         """Converts the substed content to Wikidata format in order to save.
-           (1 line of wiki text is converted to 1 claim/statement)
 
-           @param substed_content: New content (with tags).
+           Template page format:
+           <pre>
+           | key1 = value1
+           | key2 = value2
+           ...
+           </pre>
+           (1 line of wiki text is converted to 1 claim/statement, the lines
+           have to be embedded into pre-tags and start with '|')
+
+           @param substed_content: New/Changed content (including tags).
            @type  substed_content: string
+
+           Returns the extracted and converted data.
         """
         # DRTRIGON-130: convert talk page result to wikidata(base)
-        # TODO: consider format; every line starting with "|" is data
-        # TODO: combine with 'outputContentDiff' in order to update changed only
+        data = u'\n'.join(re.findall('<pre>(.*?)</pre>', substed_content,
+                                     re.S | re.I))
         res = {}
-        for line in substed_content.splitlines():
-            #data = self.get_var_regex('(.*?)', '(.*?)').findall(line)
-            data = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
-            #if not data:
-            if data == line:
+        for line in data.splitlines():
+            #line = self.get_var_regex('(.*?)', '(.*?)').findall(line)
+            line = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
+            line = line.strip()
+            if (not line) or (line[0] != u'|'):
                 continue
-            data = data.lstrip(u'|')
-            key, value = data.split(u'=')
-            res[key.strip()] = value.strip()
+            line = line.lstrip(u'|').split(u'=', 1)
+            if len(line) != 2:
+                continue
+            res[line[0].strip()] = line[1].strip()
 
         return res
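
As a rough standalone sketch (outside the diff) of what the reworked data_convertContent now does: it collects '| key = value' lines found inside <pre>...</pre> blocks into a dict. The bot's get_var_regex() tag substitution is left out here and the sample text is made up for illustration:

    # Minimal sketch of the '<pre>| key = value</pre>' parsing introduced above;
    # the get_var_regex() substitution of the bot is omitted for brevity.
    import re

    def convert_pre_content(substed_content):
        """Collect '| key = value' lines inside <pre>...</pre> into a dict."""
        data = u'\n'.join(re.findall('<pre>(.*?)</pre>', substed_content,
                                     re.S | re.I))
        res = {}
        for line in data.splitlines():
            line = line.strip()
            if (not line) or (line[0] != u'|'):
                continue                          # only '|' lines carry data
            parts = line.lstrip(u'|').split(u'=', 1)
            if len(parts) != 2:
                continue                          # skip lines without '='
            res[parts[0].strip()] = parts[1].strip()
        return res

    sample = u"<pre>\n| key1 = value1\n| key2 = value2\n</pre>"
    print(convert_pre_content(sample))
    # {u'key1': u'value1', u'key2': u'value2'}  (key order may vary)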
@@ -562,14 +573,14 @@
         datapage = pywikibot.DataPage(self.site, page.title())
         links = datapage.searchentities(u'%s:%s' % (self._bot_config['BotName'],
                                         datapage.title().split(u':')[1]))
         for element in links:
-            propid = self._bot_config['data_PropertyId']
+            propid = int(self._bot_config['data_PropertyId'])
             el = element[u'aliases'][0].split(u':')
             item = el[2]
             if item not in data:
                 pywikibot.output(u'Value "%s" not found.' % (item,))
                 data[item] = u'%s: N/A' % self._bot_config['BotName']
             if len(el) > 3:
-                propid = el[3]
+                propid = int(el[3])
 
             dataoutpage = pywikibot.DataPage(self.site, element['id'])
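
The int() casts are the actual bug fix: the property id read from the bot configuration (or from the alias suffix el[3]) is a string, while the claim data compared against in the next hunk (claim['m'][1]) apparently holds it as an integer, so an existing claim was never recognised. A small illustrative sketch with an assumed, simplified claim layout:

    # Assumed, simplified claim layout modelled on the comparison in the next hunk.
    claim = {'m': [u'value', 123, u'string', u'old value']}

    propid = u'123'                        # property id as read from the config
    print(claim['m'][1] == propid)         # False: str vs. int, claim not recognised
    print(claim['m'][1] == int(propid))    # True:  existing claim is found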
@@ -579,8 +590,8 @@
             claim = [ claim for claim in buf[u'claims'] if (claim['m'][1] == propid) ]
             # TODO: does this check (if) work with multiple claims per property?
             if (not claim) or (claim[0]['m'][3] != data[item]):
-                pywikibot.output(u'%s in %s <--- %s = %s' %\
-                  (element[u'aliases'][0], dataoutpage.title(asLink=True), item, data[item]))
+                pywikibot.output(u'%s in %s changed to "%s"' %\
+                  (element[u'aliases'][0], dataoutpage.title(asLink=True), data[item]))
                 dataoutpage.editclaim(u'p%s' % propid, data[item],
                                       refs={"p%s" % propid:
                                             [{"snaktype": "value",
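
The new output message belongs to the guard shown in this hunk: a claim is only written when no claim with the matching property id exists yet, or when its stored value differs from the freshly substed one, which is the "recognition for unchanged data" mentioned in the log message. A standalone sketch of that guard, with buf, data, item and propid as illustrative stand-ins for the bot's state:

    # Edit only if the claim is missing or its stored value differs from the new data.
    buf    = {u'claims': [{'m': [u'value', 123, u'string', u'old value']}]}
    data   = {u'item-a': u'old value'}     # freshly extracted value
    item   = u'item-a'
    propid = 123

    claim = [c for c in buf[u'claims'] if c['m'][1] == propid]
    if (not claim) or (claim[0]['m'][3] != data[item]):
        print(u'would edit claim p%s -> "%s"' % (propid, data[item]))
    else:
        print(u'data unchanged, no edit needed')   # this branch is taken here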