http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11440
Revision: 11440
Author: amir
Date: 2013-04-23 16:02:18 +0000 (Tue, 23 Apr 2013)
Log Message:
-----------
fixing bug #3610818 in a way that doesn't cause bug #3455789. It's not very good coding though
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2013-04-23 15:20:48 UTC (rev 11439)
+++ trunk/pywikipedia/pywikibot/textlib.py 2013-04-23 16:02:18 UTC (rev 11440)
@@ -844,7 +844,9 @@
# Note: While allowing dots inside URLs, MediaWiki will regard
# dots at the end of the URL as not part of that URL.
# The same applies to comma, colon and some other characters.
- notAtEnd = '\]\s\.:;,<>"\|'
+ notAtEnd = '\]\s\.:;,<>"\|\)'
+ #This is specially set for bracketed links
+ notAtEndb = '\]\s\.:;,<>"\|'
# So characters inside the URL can be anything except whitespace,
# closing squared brackets, quotation marks, greater than and less
# than, and the last character also can't be parenthesis or another
@@ -857,11 +859,15 @@
regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
-
+ regexb = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
+ r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
+ r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEndb}
if withoutBracketed:
regex = r'(?<!\[)' + regex
elif onlyBracketed:
- regex = r'\[' + regex
+ regex = r'\[' + regexb
+ else:
+ regex = r'(?:(?<!\[)' + regex + r'|\[' + regexb + r')'
linkR = re.compile(regex)
return linkR
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11436
Revision: 11436
Author: drtrigon
Date: 2013-04-22 22:23:48 +0000 (Mon, 22 Apr 2013)
Log Message:
-----------
bug fix; do NOT shutdown logger in case of additional/accidental output
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2013-04-22 20:57:35 UTC (rev 11435)
+++ trunk/pywikipedia/wikipedia.py 2013-04-22 22:23:48 UTC (rev 11436)
@@ -9539,7 +9539,8 @@
not slow down other bots any more.
"""
get_throttle.drop()
- logging.shutdown()
+ logger.flush()
+ #logging.shutdown()
def _flush():
"""Wait for the page-putter to flush its queue.
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11434
Revision: 11434
Author: drtrigon
Date: 2013-04-22 18:57:15 +0000 (Mon, 22 Apr 2013)
Log Message:
-----------
improvement; adapted wikidata template page format to #switch syntax
Modified Paths:
--------------
trunk/pywikipedia/subster.py
Modified: trunk/pywikipedia/subster.py
===================================================================
--- trunk/pywikipedia/subster.py 2013-04-22 18:40:18 UTC (rev 11433)
+++ trunk/pywikipedia/subster.py 2013-04-22 18:57:15 UTC (rev 11434)
@@ -528,14 +528,14 @@
def data_convertContent(self, substed_content):
"""Converts the substed content to Wikidata format in order to save.
- Template page format:
+ Template page format (adapted from #switch):
<pre>
| key1 = value1
| key2 = value2
...
</pre>
- (1 line of wiki text is converted to 1 claim/statement, the lines
- have to be embedded into pre-tags and start with '|')
+ every entry has to start with a '|' and contain a '=', the entries
+ have to be embedded into pre-tags (entries may share the same line)
@param substed_content: New/Changed content (including tags).
@type substed_content: string
@@ -545,14 +545,10 @@
# DRTRIGON-130: convert talk page result to wikidata(base)
data = u'\n'.join(re.findall('<pre>(.*?)</pre>', substed_content,
re.S | re.I))
+ data = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', data)
res = {}
- for line in data.splitlines():
- #line = self.get_var_regex('(.*?)', '(.*?)').findall(line)
- line = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
- line = line.strip()
- if (not line) or (line[0] != u'|'):
- continue
- line = line.lstrip(u'|').split(u'=', 1)
+ for line in data.split(u'|'):
+ line = line.strip().split(u'=', 1)
if len(line) != 2:
continue
res[line[0].strip()] = line[1].strip()
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11433
Revision: 11433
Author: drtrigon
Date: 2013-04-22 18:40:18 +0000 (Mon, 22 Apr 2013)
Log Message:
-----------
bug fix; wikidata recognition for unchanged data
improvement; wikidata template page format further refined
Modified Paths:
--------------
trunk/pywikipedia/subster.py
Modified: trunk/pywikipedia/subster.py
===================================================================
--- trunk/pywikipedia/subster.py 2013-04-22 18:10:03 UTC (rev 11432)
+++ trunk/pywikipedia/subster.py 2013-04-22 18:40:18 UTC (rev 11433)
@@ -527,24 +527,35 @@
def data_convertContent(self, substed_content):
"""Converts the substed content to Wikidata format in order to save.
- (1 line of wiki text is converted to 1 claim/statement)
- @param substed_content: New content (with tags).
+ Template page format:
+ <pre>
+ | key1 = value1
+ | key2 = value2
+ ...
+ </pre>
+ (1 line of wiki text is converted to 1 claim/statement, the lines
+ have to be embedded into pre-tags and start with '|')
+
+ @param substed_content: New/Changed content (including tags).
@type substed_content: string
+
+ Returns the extracted and converted data.
"""
# DRTRIGON-130: convert talk page result to wikidata(base)
- # TODO: consider format; every line starting with "|" is data
- # TODO: combine with 'outputContentDiff' in order to update changed only
+ data = u'\n'.join(re.findall('<pre>(.*?)</pre>', substed_content,
+ re.S | re.I))
res = {}
- for line in substed_content.splitlines():
- #data = self.get_var_regex('(.*?)', '(.*?)').findall(line)
- data = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
- #if not data:
- if data == line:
+ for line in data.splitlines():
+ #line = self.get_var_regex('(.*?)', '(.*?)').findall(line)
+ line = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
+ line = line.strip()
+ if (not line) or (line[0] != u'|'):
continue
- data = data.lstrip(u'|')
- key, value = data.split(u'=')
- res[key.strip()] = value.strip()
+ line = line.lstrip(u'|').split(u'=', 1)
+ if len(line) != 2:
+ continue
+ res[line[0].strip()] = line[1].strip()
return res
@@ -562,14 +573,14 @@
datapage = pywikibot.DataPage(self.site, page.title())
links = datapage.searchentities(u'%s:%s' % (self._bot_config['BotName'], datapage.title().split(u':')[1]))
for element in links:
- propid = self._bot_config['data_PropertyId']
+ propid = int(self._bot_config['data_PropertyId'])
el = element[u'aliases'][0].split(u':')
item = el[2]
if item not in data:
pywikibot.output(u'Value "%s" not found.' % (item,))
data[item] = u'%s: N/A' % self._bot_config['BotName']
if len(el) > 3:
- propid = el[3]
+ propid = int(el[3])
dataoutpage = pywikibot.DataPage(self.site, element['id'])
@@ -579,8 +590,8 @@
claim = [ claim for claim in buf[u'claims'] if (claim['m'][1] == propid) ]
# TODO: does this check (if) work with multiple claims per property?
if (not claim) or (claim[0]['m'][3] != data[item]):
- pywikibot.output(u'%s in %s <--- %s = %s' %\
- (element[u'aliases'][0], dataoutpage.title(asLink=True), item, data[item]))
+ pywikibot.output(u'%s in %s changed to "%s"' %\
+ (element[u'aliases'][0], dataoutpage.title(asLink=True), data[item]))
dataoutpage.editclaim(u'p%s' % propid, data[item],
refs={"p%s" % propid:
[{"snaktype": "value",