http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10388
Revision: 10388
Author: xqt
Date: 2012-06-20 14:56:03 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
ignore minor version changes, see bug #3536604
Modified Paths:
--------------
trunk/pywikipedia/family.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2012-06-20 14:16:58 UTC (rev 10387)
+++ trunk/pywikipedia/family.py 2012-06-20 14:56:03 UTC (rev 10388)
@@ -4030,15 +4030,17 @@
# to not break family files.
return '1.20wmf4'
- def versionnumber(self, code):
+ def versionnumber(self, code, version=None):
"""Return an int identifying MediaWiki version.
Currently this is implemented as returning the minor version
number; i.e., 'X' in version '1.X.Y'
+ If version is given (e.g. from a MediaWiki page), extract the number from that string instead.
+
"""
R = re.compile(r"(\d+).(\d+)")
- M = R.search(self.version(code))
+ M = R.search(version or self.version(code))
if not M:
# Version string malformed; assume it should have been 1.10
return 10
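For illustration, here is a minimal standalone sketch of the extraction logic behind the new optional version argument. The function below is a simplified stand-in for Family.versionnumber, not the real method:

    import re

    def versionnumber(default_version, version=None):
        # Simplified stand-in for Family.versionnumber (illustration only).
        # Return the minor version 'X' of '1.X.Y', preferring an
        # explicitly passed version string over the family default.
        m = re.search(r"(\d+)\.(\d+)", version or default_version)
        if not m:
            # malformed version string; fall back to 1.10 as the patch does
            return 10
        return int(m.group(2))

    # '1.20wmf4' and '1.20wmf5' share minor version 20; '1.19.1' does not
    assert versionnumber('1.20wmf4') == 20
    assert versionnumber('1.20wmf4', version='1.20wmf5') == 20
    assert versionnumber('1.20wmf4', version='1.19.1') == 19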
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2012-06-20 14:16:58 UTC (rev 10387)
+++ trunk/pywikipedia/wikipedia.py 2012-06-20 14:56:03 UTC (rev 10388)
@@ -4526,8 +4526,13 @@
m = p.match(version)
if m:
version = m.group(1)
- if version != self.site.version():
- output(u'WARNING: Family file %s contains version number %s, but it should be %s' % (self.site.family.name, self.site.version(), version))
+ # only warn the operator when the minor version number has changed
+ versionnumber = self.site.family.versionnumber
+ if version != self.site.version() and \
+ versionnumber(self.site.lang,
+ version=version) != versionnumber(self.site.lang):
+ output(u'WARNING: Family file %s contains version number %s, but it should be %s'
+ % (self.site.family.name, self.site.version(), version))
# Verify case
if self.site.nocapitalize:
@@ -4687,8 +4692,13 @@
m = p.match(header['general']['generator'])
if m:
version = m.group(1)
- if version != self.site.version():
- output(u'WARNING: Family file %s contains version number %s, but it should be %s' % (self.site.family.name, self.site.version(), version))
+ # only warn the operator when the minor version number has changed
+ versionnumber = self.site.family.versionnumber
+ if version != self.site.version() and \
+ versionnumber(self.site.lang,
+ version=version) != versionnumber(self.site.lang):
+ output(u'WARNING: Family file %s contains version number %s, but it should be %s'
+ % (self.site.family.name, self.site.version(), version))
# Verify case
if self.site.nocapitalize:
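The change in both hunks boils down to a two-stage test: warn the operator only if the version strings differ and the extracted minor numbers differ as well, so pure wmf/patch-level bumps stay silent. A sketch of that test (should_warn and _minor are hypothetical helpers, not pywikibot API):

    import re

    def _minor(v):
        # hypothetical helper: extract 'X' from '1.X...', default to 10
        m = re.search(r"(\d+)\.(\d+)", v)
        return int(m.group(2)) if m else 10

    def should_warn(family_version, live_version):
        # hypothetical helper mirroring the patched condition
        return (family_version != live_version and
                _minor(family_version) != _minor(live_version))

    print(should_warn('1.20wmf4', '1.20wmf5'))  # False: wmf bump only
    print(should_warn('1.20wmf4', '1.21wmf1'))  # True: minor version changed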
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10387
Revision: 10387
Author: xqt
Date: 2012-06-20 14:16:58 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
shorten file name title (bug #3536400)
Modified Paths:
--------------
trunk/pywikipedia/flickrripper.py
Modified: trunk/pywikipedia/flickrripper.py
===================================================================
--- trunk/pywikipedia/flickrripper.py 2012-06-20 13:58:25 UTC (rev 10386)
+++ trunk/pywikipedia/flickrripper.py 2012-06-20 14:16:58 UTC (rev 10387)
@@ -140,40 +140,43 @@
return rawDescription.decode('utf-8')
-def getFilename(photoInfo=None, site=pywikibot.getSite(u'commons', u'commons'),
- project=u'Flickr'):
- ''' Build a good filename for the upload based on the username and the
+def getFilename(photoInfo=None, site=None, project=u'Flickr'):
+ """ Build a good filename for the upload based on the username and the
title. Prevents naming collisions.
- '''
+ """
+ if not site:
+ site = pywikibot.getSite(u'commons', u'commons')
username = photoInfo.find('photo').find('owner').attrib['username']
title = photoInfo.find('photo').find('title').text
if title:
- title = cleanUpTitle(title)
- else:
- title = u''
+ title = cleanUpTitle(title)
- if title == u'':
+ if not title:
+ # find the maximum byte length available for a MediaWiki title
+ maxBytes = 240 - len(project.encode('utf-8')) \
+ - len(username.encode('utf-8'))
description = photoInfo.find('photo').find('description').text
if description:
- if len(description)>120:
- description = description[0 : 120]
- title = cleanUpTitle(description)
+ descBytes = len(description.encode('utf-8'))
+ if descBytes > maxBytes:
+ # drop one character per excess byte; this may trim slightly
+ # more than needed for multi-byte text, but never too little
+ items = max(0, len(description) - (descBytes - maxBytes))
+ description = description[:items]
+ title = cleanUpTitle(description)
else:
title = u''
# Should probably have the id of the photo as last resort.
-
if pywikibot.Page(site, u'File:%s - %s - %s.jpg'
- % (title, project, username) ).exists():
+ % (title, project, username)).exists():
i = 1
while True:
- if (pywikibot.Page(site, u'File:%s - %s - %s (%s).jpg'
- % (title, project, username, str(i))).exists()):
- i = i + 1
+ if (pywikibot.Page(site, u'File:%s - %s - %s (%d).jpg'
+ % (title, project, username, i)).exists()):
+ i += 1
else:
- return u'%s - %s - %s (%s).jpg' % (title, project, username,
- str(i))
+ return u'%s - %s - %s (%d).jpg' % (title, project, username, i)
else:
return u'%s - %s - %s.jpg' % (title, project, username)
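The byte-budget arithmetic matters because MediaWiki limits page titles to 255 bytes of UTF-8, not 255 characters; the 240 used above leaves room for the ' - project - username (n).jpg' decoration. A sketch of the truncation rule (truncate_to_bytes is a hypothetical helper mirroring the patch's arithmetic):

    def truncate_to_bytes(text, max_bytes):
        # hypothetical helper: trim a unicode string so its UTF-8
        # encoding fits max_bytes. Dropping one character per excess
        # byte can over-trim multi-byte text, but never under-trims.
        over = len(text.encode('utf-8')) - max_bytes
        if over > 0:
            text = text[:max(0, len(text) - over)]
        return text

    # u'\xe9' is 2 bytes in UTF-8, so ten of them need 20 bytes;
    # with a 12-byte budget the rule keeps only 2 characters (4 bytes)
    short = truncate_to_bytes(u'\xe9' * 10, 12)
    assert len(short.encode('utf-8')) <= 12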
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10384
Revision: 10384
Author: xqt
Date: 2012-06-20 13:27:42 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
enable xml-dumps for solving double redirects
Modified Paths:
--------------
branches/rewrite/scripts/redirect.py
Modified: branches/rewrite/scripts/redirect.py
===================================================================
--- branches/rewrite/scripts/redirect.py 2012-06-20 13:11:29 UTC (rev 10383)
+++ branches/rewrite/scripts/redirect.py 2012-06-20 13:27:42 UTC (rev 10384)
@@ -17,17 +17,25 @@
and arguments can be:
+-xml Retrieve information from a local XML dump
+ (http://download.wikimedia.org). Argument can also be given as
+ "-xml:filename.xml". Cannot be used with -fullscan or -moves.
+
-fullscan Retrieve redirect pages from live wiki, not from a special page
-moves Use the page move log to find double-redirect candidates. Only
- works with action "double".
+ works with action "double"; cannot be used with -xml.
+ NOTE: If none of -xml, -fullscan or -moves is given, info will be
+ loaded from a special page of the live wiki.
+
-namespace:n Namespace to process. Can be given multiple times, for several
namespaces. If omitted, only the main (article) namespace is
treated.
-offset:n With -moves, the number of hours ago to start scanning moved
- pages. Otherwise, ignored.
+ pages. With -xml, the number of the redirect to restart with
+ (as shown in the progress output). Otherwise, ignored.
-start:title The starting page title in each namespace. Page need not exist.
@@ -40,14 +48,6 @@
-always Don't prompt you for each replacement.
"""
-
-# XML not yet implemented: deleted help text follows
-##-xml Retrieve information from a local XML dump
-## (http://download.wikimedia.org). Argument can also be given as
-## "-xml:filename.xml". Cannot be used with -api or -moves.
-## If neither of -xml -api -moves is given, info will be loaded
-## from a special page of the live wiki.
-
#
# (C) Daniel Herding, 2004.
# (C) Purodha Blissenbach, 2009.
@@ -62,7 +62,7 @@
import pywikibot
from pywikibot import i18n
from pywikibot import config
-# import xmlreader
+from pywikibot import xmlreader
class RedirectGenerator:
@@ -82,77 +82,75 @@
self.api_number = number
self.api_step = step
-# note: rewrite branch does not yet support XML dumps, so this is commented out
-# until that support is added
-## def get_redirects_from_dump(self, alsoGetPageTitles=False):
-## '''
-## Load a local XML dump file, look at all pages which have the
-## redirect flag set, and find out where they're pointing at. Return
-## a dictionary where the redirect names are the keys and the redirect
-## targets are the values.
-## '''
-## xmlFilename = self.xmlFilename
-## redict = {}
-## # open xml dump and read page titles out of it
-## dump = xmlreader.XmlDump(xmlFilename)
-## redirR = self.site.redirectRegex()
-## readPagesCount = 0
-## if alsoGetPageTitles:
-## pageTitles = set()
-## for entry in dump.parse():
-## readPagesCount += 1
-## # always print status message after 10000 pages
-## if readPagesCount % 10000 == 0:
-## pywikibot.output(u'%i pages read...' % readPagesCount)
-## if len(self.namespaces) > 0:
-## if pywikibot.Page(self.site, entry.title).namespace() \
-## not in self.namespaces:
-## continue
-## if alsoGetPageTitles:
-## pageTitles.add(entry.title.replace(' ', '_'))
-##
-## m = redirR.match(entry.text)
-## if m:
-## target = m.group(1)
-## # There might be redirects to another wiki. Ignore these.
-## for code in self.site.family.langs.keys():
-## if target.startswith('%s:' % code) \
-## or target.startswith(':%s:' % code):
-## if code == self.site.language():
-## # link to our wiki, but with the lang prefix
-## target = target[(len(code)+1):]
-## if target.startswith(':'):
-## target = target[1:]
-## else:
-## pywikibot.output(
-## u'NOTE: Ignoring %s which is a redirect to %s:'
-## % (entry.title, code))
-## target = None
-## break
-## # if the redirect does not link to another wiki
-## if target:
-## source = entry.title.replace(' ', '_')
-## target = target.replace(' ', '_')
-## # remove leading and trailing whitespace
-## target = target.strip('_')
-## # capitalize the first letter
-## if not pywikibot.getSite().nocapitalize:
-## source = source[:1].upper() + source[1:]
-## target = target[:1].upper() + target[1:]
-## if '#' in target:
-## target = target[:target.index('#')].rstrip("_")
-## if '|' in target:
-## pywikibot.output(
-## u'HINT: %s is a redirect with a pipelink.'
-## % entry.title)
-## target = target[:target.index('|')].rstrip("_")
-## if target: # in case preceding steps left nothing
-## redict[source] = target
-## if alsoGetPageTitles:
-## return redict, pageTitles
-## else:
-## return redict
-##
+ def get_redirects_from_dump(self, alsoGetPageTitles=False):
+ '''
+ Load a local XML dump file, look at all pages which have the
+ redirect flag set, and find out where they're pointing at. Return
+ a dictionary where the redirect names are the keys and the redirect
+ targets are the values.
+ '''
+ xmlFilename = self.xmlFilename
+ redict = {}
+ # open xml dump and read page titles out of it
+ dump = xmlreader.XmlDump(xmlFilename)
+ redirR = self.site.redirectRegex()
+ readPagesCount = 0
+ if alsoGetPageTitles:
+ pageTitles = set()
+ for entry in dump.parse():
+ readPagesCount += 1
+ # always print status message after 10000 pages
+ if readPagesCount % 10000 == 0:
+ pywikibot.output(u'%i pages read...' % readPagesCount)
+ if len(self.namespaces) > 0:
+ if pywikibot.Page(self.site, entry.title).namespace() \
+ not in self.namespaces:
+ continue
+ if alsoGetPageTitles:
+ pageTitles.add(entry.title.replace(' ', '_'))
+
+ m = redirR.match(entry.text)
+ if m:
+ target = m.group(1)
+ # There might be redirects to another wiki. Ignore these.
+ for code in self.site.family.langs.keys():
+ if target.startswith('%s:' % code) \
+ or target.startswith(':%s:' % code):
+ if code == self.site.language():
+ # link to our wiki, but with the lang prefix
+ target = target[(len(code)+1):]
+ if target.startswith(':'):
+ target = target[1:]
+ else:
+ pywikibot.output(
+ u'NOTE: Ignoring %s which is a redirect to %s:'
+ % (entry.title, code))
+ target = None
+ break
+ # if the redirect does not link to another wiki
+ if target:
+ source = entry.title.replace(' ', '_')
+ target = target.replace(' ', '_')
+ # remove leading and trailing whitespace
+ target = target.strip('_')
+ # capitalize the first letter
+ if not pywikibot.getSite().nocapitalize:
+ source = source[:1].upper() + source[1:]
+ target = target[:1].upper() + target[1:]
+ if '#' in target:
+ target = target[:target.index('#')].rstrip("_")
+ if '|' in target:
+ pywikibot.output(
+ u'HINT: %s is a redirect with a pipelink.'
+ % entry.title)
+ target = target[:target.index('|')].rstrip("_")
+ if target: # in case preceding steps left nothing
+ redict[source] = target
+ if alsoGetPageTitles:
+ return redict, pageTitles
+ else:
+ return redict
+
def get_redirect_pages_via_api(self):
"""Return generator that yields
Pages that are redirects.
@@ -299,23 +297,22 @@
count += 1
if count >= self.api_number:
break
-
+ elif self.xmlFilename:
+ redict = self.get_redirects_from_dump()
+ num = 0
+ for (key, value) in redict.iteritems():
+ num += 1
+ # check if the value - that is, the redirect target - is a
+ # redirect as well
+ if num > self.offset and value in redict:
+ yield key
+ pywikibot.output(u'\nChecking redirect %i of %i...'
+ % (num + 1, len(redict)))
else:
# retrieve information from double redirect special page
pywikibot.output(u'Retrieving special page...')
for redir_name in self.site.double_redirects():
yield redir_name.title()
-## else:
-## redict = self.get_redirects_from_dump()
-## num = 0
-## for (key, value) in redict.iteritems():
-## num += 1
-## # check if the value - that is, the redirect target - is a
-## # redirect as well
-## if num > self.offset and value in redict:
-## yield key
-## pywikibot.output(u'\nChecking redirect %i of %i...'
-## % (num + 1, len(redict)))
def get_moved_pages_redirects(self):
'''generate redirects to recently-moved pages'''
@@ -693,10 +690,8 @@
else:
pywikibot.output(u'Unknown argument: %s' % arg)
- if xmlFilename:
- pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
- elif not action: # or (xmlFilename and moved_pages)
- # or (api and xmlFilename):
+ if not action or (xmlFilename and moved_pages) \
+ or (fullscan and xmlFilename):
pywikibot.showHelp('redirect')
else:
gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,
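Once the dump has been parsed into a dictionary, finding double redirects reduces to a membership test: a source page is a candidate when its target is itself a key, i.e. also a redirect. A minimal sketch with invented sample data:

    # redict maps redirect source -> target, shaped like the return
    # value of get_redirects_from_dump(); the entries are invented
    redict = {
        u'Old_name': u'New_name',           # New_name is also a key: double
        u'New_name': u'Final_name',
        u'Final_name_typo': u'Final_name',  # Final_name is not a key: fine
    }

    doubles = [source for source, target in redict.items()
               if target in redict]
    print(doubles)  # [u'Old_name']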