Revision: 8765
Author: purodha
Date: 2010-12-07 15:18:55 +0000 (Tue, 07 Dec 2010)
Log Message:
-----------
Add -hintsonly option to interwiki.py making the 1st existing hinted page the start page if none supplied.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-12-07 12:50:59 UTC (rev 8764)
+++ trunk/pywikipedia/interwiki.py 2010-12-07 15:18:55 UTC (rev 8765)
@@ -110,6 +110,18 @@
could be used for further explainings of the bot action.
This will only be used in non-autonomous mode.
+ -hintsonly The bot does not ask for a page to work on, even if none of
+ the above page sources was specified. This will make the
+ first existing page of -hint or -hintfile slip in as the start
+ page, determining properties like namespace, disambiguation
+ state, and so on. When no existing page is found in the
+ hints, the bot does nothing.
+ Hitting return without input on the "Which page to check:"
+ prompt has the same effect as using -hintsonly.
+ Options like -back, -same or -wiktionary are in effect only
+ after a page has been found to work on.
+ (note: without ending colon)
+
These arguments are useful to provide hints to the bot:
-hint: used as -hint:de:Anweisung to give the robot a hint
@@ -952,19 +964,21 @@
this Object.
"""
- def __init__(self, originPage, hints = None):
+ def __init__(self, originPage = None, hints = None):
"""Constructor. Takes as arguments the Page on the home wiki
plus optionally a list of hints for translation"""
if globalvar.contentsondisk:
- originPage = StoredPage(originPage)
+ if originPage:
+ originPage = StoredPage(originPage)
# Remember the "origin page"
self.originPage = originPage
# todo is a list of all pages that still need to be analyzed.
# Mark the origin page as todo.
self.todo = PageTree()
- self.todo.add(originPage)
+ if originPage:
+ self.todo.add(originPage)
# done is a list of all pages that have been analyzed and that
# are known to belong to this subject.
@@ -973,7 +987,10 @@
# pages are values. It stores where we found each page.
# As we haven't yet found a page that links to the origin page, we
# start with an empty list for it.
- self.foundIn = {self.originPage:[]}
+ if originPage:
+ self.foundIn = {self.originPage:[]}
+ else:
+ self.foundIn = {}
# This is a list of all pages that are currently scheduled for
# download.
self.pending = PageTree()
@@ -1024,23 +1041,25 @@
"""
for tree in [self.done, self.pending, self.todo]:
for page in tree.filter(site):
- if page.namespace() == self.originPage.namespace():
+ # -hintsonly: before we have an origin page, any namespace will do.
+ if self.originPage and page.namespace() == self.originPage.namespace():
if page.exists() and not page.isRedirectPage() and not page.isCategoryRedirect():
return page
return None
def translate(self, hints = None, keephintedsites = False):
"""Add the given translation hints to the todo list"""
- if globalvar.same:
+ if globalvar.same and self.originPage:
if hints:
- pages = titletranslate.translate(self.originPage, hints = hints + ['all:'], auto = globalvar.auto, removebrackets
-= globalvar.hintnobracket)
+ pages = titletranslate.translate(self.originPage, hints = hints + ['all:'],
+ auto = globalvar.auto, removebrackets = globalvar.hintnobracket)
else:
- pages = titletranslate.translate(self.originPage, hints = ['all:'], auto = globalvar.auto, removebrackets
-= globalvar.hintnobracket)
+ pages = titletranslate.translate(self.originPage, hints = ['all:'],
+ auto = globalvar.auto, removebrackets = globalvar.hintnobracket)
else:
- pages = titletranslate.translate(self.originPage, hints = hints, auto = globalvar.auto, removebrackets
-= globalvar.hintnobracket)
+ pages = titletranslate.translate(self.originPage, hints = hints,
+ auto = globalvar.auto, removebrackets = globalvar.hintnobracket,
+ site = pywikibot.getSite() )
for page in pages:
if globalvar.contentsondisk:
page = StoredPage(page)
@@ -1100,7 +1119,7 @@
"""
if self.forcedStop:
return False
- if globalvar.nobackonly:
+ if globalvar.nobackonly and self.originPage: # cannot check backlink before we have an origin page
if page == self.originPage:
try:
pywikibot.output(u"%s has a backlink from %s."
@@ -1138,7 +1157,7 @@
if linkedPage in self.foundIn:
# We have seen this page before, don't ask again.
return False
- elif self.originPage.namespace() != linkedPage.namespace():
+ elif self.originPage and self.originPage.namespace() != linkedPage.namespace():
# Allow for a mapping between different namespaces
crossFrom = self.originPage.site().family.crossnamespace.get(self.originPage.namespace(), {})
crossTo = crossFrom.get(self.originPage.site().language(), crossFrom.get('_default', {}))
@@ -1181,10 +1200,11 @@
return True
else:
# same namespaces, no problem
+ # or no origin page yet, also no problem
return False
def wiktionaryMismatch(self, page):
- if globalvar.same=='wiktionary':
+ if self.originPage and globalvar.same=='wiktionary':
if page.title().lower() != self.originPage.title().lower():
pywikibot.output(u"NOTE: Ignoring %s for %s in wiktionary mode" % (page.title(asLink=True), self.originPage.title(asLink=True)))
return True
@@ -1207,6 +1227,8 @@
alternativePage is either None, or a page that the user has
chosen to use instead of the given page.
"""
+ if not self.originPage:
+ return (False, None) # any page matches until we have an origin page
if globalvar.autonomous:
if self.originPage.isDisambig() and not page.isDisambig():
pywikibot.output(u"NOTE: Ignoring link from disambiguation page %s to non-disambiguation %s"
@@ -1296,8 +1318,8 @@
elif not newhint:
break
else:
- pages = titletranslate.translate(self.originPage, hints = [newhint], auto = globalvar.auto, removebrackets
-= globalvar.hintnobracket)
+ pages = titletranslate.translate(self.originPage, hints = [newhint],
+ auto = globalvar.auto, removebrackets = globalvar.hintnobracket)
for page in pages:
self.addIfNew(page, counter, None)
if globalvar.hintsareright:
@@ -1323,9 +1345,10 @@
if globalvar.skipauto:
dictName, year = page.autoFormat()
if dictName is not None:
- pywikibot.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)'
- % (self.originPage.site().language(), self.originPage.title(),
- page.site().language(),page.title(),dictName,year))
+ if self.originPage:
+ pywikibot.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)'
+ % (self.originPage.site().language(), self.originPage.title(),
+ page.site().language(),page.title(),dictName,year))
# Abort processing if the bot is running in autonomous mode.
if globalvar.autonomous:
@@ -1371,7 +1394,8 @@
if not globalvar.quiet or pywikibot.verbose:
pywikibot.output(u"NOTE: %s is %sredirect to %s"
% (page.aslink(True), redir, redirectTargetPage.aslink(True)))
- if page == self.originPage:
+ if self.originPage is None or page == self.originPage:
+ # the 1st existing page becomes the origin page, if none was supplied
if globalvar.initialredirect:
if globalvar.contentsondisk:
redirectTargetPage = StoredPage(redirectTargetPage)
@@ -1396,8 +1420,8 @@
if self.addIfNew(redirectTargetPage, counter, page):
if config.interwiki_shownew or pywikibot.verbose:
pywikibot.output(u"%s: %s gives new %sredirect %s"
- % (self.originPage.title(asLink=True), page.aslink(True),
- redir, redirectTargetPage.aslink(True)))
+ % (self.originPage.title(asLink=True), page.aslink(True),
+ redir, redirectTargetPage.aslink(True)))
continue
# must be behind the page.isRedirectPage() part
@@ -1410,7 +1434,8 @@
for site, count in self.todo.siteCounts():
counter.minus(site, count)
self.todo = PageTree()
- self.done = PageTree()
+ self.done = PageTree()
+ self.originPage = None
continue
elif page.section():
@@ -1419,6 +1444,9 @@
continue
# Page exists, isnt a redirect, and is a plain link (no section)
+ if self.originPage is None:
+ # the 1st existing page becomes the origin page, if none was supplied
+ self.originPage = page
try:
iw = page.interwiki()
except pywikibot.NoSuchSite:
@@ -1665,10 +1693,11 @@
if self.forcedStop: # autonomous with problem
pywikibot.output(u"======Aborted processing %s======" % self.originPage.aslink(True))
return
- if self.originPage.isRedirectPage():
- return
- if self.originPage.isCategoryRedirect():
- return
+ if self.originPage:
+ if self.originPage.isRedirectPage():
+ return
+ if self.originPage.isCategoryRedirect():
+ return
if not self.untranslated and globalvar.untranslatedonly:
return
# The following check is not always correct and thus disabled.
@@ -1677,14 +1706,18 @@
# if len(self.done) == 1:
# # No interwiki at all
# return
- pywikibot.output(u"======Post-processing %s======" % self.originPage.aslink(True))
+ if self.originPage:
+ pywikibot.output(u"======Post-processing %s======" % self.originPage.aslink(True))
# Assemble list of accepted interwiki links
new = self.assemble()
if new is None: # User said give up
pywikibot.output(u"======Aborted processing %s======" % self.originPage.aslink(True))
return
+ if not len(new): # nothing else to do
+ return
# Make sure new contains every page link, including the page we are processing
+ # TODO: should be moved to assemble()
# replaceLinks will skip the site it's working on.
if self.originPage.site() not in new:
if not self.originPage.site().family.interwiki_forward: #TODO: make this possible as well.
@@ -2155,7 +2188,8 @@
else: mode = 'written'
f = codecs.open(dumpfn, mode[0], 'utf-8')
for subj in self.subjects:
- f.write(subj.originPage.aslink(None)+'\n')
+ if subj.originPage:
+ f.write(subj.originPage.aslink(None)+'\n')
f.close()
pywikibot.output(u'Dump %s (%s) %s.' % (site.lang, site.family.name, mode))
return dumpfn
@@ -2430,6 +2464,7 @@
def main():
singlePageTitle = []
+ opthintsonly = False
start = None
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
@@ -2485,6 +2520,8 @@
optRestore = not globalvar.restoreAll
elif arg == '-continue':
optContinue = True
+ elif arg == '-hintsonly':
+ opthintsonly = True
elif arg.startswith('-namespace:'):
try:
namespaces.append(int(arg[11:]))
@@ -2593,9 +2630,12 @@
readWarnfile(warnfile, bot)
else:
singlePageTitle = ' '.join(singlePageTitle)
- if not singlePageTitle:
- singlePageTitle = pywikibot.input(u'Which page to check:')
- singlePage = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
+ if not singlePageTitle and not opthintsonly:
+ singlePageTitle = pywikibot.input(u'Which page to check:')
+ if singlePageTitle:
+ singlePage = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
+ else:
+ singlePage = None
bot.add(singlePage, hints = globalvar.hints)
try:
Revision: 8764
Author: purodha
Date: 2010-12-07 12:50:59 +0000 (Tue, 07 Dec 2010)
Log Message:
-----------
Allow config.py to be verbose about where it got its data from. More comments.
Modified Paths:
--------------
trunk/pywikipedia/config.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2010-12-06 15:13:17 UTC (rev 8763)
+++ trunk/pywikipedia/config.py 2010-12-07 12:50:59 UTC (rev 8764)
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
#
# (C) Rob W.W. Hooft, 2003
+# Purodha Blissenbach (Modifier), 2010
+
#
# Distributed under the terms of the MIT license.
#
@@ -42,12 +44,17 @@
# sysopnames['wiktionary']['en'] = 'myEnglishUsername'
usernames = {}
sysopnames = {}
+
+# See section SOLVE_DISAMBIGUATION SETTINGS for details.
disambiguation_comment = {}
+# This is currently not used anywhere:
gdab_namespaces = {}
+# This is currently not used anywhere:
account_global = False
# Solve captchas in the webbrowser. Setting this to False will result in the
# exception CaptchaError being thrown if a captcha is encountered.
+#TODO: allow more flexibility, such as runtime choices, skipping, and postponing
solve_captcha = True
# Some sites will require password authentication to access the HTML pages at
@@ -460,6 +467,16 @@
# End of configuration section
# ============================
+
+# is config verbose?
+_verbose = False
+for _arg in __sys.argv[1:]:
+ if _arg == "-v" or _arg == "-verbose":
+ _verbose = True
+ break
+if _verbose:
+ print "Config.py"
+
# Get the names of all known families, and initialize
# with empty dictionaries
import wikipediatools as _wt
@@ -568,6 +585,14 @@
return path[len(base_dir) + len(os.path.sep) : ]
return path
+
+if _verbose:
+ print "- base_dir: ", base_dir
+
+# Exit message
+if _verbose:
+ print "- done."
+
#
# When called as main program, list all configuration variables
#
@@ -577,6 +602,12 @@
for _arg in __sys.argv[1:]:
if _arg == "modified":
_all = 0
+ elif _arg == "-v":
+ pass
+ elif _arg == "-verbose":
+ pass
+ elif _arg.startswith("-dir:"):
+ pass
else:
print "Unknown arg %s ignored"%_arg
_k = globals().keys()
Revision: 8763
Author: purodha
Date: 2010-12-06 15:13:17 +0000 (Mon, 06 Dec 2010)
Log Message:
-----------
Interwiki.py: Bugfix, -askhints could ask for hints on a nonexisting page and choke,
when preparing for page contents to be shown.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-12-06 11:26:20 UTC (rev 8762)
+++ trunk/pywikipedia/interwiki.py 2010-12-06 15:13:17 UTC (rev 8763)
@@ -1276,6 +1276,7 @@
# Do not ask hints for pages that we don't work on anyway
return
if (self.untranslated or globalvar.askhints) and not self.hintsAsked \
+ and self.originPage and self.originPage.exists() \
and not self.originPage.isRedirectPage() and not self.originPage.isCategoryRedirect():
# Only once!
self.hintsAsked = True
Revision: 8762
Author: purodha
Date: 2010-12-06 11:26:20 +0000 (Mon, 06 Dec 2010)
Log Message:
-----------
Interwiki.py: Allow commons, incubator and other wikis forwarding their interlanguagelinks to wikipedia to act as hint sources.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
trunk/pywikipedia/titletranslate.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-12-06 02:14:54 UTC (rev 8761)
+++ trunk/pywikipedia/interwiki.py 2010-12-06 11:26:20 UTC (rev 8762)
@@ -96,13 +96,15 @@
These arguments control miscellanous bot behaviour:
- -quiet: Use this option to get less output
+ -quiet Use this option to get less output
+ (note: without ending colon)
- -async: Put page on queue to be saved to wiki asynchronously. This
+ -async Put page on queue to be saved to wiki asynchronously. This
enables loading pages during saving throtteling and gives a
better performance.
NOTE: For post-processing it always assumes that saving the
the pages was sucessful.
+ (note: without ending colon)
-summary: Set an additional action summary message for the edit. This
could be used for further explainings of the bot action.
@@ -119,19 +121,29 @@
There are some special hints, trying a number of languages
at once:
- * all: All languages with at least ca. 100 articles.
- * 10: The 10 largest languages (sites with most
- articles). Analogous for any other natural
- number.
- * arab: All languages using the Arabic alphabet.
- * cyril: All languages that use the Cyrillic alphabet.
- * chinese: All Chinese dialects.
- * latin: All languages using the Latin script.
- * scand: All Scandinavian languages.
+ * all: All languages with at least ca. 100 articles.
+ * 10: The 10 largest languages (sites with most
+ articles). Analogous for any other natural
+ number.
+ * arab: All languages using the Arabic alphabet.
+ * cyril: All languages that use the Cyrillic alphabet.
+ * chinese: All Chinese dialects.
+ * latin: All languages using the Latin script.
+ * scand: All Scandinavian languages.
- Languages and groups having the same page title can be
- combined, as in -hint:5,scand,sr,pt:New_York
+ Names of families that forward their interlanguage links
+ to the wiki family being worked upon can be used, they are:
+ with -family=wikipedia only:
+ * commons: Interlanguage links of Mediawiki Commons.
+ * incubator: Links in pages on the Mediawiki Incubator.
+ * meta: Interlanguage links of named pages on Meta.
+ * species: Interlanguage links of the wikispecies wiki.
+ * strategy: Links in pages on Wikimedias strategy wiki.
+ * test: Take interwiki links from Test Wikipedia
+ Languages, groups and families having the same page title
+ can be combined, as -hint:5,scand,sr,pt,commons:New_York
+
-hintfile: similar to -hint, except that hints are taken from the given
file, enclosed in [[]] each, instead of the command line.
@@ -302,7 +314,7 @@
the "-restore" or "-continue" option, and finish all the subjects in that list.
After finishing the dump file will be deleted. To run the interwiki-bot on all
pages on a language, run it with option "-start:!", and if it takes so long
-ithat you have to break it off, use "-continue" next time.
+that you have to break it off, use "-continue" next time.
"""
#
@@ -1383,7 +1395,8 @@
if self.addIfNew(redirectTargetPage, counter, page):
if config.interwiki_shownew or pywikibot.verbose:
pywikibot.output(u"%s: %s gives new %sredirect %s"
- % (self.originPage.title(asLink=True), page.aslink(True), redir, redirectTargetPage.aslink(True)))
+ % (self.originPage.title(asLink=True), page.aslink(True),
+ redir, redirectTargetPage.aslink(True)))
continue
# must be behind the page.isRedirectPage() part
@@ -1528,6 +1541,9 @@
for page in self.done:
if page.exists() and not page.isRedirectPage() and not page.isCategoryRedirect():
site = page.site()
+ if site.family.interwiki_forward:
+ #TODO: allow these cases to be propagated!
+ continue # inhibit the forwarding families pages to be updated.
if site == self.originPage.site():
if page != self.originPage:
self.problem(u"Found link to %s" % page.aslink(True) )
@@ -1572,6 +1588,7 @@
pywikibot.output(u" (%d) Found link to %s in:" % (i, page2.aslink(True)))
self.whereReport(page2, indent = 8)
while True:
+ #TODO: allow answer to repeat previous or go back after a mistake
answer = pywikibot.input(u"Which variant should be used? (<number>, [n]one, [g]ive up) ").lower()
if answer:
if answer == 'g':
@@ -1602,6 +1619,7 @@
if acceptall:
answer = 'a'
else:
+ #TODO: allow answer to repeat previous or go back after a mistake
answer = pywikibot.inputChoice(u'What should be done?', ['accept', 'reject', 'give up', 'accept all'], ['a', 'r', 'g', 'l'], 'a')
if answer == 'l': # accept all
acceptall = True
@@ -1668,6 +1686,7 @@
# Make sure new contains every page link, including the page we are processing
# replaceLinks will skip the site it's working on.
if self.originPage.site() not in new:
+ if not self.originPage.site().family.interwiki_forward: #TODO: make this possible as well.
new[self.originPage.site()] = self.originPage
#self.replaceLinks(self.originPage, new, True, bot)
@@ -1761,8 +1780,7 @@
if diff > 30*24*60:
smallWikiAllowed = True
else:
- pywikibot.output(
-u'NOTE: number of edits are restricted at %s'
+ pywikibot.output( u'NOTE: number of edits are restricted at %s'
% page.site().sitename())
# if we have an account for this site
@@ -1871,6 +1889,12 @@
# Avoid adding an iw link back to itself
del new[page.site()]
+ # Do not add interwiki links to foreign families that page.site() does not forward to
+ for stmp in new.keys():
+ if stmp.family != page.site().family:
+ if stmp.family.name != page.site().family.interwiki_forward:
+ del new[stmp]
+
# Put interwiki links into a map
old={}
for page2 in interwikis:
Modified: trunk/pywikipedia/titletranslate.py
===================================================================
--- trunk/pywikipedia/titletranslate.py 2010-12-06 02:14:54 UTC (rev 8761)
+++ trunk/pywikipedia/titletranslate.py 2010-12-06 11:26:20 UTC (rev 8762)
@@ -81,10 +81,10 @@
for newcode in codes:
x = None
if newcode in family.langs.keys():
- if ( page is None ) or ( newcode != sitelang ):
+ if ( page is None ) or ( ( newcode != sitelang ) and ( not family.interwiki_forwarded_from ) ):
x = pywikibot.Page(pywikibot.getSite(fam=family, code=newcode), newname)
-# elif newcode in family.interwiki_forwarded_from:
-# x = pywikibot.Page(pywikibot.getSite(fam=newcode, code=newcode), newname)
+ elif newcode in family.interwiki_forwarded_from:
+ x = pywikibot.Page(pywikibot.getSite(fam=newcode, code=newcode), newname)
else:
if pywikibot.verbose:
pywikibot.output(u"Ignoring the unknown language code %s" % newcode)
Revision: 8761
Author: purodha
Date: 2010-12-06 02:14:54 +0000 (Mon, 06 Dec 2010)
Log Message:
-----------
Cosmetic: few glitches from description and comments removed, better formatting of some long source lines.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-12-05 22:07:02 UTC (rev 8760)
+++ trunk/pywikipedia/interwiki.py 2010-12-06 02:14:54 UTC (rev 8761)
@@ -60,18 +60,6 @@
against the live wiki is using the warnfile.py
script.
- -quiet: Use this option to get less output
-
- -async: Put page on queue to be saved to wiki asynchronously. This
- enables loading pages during saving throtteling and gives a
- better performance.
- NOTE: For post-processing it always assumes that saving the
- the pages was sucessful.
-
- -summary: Set an additional action summary message for the edit. This
- could be used for further explainings of the bot action.
- This will only be used in non-autonomous mode.
-
Additionaly, these arguments can be used to restrict the bot to certain pages:
-namespace:n Number or name of namespace to process. The parameter can be
@@ -104,8 +92,22 @@
-lack: used as -lack:xx with xx a language code: only work on pages
without links to language xx. You can also add a number nn
like -lack:xx:nn, so that the bot only works on pages with
- at least n interwiki links (the default value for n is 1).
-
+ at least nn interwiki links (the default value for nn is 1).
+
+These arguments control miscellanous bot behaviour:
+
+ -quiet: Use this option to get less output
+
+ -async: Put page on queue to be saved to wiki asynchronously. This
+ enables loading pages during saving throtteling and gives a
+ better performance.
+ NOTE: For post-processing it always assumes that saving the
+ the pages was sucessful.
+
+ -summary: Set an additional action summary message for the edit. This
+ could be used for further explainings of the bot action.
+ This will only be used in non-autonomous mode.
+
These arguments are useful to provide hints to the bot:
-hint: used as -hint:de:Anweisung to give the robot a hint
@@ -206,7 +208,7 @@
-initialredirect work on its target if a redirect or category redirect is
entered on the command line or by a generator (note: without
- ending colon). It is recommended to use this option with
+ ending colon). It is recommended to use this option with the
-movelog pagegenerator.
-neverlink: used as -neverlink:xx where xx is a language code:
@@ -220,10 +222,10 @@
-ignorefile: similar to -ignore, except that the pages are taken from
the given file instead of the command line.
- -localright do not follow interwiki from other pages than the starting
- page. (Warning! Should be used very sparingly, only when
- you are sure you have first gotten the interwiki on the
- starting page exactly right).
+ -localright do not follow interwiki links from other pages than the
+ starting page. (Warning! Should be used very sparingly,
+ only when you are sure you have first gotten the interwiki
+ links on the starting page exactly right).
(note: without ending colon)
-hintsareright do not follow interwiki links to sites for which hints
@@ -248,23 +250,25 @@
-limittwo only update two pages - one in the local wiki (if logged-in)
and one in the top available one.
For example, if the local page has links to de and fr,
- this option will make sure that only local and de: (larger)
- site is updated. This option is useful to quickly set two
- way links without updating all of wiki's sites.
+ this option will make sure that only the local site and
+ the de: (larger) sites are updated. This option is useful
+ to quickly set two way links without updating all of
+ wiki families sites.
(note: without ending colon)
-whenneeded works like limittwo, but other languages are changed in the
following cases:
- * If there are no interwiki at all on the page
- * If an interwiki must be removed
- * If an interwiki must be changed and there has been a
- conflict for this page
+ * If there are no interwiki links at all on the page
+ * If an interwiki link must be removed
+ * If an interwiki link must be changed and there has been
+ a conflict for this page
Optionally, -whenneeded can be given an additional number
(for example -whenneeded:3), in which case other languages
will be changed if there are that number or more links to
change or add. (note: without ending colon)
The following arguments influence how many pages the bot works on at once:
+
-array: The number of pages the bot tries to be working on at once.
If the number of pages loaded is lower than this number,
a new set of pages is loaded from the starting wiki. The
@@ -297,8 +301,8 @@
to the interwiki-dumps subdirectory. The program will read it if invoked with
the "-restore" or "-continue" option, and finish all the subjects in that list.
After finishing the dump file will be deleted. To run the interwiki-bot on all
-pages on a language, run it with option "-start:!", and if it takes so long you
-have to break it off, use "-continue" next time.
+pages on a language, run it with option "-start:!", and if it takes so long
+ithat you have to break it off, use "-continue" next time.
"""
#
@@ -822,7 +826,7 @@
# to the original topic than pages found later on, after
# 3, 4, 5 or more interwiki hops.
- # Keeping this order is hence important to display ordered
+ # Keeping this order is hence important to display an ordered
# list of pages to the user when he'll be asked to resolve
# conflicts.
self.tree = {}
@@ -1130,17 +1134,24 @@
if linkedPage.namespace() in nsmatch:
return False
if globalvar.autonomous:
- pywikibot.output(u"NOTE: Ignoring link from page %s in namespace %i to page %s in namespace %i." % (linkingPage.aslink(True), linkingPage.namespace(), linkedPage.aslink(True), linkedPage.namespace()))
+ pywikibot.output(u"NOTE: Ignoring link from page %s in namespace %i to page %s in namespace %i."
+ % (linkingPage.aslink(True), linkingPage.namespace(),
+ linkedPage.aslink(True), linkedPage.namespace()))
# Fill up foundIn, so that we will not write this notice
self.foundIn[linkedPage] = [linkingPage]
return True
else:
preferredPage = self.getFoundInCorrectNamespace(linkedPage.site())
if preferredPage:
- pywikibot.output(u"NOTE: Ignoring link from page %s in namespace %i to page %s in namespace %i because page %s in the correct namespace has already been found." % (linkingPage.aslink(True), linkingPage.namespace(), linkedPage.aslink(True), linkedPage.namespace(), preferredPage.aslink(True)))
+ pywikibot.output(u"NOTE: Ignoring link from page %s in namespace %i to page %s in namespace %i because page %s in the correct namespace has already been found."
+ % (linkingPage.aslink(True), linkingPage.namespace(), linkedPage.aslink(True),
+ linkedPage.namespace(), preferredPage.aslink(True)))
return True
else:
- choice = pywikibot.inputChoice('WARNING: %s is in namespace %i, but %s is in namespace %i. Follow it anyway?' % (self.originPage.aslink(True), self.originPage.namespace(), linkedPage.aslink(True), linkedPage.namespace()), ['Yes', 'No', 'Add an alternative', 'give up'], ['y', 'n', 'a', 'g'])
+ choice = pywikibot.inputChoice('WARNING: %s is in namespace %i, but %s is in namespace %i. Follow it anyway?'
+ % (self.originPage.aslink(True), self.originPage.namespace(),
+ linkedPage.aslink(True), linkedPage.namespace()),
+ ['Yes', 'No', 'Add an alternative', 'give up'], ['y', 'n', 'a', 'g'])
if choice != 'y':
# Fill up foundIn, so that we will not ask again
self.foundIn[linkedPage] = [linkingPage]
@@ -1186,31 +1197,42 @@
"""
if globalvar.autonomous:
if self.originPage.isDisambig() and not page.isDisambig():
- pywikibot.output(u"NOTE: Ignoring link from disambiguation page %s to non-disambiguation %s" % (self.originPage.aslink(True), page.aslink(True)))
+ pywikibot.output(u"NOTE: Ignoring link from disambiguation page %s to non-disambiguation %s"
+ % (self.originPage.aslink(True), page.aslink(True)))
return (True, None)
elif not self.originPage.isDisambig() and page.isDisambig():
- pywikibot.output(u"NOTE: Ignoring link from non-disambiguation page %s to disambiguation %s" % (self.originPage.aslink(True), page.aslink(True)))
+ pywikibot.output(u"NOTE: Ignoring link from non-disambiguation page %s to disambiguation %s"
+ % (self.originPage.aslink(True), page.aslink(True)))
return (True, None)
else:
choice = 'y'
if self.originPage.isDisambig() and not page.isDisambig():
disambig = self.getFoundDisambig(page.site())
if disambig:
- pywikibot.output(u"NOTE: Ignoring non-disambiguation page %s for %s because disambiguation page %s has already been found." % (page.aslink(True), self.originPage.aslink(True), disambig.aslink(True)))
+ pywikibot.output(u"NOTE: Ignoring non-disambiguation page %s for %s because disambiguation page %s has already been found."
+ % (page.aslink(True), self.originPage.aslink(True), disambig.aslink(True)))
return (True, None)
else:
- choice = pywikibot.inputChoice(u'WARNING: %s is a disambiguation page, but %s doesn\'t seem to be one. Follow it anyway?' % (self.originPage.aslink(True), page.aslink(True)), ['Yes', 'No', 'Add an alternative', 'Give up'], ['y', 'n', 'a', 'g'])
+ choice = pywikibot.inputChoice(u'WARNING: %s is a disambiguation page, but %s doesn\'t seem to be one. Follow it anyway?'
+ % (self.originPage.aslink(True), page.aslink(True)),
+ ['Yes', 'No', 'Add an alternative', 'Give up'],
+ ['y', 'n', 'a', 'g'])
elif not self.originPage.isDisambig() and page.isDisambig():
nondisambig = self.getFoundNonDisambig(page.site())
if nondisambig:
- pywikibot.output(u"NOTE: Ignoring disambiguation page %s for %s because non-disambiguation page %s has already been found." % (page.aslink(True), self.originPage.aslink(True), nondisambig.aslink(True)))
+ pywikibot.output(u"NOTE: Ignoring disambiguation page %s for %s because non-disambiguation page %s has already been found."
+ % (page.aslink(True), self.originPage.aslink(True), nondisambig.aslink(True)))
return (True, None)
else:
- choice = pywikibot.inputChoice(u'WARNING: %s doesn\'t seem to be a disambiguation page, but %s is one. Follow it anyway?' % (self.originPage.aslink(True), page.aslink(True)), ['Yes', 'No', 'Add an alternative', 'Give up'], ['y', 'n', 'a', 'g'])
+ choice = pywikibot.inputChoice(u'WARNING: %s doesn\'t seem to be a disambiguation page, but %s is one. Follow it anyway?'
+ % (self.originPage.aslink(True), page.aslink(True)),
+ ['Yes', 'No', 'Add an alternative', 'Give up'],
+ ['y', 'n', 'a', 'g'])
if choice == 'n':
return (True, None)
elif choice == 'a':
- newHint = pywikibot.input(u'Give the alternative for language %s, not using a language code:' % page.site().language())
+ newHint = pywikibot.input(u'Give the alternative for language %s, not using a language code:'
+ % page.site().language())
alternativePage = pywikibot.Page(page.site(), newHint)
return (True, alternativePage)
elif choice == 'g':
@@ -1288,7 +1310,9 @@
if globalvar.skipauto:
dictName, year = page.autoFormat()
if dictName is not None:
- pywikibot.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)' % (self.originPage.site().language(), self.originPage.title(), page.site().language(),page.title(),dictName,year))
+ pywikibot.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)'
+ % (self.originPage.site().language(), self.originPage.title(),
+ page.site().language(),page.title(),dictName,year))
# Abort processing if the bot is running in autonomous mode.
if globalvar.autonomous:
@@ -2047,16 +2071,22 @@
if expectedPage != page:
try:
linkedPage = linkedPagesDict[expectedPage.site()]
- pywikibot.output(u"WARNING: %s: %s does not link to %s but to %s" % (page.site().family.name, page.aslink(True), expectedPage.aslink(True), linkedPage.aslink(True)))
+ pywikibot.output(u"WARNING: %s: %s does not link to %s but to %s"
+ % (page.site().family.name, page.aslink(True),
+ expectedPage.aslink(True), linkedPage.aslink(True)))
except KeyError:
- pywikibot.output(u"WARNING: %s: %s does not link to %s" % (page.site().family.name, page.aslink(True), expectedPage.aslink(True)))
+ pywikibot.output(u"WARNING: %s: %s does not link to %s"
+ % (page.site().family.name, page.aslink(True),
+ expectedPage.aslink(True)))
# Check for superfluous links
for linkedPage in linkedPages:
if linkedPage not in expectedPages:
# Check whether there is an alternative page on that language.
# In this case, it was already reported above.
if linkedPage.site() not in expectedSites:
- pywikibot.output(u"WARNING: %s: %s links to incorrect %s" % (page.site().family.name, page.aslink(True), linkedPage.aslink(True)))
+ pywikibot.output(u"WARNING: %s: %s links to incorrect %s"
+ % (page.site().family.name, page.aslink(True),
+ linkedPage.aslink(True)))
except (socket.error, IOError):
pywikibot.output(u'ERROR: could not report backlinks')
Revision: 8760
Author: purodha
Date: 2010-12-05 22:07:02 +0000 (Sun, 05 Dec 2010)
Log Message:
-----------
titletranslate() better documented, saner and slightly quicker code, made more flexible.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
trunk/pywikipedia/titletranslate.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2010-12-05 21:09:39 UTC (rev 8759)
+++ trunk/pywikipedia/interwiki.py 2010-12-05 22:07:02 UTC (rev 8760)
@@ -108,10 +108,9 @@
These arguments are useful to provide hints to the bot:
- -hint: used as -hint:de:Anweisung to give the robot a hint
- where to start looking for translations. This is only
- useful if you specify a single page to work on. If no
- text is given after the second ':', the name of the page
+ -hint: used as -hint:de:Anweisung to give the robot a hint
+ where to start looking for translations. If no text
+ is given after the second ':', the name of the page
itself is used as the title for the hint, unless the
-hintnobracket command line option (see there) is also
selected.
@@ -122,12 +121,15 @@
* 10: The 10 largest languages (sites with most
articles). Analogous for any other natural
number.
- * arab: All languages useing the Arabic alphabet.
+ * arab: All languages using the Arabic alphabet.
* cyril: All languages that use the Cyrillic alphabet.
* chinese: All Chinese dialects.
* latin: All languages using the Latin script.
* scand: All Scandinavian languages.
+ Languages and groups having the same page title can be
+ combined, as in -hint:5,scand,sr,pt:New_York
+
-hintfile: similar to -hint, except that hints are taken from the given
file, enclosed in [[]] each, instead of the command line.
Modified: trunk/pywikipedia/titletranslate.py
===================================================================
--- trunk/pywikipedia/titletranslate.py 2010-12-05 21:09:39 UTC (rev 8759)
+++ trunk/pywikipedia/titletranslate.py 2010-12-05 22:07:02 UTC (rev 8760)
@@ -13,14 +13,34 @@
import wikipedia as pywikibot
import date
-def translate(page, hints = None, auto = True, removebrackets = False):
+def _join_to_(result, join):
+ for x in join:
+ if x not in result:
+ result.append(x)
+
+def translate(page, hints = None, auto = True, removebrackets = False, site = None, family = None):
"""
Please comment your source code! --Daniel
Does some magic stuff. Returns a list of pages.
+
+ Goes through all entries in 'hints'. Returns a list of pages.
+
+ Entries for single page titles list those pages. Page titles for entries
+ such as "all:" or "xyz:" or "20:" are first built from the page title of
+ 'page' and then listed. When 'removebrackets' is True, a trailing pair of
+ brackets and the text between them is removed from the page title.
+ If 'auto' is true, known year and date page titles are autotranslated
+ to all known target languages and inserted into the list.
+
"""
result = []
- site = page.site()
+ if site is None and page:
+ site = page.site()
+ if family is None and site:
+ family = site.family
+ if site:
+ sitelang = site.language()
if hints:
for h in hints:
if ':' not in h:
@@ -33,9 +53,11 @@
# if given as -hint:xy or -hint:xy:, assume that there should
# be a page in language xy with the same title as the page
# we're currently working on ...
+ if page is None:
+ continue
ns = page.namespace()
if ns:
- newname = u'%s:%s' % (site.family.namespace('_default', ns),
+ newname = u'%s:%s' % (family.namespace('_default', ns),
page.titleWithoutNamespace())
else:
# article in the main namespace
@@ -43,45 +65,49 @@
# ... unless we do want brackets
if removebrackets:
newname = re.sub(re.compile(ur"\W*?\(.*?\)\W*?", re.UNICODE), u" ", newname)
- try:
- number = int(codes)
- codes = site.family.languages_by_size[:number]
- except ValueError:
- if codes == 'all':
- codes = site.family.languages_by_size
- elif codes in site.family.language_groups:
- codes = site.family.language_groups[codes]
- else:
- codes = codes.split(',')
+ codesplit = codes.split(',')
+ codes = []
+ for code in codesplit:
+ try:
+ number = int(code)
+ _join_to_(codes, family.languages_by_size[:number] )
+ except ValueError:
+ if code == 'all':
+ _join_to_(codes, family.languages_by_size )
+ elif code in family.language_groups:
+ _join_to_(codes, family.language_groups[code] )
+ elif code:
+ _join_to_(codes, [ code ] )
for newcode in codes:
- if newcode in site.languages():
- if newcode != site.language():
- x = pywikibot.Page(site.getSite(code=newcode), newname)
- if x not in result:
- result.append(x)
+ x = None
+ if newcode in family.langs.keys():
+ if ( page is None ) or ( newcode != sitelang ):
+ x = pywikibot.Page(pywikibot.getSite(fam=family, code=newcode), newname)
+# elif newcode in family.interwiki_forwarded_from:
+# x = pywikibot.Page(pywikibot.getSite(fam=newcode, code=newcode), newname)
else:
if pywikibot.verbose:
- pywikibot.output(u"Ignoring unknown language code %s"
- % newcode)
+ pywikibot.output(u"Ignoring the unknown language code %s" % newcode)
+ if x:
+ _join_to_(result, [ x ] )
# Autotranslate dates into all other languages, the rest will come from
# existing interwiki links.
- if auto:
+ if auto and page:
# search inside all dictionaries for this link
- dictName, value = date.getAutoFormat(page.site().language(),
- page.title())
+ dictName, value = date.getAutoFormat(sitelang, page.title())
if dictName:
if not (dictName == 'yearsBC' and
- page.site().language() in date.maxyearBC and
- value > date.maxyearBC[page.site().language()]) or \
+ sitelang in date.maxyearBC and
+ value > date.maxyearBC[sitelang]) or \
(dictName == 'yearsAD' and
- page.site().language() in date.maxyearAD and
- value > date.maxyearAD[page.site().language()]):
+ sitelang in date.maxyearAD and
+ value > date.maxyearAD[sitelang]):
pywikibot.output(
u'TitleTranslate: %s was recognized as %s with value %d'
% (page.title(), dictName, value))
for entryLang, entry in date.formats[dictName].iteritems():
- if entryLang != page.site().language():
+ if entryLang != sitelang:
if dictName == 'yearsBC' and \
entryLang in date.maxyearBC and \
value > date.maxyearBC[entryLang]:
@@ -94,9 +120,8 @@
newname = entry(value)
x = pywikibot.Page(
pywikibot.getSite(code=entryLang,
- fam=site.family), newname)
- if x not in result:
- result.append(x) # add new page
+ fam=family), newname)
+ _join_to_(result, [ x ] )
return result
bcDateErrors = [u'[[ko:%d년]]']
Revision: 8759
Author: purodha
Date: 2010-12-05 21:09:39 +0000 (Sun, 05 Dec 2010)
Log Message:
-----------
Make getLanguageLinks() deal correctly with forwarded interlanguage links,
such as on Meta, Commons, Incubator, Species, and the like.
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2010-12-05 18:46:44 UTC (rev 8758)
+++ trunk/pywikipedia/pywikibot/textlib.py 2010-12-05 21:09:39 UTC (rev 8759)
@@ -268,12 +268,24 @@
#-------------------------------------------------
# Functions dealing with interwiki language links
#-------------------------------------------------
-# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
-# interproject. These functions only deal with links to a
-# corresponding page in another language on the same project (e.g.,
-# Wikipedia, Wiktionary, etc.) in another language. They do not find
-# or change links to a different project, or any that are formatted
-# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM)
+# Note - MediaWiki supports several kinds of interwiki links; two kinds are
+# interlanguage links. We deal here with those kinds only.
+# A family has by definition only one kind of interlanguage links:
+# 1 - interlanguage links inside the own family.
+# They go to a corresponding page in another language in the same
+# family, such as from 'en.wikipedia' to 'pt.wikipedia', or from
+# 'es.wiktionary' to 'arz.wiktionary'.
+# Families with this kind have several language-specific sites.
+# They have their interwiki_forward attribute set to None
+# 2 - language links forwarding to another family.
+# They go to a corresponding page in another family, such as from
+# 'commons' to 'zh.wikipedia', or from 'incubator' to 'en.wikipedia'.
+# Families having those have one member only, and do not have
+# language-specific sites. The name of the target family of their
+# interlanguage links is kept in their interwiki_forward attribute.
+# These functions deal with links of these two kinds only. They
+# do not find or change links of other kinds, nor any that are formatted
+# as in-line interwiki links (e.g., "[[:es:Articulo]]").
def getLanguageLinks(text, insite=None, pageLink="[[]]", template_subpage=False):
"""
@@ -286,6 +298,10 @@
"""
if insite is None:
insite = pywikibot.getSite()
+ fam = insite.family
+ # when interwiki links forward to another family, retrieve pages & other info there
+ if fam.interwiki_forward:
+ fam = pywikibot.Family(fam.interwiki_forward)
result = {}
# Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
# and HTML comments
@@ -298,19 +314,21 @@
# interwiki link.
# NOTE: language codes are case-insensitive and only consist of basic latin
# letters and hyphens.
+ #TODO: currently, we do not have any, but BCP 47 allows digits, and underscores.
+ #TODO: There is no semantic difference between hyphens and underscores -> fold them.
interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
for lang, pagetitle in interwikiR.findall(text):
lang = lang.lower()
# Check if it really is in fact an interwiki link to a known
# language, or if it's e.g. a category tag or an internal link
- if lang in insite.family.obsolete:
- lang = insite.family.obsolete[lang]
- if lang in insite.validLanguageLinks():
+ if lang in fam.obsolete:
+ lang = fam.obsolete[lang]
+ if lang in fam.langs.keys():
if '|' in pagetitle:
# ignore text after the pipe
pagetitle = pagetitle[:pagetitle.index('|')]
# we want the actual page objects rather than the titles
- site = insite.getSite(code = lang)
+ site = pywikibot.getSite(code=lang, fam=fam)
try:
result[site] = pywikibot.Page(site, pagetitle, insite=insite)
except pywikibot.InvalidTitle: