Pywikipedia-svn

pywikipedia-svn@lists.wikimedia.org

5163 discussions

SVN: [6701] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 25 Apr '09

25 Apr '09

Revision: 6701 Author: nicdumz Date: 2009-04-25 08:35:16 +0000 (Sat, 25 Apr 2009) Log Message: ----------- Putting the easy to evaluate condition first Modified Paths: -------------- trunk/pywikipedia/interwiki.py Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-04-25 07:29:01 UTC (rev 6700) +++ trunk/pywikipedia/interwiki.py 2009-04-25 08:35:16 UTC (rev 6701) @@ -932,11 +932,11 @@ self.todo = PageTree() elif not globalvar.followredirect: wikipedia.output(u"NOTE: not following redirects.") - else: - if not (self.skipPage(page, redirectTargetPage, counter) or (page.site().family != redirectTargetPage.site().family)): - if self.addIfNew(redirectTargetPage, counter, page): - if config.interwiki_shownew: - wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True))) + elif page.site().family == redirectTargetPage.site().family \ + and not self.skipPage(page, redirectTargetPage, counter): + if self.addIfNew(redirectTargetPage, counter, page): + if config.interwiki_shownew: + wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True))) continue

1 0

SVN: [6700] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 25 Apr '09

25 Apr '09

Revision: 6700 Author: nicdumz Date: 2009-04-25 07:29:01 +0000 (Sat, 25 Apr 2009) Log Message: ----------- Using a new skipPage function instead of a long condition Modified Paths: -------------- trunk/pywikipedia/interwiki.py Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-04-25 07:16:09 UTC (rev 6699) +++ trunk/pywikipedia/interwiki.py 2009-04-25 07:29:01 UTC (rev 6700) @@ -716,6 +716,11 @@ counter.plus(page.site()) return True + def skipPage(self, page, target, counter): + return isIgnored(target) or \ + self.namespaceMismatch(page, target, counter) or \ + self.wiktionaryMismatch(target) + def namespaceMismatch(self, linkingPage, linkedPage, counter): """ Checks whether or not the given page has another namespace @@ -928,7 +933,7 @@ elif not globalvar.followredirect: wikipedia.output(u"NOTE: not following redirects.") else: - if not (self.isIgnored(redirectTargetPage) or self.namespaceMismatch(page, redirectTargetPage, counter) or self.wiktionaryMismatch(redirectTargetPage) or (page.site().family != redirectTargetPage.site().family)): + if not (self.skipPage(page, redirectTargetPage, counter) or (page.site().family != redirectTargetPage.site().family)): if self.addIfNew(redirectTargetPage, counter, page): if config.interwiki_shownew: wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True))) @@ -1001,7 +1006,7 @@ if linkedPage.site in self.hintedsites: wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) break - if not (self.isIgnored(linkedPage) or self.namespaceMismatch(page, linkedPage, counter) or self.wiktionaryMismatch(linkedPage)): + if not self.skipPage(page, linkedPage, counter): if globalvar.followinterwiki or page == self.originPage: if self.addIfNew(linkedPage, counter, page): # It is 
new. Also verify whether it is the second on the

1 0

SVN: [6699] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 25 Apr '09

25 Apr '09

Revision: 6699 Author: nicdumz Date: 2009-04-25 07:16:09 +0000 (Sat, 25 Apr 2009) Log Message: ----------- Use a continue in the only except: clause, to suppress a huge "else" branch. Modified Paths: -------------- trunk/pywikipedia/interwiki.py Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-04-25 07:10:09 UTC (rev 6698) +++ trunk/pywikipedia/interwiki.py 2009-04-25 07:16:09 UTC (rev 6699) @@ -945,76 +945,78 @@ iw = page.interwiki() except wikipedia.NoSuchSite: wikipedia.output(u"NOTE: site %s does not exist" % page.site()) - else: - (skip, alternativePage) = self.disambigMismatch(page, counter) - if skip: - wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True)) - self.done = PageTree() - iw = () - if alternativePage: - # add the page that was entered by the user - self.addIfNew(alternativePage, counter, None) + continue - if self.originPage == page: - self.untranslated = (len(iw) == 0) - if globalvar.untranslatedonly: - # Ignore the interwiki links. - iw = () - # FIXME: the filtered list generated in the condition is - # re-generated the lign after. - # And we only use the first item of that list. 
- elif globalvar.autonomous and [p for p in self.done.filter(page.site()) if p != page and p.exists() and not p.isRedirectPage()]: - - for p in self.done.filter(page.site()): - if p != page and p.exists() and \ - not p.isRedirectPage(): - otherpage = p - break - wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True))) - self.makeForcedStop(counter) - try: - f = codecs.open( - wikipedia.config.datafilepath('autonomous_problems.dat'), - 'a', 'utf-8') - f.write("* %s {Found more than one link for %s}" % (self.originPage.aslink(True), page.site())) - if config.interwiki_graph and config.interwiki_graph_url: - filename = interwiki_graph.getFilename(self.originPage, extension = config.interwiki_graph_formats[0]) - f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename)) - f.write("\n") - f.close() - # FIXME: What errors are we catching here? - # except: should be avoided!! - except: - #raise - wikipedia.output(u'File autonomous_problem.dat open or corrupted! Try again with -restore.') - sys.exit() + (skip, alternativePage) = self.disambigMismatch(page, counter) + if skip: + wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True)) + self.done = PageTree() + iw = () + if alternativePage: + # add the page that was entered by the user + self.addIfNew(alternativePage, counter, None) + + if self.originPage == page: + self.untranslated = (len(iw) == 0) + if globalvar.untranslatedonly: + # Ignore the interwiki links. 
iw = () - elif page.isEmpty() and not page.isCategory(): - wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True)) - # Ignore the interwiki links - self.done = PageTree() - iw = () - for linkedPage in iw: - if globalvar.hintsareright: - if linkedPage.site in self.hintedsites: - wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) - break - if not (self.isIgnored(linkedPage) or self.namespaceMismatch(page, linkedPage, counter) or self.wiktionaryMismatch(linkedPage)): - if globalvar.followinterwiki or page == self.originPage: - if self.addIfNew(linkedPage, counter, page): - # It is new. Also verify whether it is the second on the - # same site - lpsite=linkedPage.site() - for prevPage in self.foundIn.keys(): - if prevPage != linkedPage and prevPage.site() == lpsite: - # Still, this could be "no problem" as either may be a - # redirect to the other. No way to find out quickly! - wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) - break - else: - if config.interwiki_shownew: - wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + # FIXME: the filtered list generated in the condition is + # re-generated the lign after. + # And we only use the first item of that list. 
+ elif globalvar.autonomous and [p for p in self.done.filter(page.site()) if p != page and p.exists() and not p.isRedirectPage()]: + + for p in self.done.filter(page.site()): + if p != page and p.exists() and \ + not p.isRedirectPage(): + otherpage = p + break + wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True))) + self.makeForcedStop(counter) + try: + f = codecs.open( + wikipedia.config.datafilepath('autonomous_problems.dat'), + 'a', 'utf-8') + f.write("* %s {Found more than one link for %s}" % (self.originPage.aslink(True), page.site())) + if config.interwiki_graph and config.interwiki_graph_url: + filename = interwiki_graph.getFilename(self.originPage, extension = config.interwiki_graph_formats[0]) + f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename)) + f.write("\n") + f.close() + # FIXME: What errors are we catching here? + # except: should be avoided!! + except: + #raise + wikipedia.output(u'File autonomous_problem.dat open or corrupted! Try again with -restore.') + sys.exit() + iw = () + elif page.isEmpty() and not page.isCategory(): + wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True)) + # Ignore the interwiki links + self.done = PageTree() + iw = () + for linkedPage in iw: + if globalvar.hintsareright: + if linkedPage.site in self.hintedsites: + wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + break + if not (self.isIgnored(linkedPage) or self.namespaceMismatch(page, linkedPage, counter) or self.wiktionaryMismatch(linkedPage)): + if globalvar.followinterwiki or page == self.originPage: + if self.addIfNew(linkedPage, counter, page): + # It is new. 
Also verify whether it is the second on the + # same site + lpsite=linkedPage.site() + for prevPage in self.foundIn.keys(): + if prevPage != linkedPage and prevPage.site() == lpsite: + # Still, this could be "no problem" as either may be a + # redirect to the other. No way to find out quickly! + wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + break + else: + if config.interwiki_shownew: + wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + # These pages are no longer 'in progress' self.pending = PageTree() # Check whether we need hints and the user offered to give them

1 0

SVN: [6698] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 25 Apr '09

25 Apr '09

Revision: 6698 Author: nicdumz Date: 2009-04-25 07:10:09 +0000 (Sat, 25 Apr 2009) Log Message: ----------- Using 3 "continue" to abandon modifications on the current page, to avoid being in a huge else: for the main part of the code. Modified Paths: -------------- trunk/pywikipedia/interwiki.py Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-04-25 07:05:28 UTC (rev 6697) +++ trunk/pywikipedia/interwiki.py 2009-04-25 07:10:09 UTC (rev 6698) @@ -902,6 +902,7 @@ self.todo = PageTree() # In some rare cases it might be we already did check some 'automatic' links self.done = PageTree() + continue elif page.isRedirectPage(): redirectTargetPage = page.getRedirectTarget() @@ -932,82 +933,87 @@ if config.interwiki_shownew: wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True))) - elif not page.section(): - # Page exists, isnt a redirect, and is a plain link (no section) + continue - try: - iw = page.interwiki() - except wikipedia.NoSuchSite: - wikipedia.output(u"NOTE: site %s does not exist" % page.site()) - else: - (skip, alternativePage) = self.disambigMismatch(page, counter) - if skip: - wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True)) - self.done = PageTree() - iw = () - if alternativePage: - # add the page that was entered by the user - self.addIfNew(alternativePage, counter, None) + elif page.section(): + continue - if self.originPage == page: - self.untranslated = (len(iw) == 0) - if globalvar.untranslatedonly: - # Ignore the interwiki links. - iw = () - # FIXME: the filtered list generated in the condition is - # re-generated the lign after. - # And we only use the first item of that list. 
- elif globalvar.autonomous and [p for p in self.done.filter(page.site()) if p != page and p.exists() and not p.isRedirectPage()]: - - for p in self.done.filter(page.site()): - if p != page and p.exists() and \ - not p.isRedirectPage(): - otherpage = p - break - wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True))) - self.makeForcedStop(counter) - try: - f = codecs.open( - wikipedia.config.datafilepath('autonomous_problems.dat'), - 'a', 'utf-8') - f.write("* %s {Found more than one link for %s}" % (self.originPage.aslink(True), page.site())) - if config.interwiki_graph and config.interwiki_graph_url: - filename = interwiki_graph.getFilename(self.originPage, extension = config.interwiki_graph_formats[0]) - f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename)) - f.write("\n") - f.close() - # FIXME: What errors are we catching here? - # except: should be avoided!! - except: - #raise - wikipedia.output(u'File autonomous_problem.dat open or corrupted! Try again with -restore.') - sys.exit() + + # Page exists, isnt a redirect, and is a plain link (no section) + + try: + iw = page.interwiki() + except wikipedia.NoSuchSite: + wikipedia.output(u"NOTE: site %s does not exist" % page.site()) + else: + (skip, alternativePage) = self.disambigMismatch(page, counter) + if skip: + wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True)) + self.done = PageTree() + iw = () + if alternativePage: + # add the page that was entered by the user + self.addIfNew(alternativePage, counter, None) + + if self.originPage == page: + self.untranslated = (len(iw) == 0) + if globalvar.untranslatedonly: + # Ignore the interwiki links. 
iw = () - elif page.isEmpty() and not page.isCategory(): - wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True)) - # Ignore the interwiki links - self.done = PageTree() - iw = () - for linkedPage in iw: - if globalvar.hintsareright: - if linkedPage.site in self.hintedsites: - wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) - break - if not (self.isIgnored(linkedPage) or self.namespaceMismatch(page, linkedPage, counter) or self.wiktionaryMismatch(linkedPage)): - if globalvar.followinterwiki or page == self.originPage: - if self.addIfNew(linkedPage, counter, page): - # It is new. Also verify whether it is the second on the - # same site - lpsite=linkedPage.site() - for prevPage in self.foundIn.keys(): - if prevPage != linkedPage and prevPage.site() == lpsite: - # Still, this could be "no problem" as either may be a - # redirect to the other. No way to find out quickly! - wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) - break - else: - if config.interwiki_shownew: - wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + # FIXME: the filtered list generated in the condition is + # re-generated the lign after. + # And we only use the first item of that list. 
+ elif globalvar.autonomous and [p for p in self.done.filter(page.site()) if p != page and p.exists() and not p.isRedirectPage()]: + + for p in self.done.filter(page.site()): + if p != page and p.exists() and \ + not p.isRedirectPage(): + otherpage = p + break + wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True))) + self.makeForcedStop(counter) + try: + f = codecs.open( + wikipedia.config.datafilepath('autonomous_problems.dat'), + 'a', 'utf-8') + f.write("* %s {Found more than one link for %s}" % (self.originPage.aslink(True), page.site())) + if config.interwiki_graph and config.interwiki_graph_url: + filename = interwiki_graph.getFilename(self.originPage, extension = config.interwiki_graph_formats[0]) + f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename)) + f.write("\n") + f.close() + # FIXME: What errors are we catching here? + # except: should be avoided!! + except: + #raise + wikipedia.output(u'File autonomous_problem.dat open or corrupted! Try again with -restore.') + sys.exit() + iw = () + elif page.isEmpty() and not page.isCategory(): + wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True)) + # Ignore the interwiki links + self.done = PageTree() + iw = () + for linkedPage in iw: + if globalvar.hintsareright: + if linkedPage.site in self.hintedsites: + wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + break + if not (self.isIgnored(linkedPage) or self.namespaceMismatch(page, linkedPage, counter) or self.wiktionaryMismatch(linkedPage)): + if globalvar.followinterwiki or page == self.originPage: + if self.addIfNew(linkedPage, counter, page): + # It is new. 
Also verify whether it is the second on the + # same site + lpsite=linkedPage.site() + for prevPage in self.foundIn.keys(): + if prevPage != linkedPage and prevPage.site() == lpsite: + # Still, this could be "no problem" as either may be a + # redirect to the other. No way to find out quickly! + wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + break + else: + if config.interwiki_shownew: + wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) # These pages are no longer 'in progress' self.pending = PageTree()

1 0

SVN: [6697] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 25 Apr '09

25 Apr '09

Revision: 6697 Author: nicdumz Date: 2009-04-25 07:05:28 +0000 (Sat, 25 Apr 2009) Log Message: ----------- Putting page.exists(); page.isRedirect() tests before page.get() It has exactly the same behavior. It just removes a level of indentation. Cleaner code. Modified Paths: -------------- trunk/pywikipedia/interwiki.py Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-04-24 12:44:52 UTC (rev 6696) +++ trunk/pywikipedia/interwiki.py 2009-04-25 07:05:28 UTC (rev 6697) @@ -890,55 +890,55 @@ counter.minus(page.site()) # Now check whether any interwiki links should be added to the # todo list. - if page.section() and not page.isRedirectPage(): - # We have been referred to a part of a page, not the whole page. Do not follow references. - pass - else: - try: - iw = page.interwiki() - except wikipedia.IsRedirectPage, arg: - redirectTargetPage = wikipedia.Page(page.site(), arg.args[0]) - wikipedia.output(u"NOTE: %s is redirect to %s" % (page.aslink(True), redirectTargetPage.aslink(True))) - if page == self.originPage: - if globalvar.initialredirect: - self.originPage = redirectTargetPage - #XXX might not work if page.site != redirTar.site: - # We are appending an item to - # self.pending[redirTar.site] - # but we are iterating on self.pending at the same - # time. - # On the other hand... crosslanguage redirects? - self.pending.add(redirectTargetPage) - counter.plus(redirectTargetPage.site) - else: - # This is a redirect page to the origin. We don't need to - # follow the redirection. - # In this case we can also stop all hints! 
- for site, count in self.todo.siteCounts(): - counter.minus(site, count) - self.todo = PageTree() - elif not globalvar.followredirect: - wikipedia.output(u"NOTE: not following redirects.") + + + if not page.exists(): + wikipedia.output(u"NOTE: %s does not exist" % page.aslink(True)) + if page == self.originPage: + # The page we are working on is the page that does not exist. + # No use in doing any work on it in that case. + for site, count in self.todo.siteCounts(): + counter.minus(site, count) + self.todo = PageTree() + # In some rare cases it might be we already did check some 'automatic' links + self.done = PageTree() + + elif page.isRedirectPage(): + redirectTargetPage = page.getRedirectTarget() + wikipedia.output(u"NOTE: %s is redirect to %s" % (page.aslink(True), redirectTargetPage.aslink(True))) + if page == self.originPage: + if globalvar.initialredirect: + self.originPage = redirectTargetPage + #XXX might not work if page.site != redirTar.site: + # We are appending an item to + # self.pending[redirTar.site] + # but we are iterating on self.pending at the same + # time. + # On the other hand... crosslanguage redirects? + self.pending.add(redirectTargetPage) + counter.plus(redirectTargetPage.site) else: - if not (self.isIgnored(redirectTargetPage) or self.namespaceMismatch(page, redirectTargetPage, counter) or self.wiktionaryMismatch(redirectTargetPage) or (page.site().family != redirectTargetPage.site().family)): - if self.addIfNew(redirectTargetPage, counter, page): - if config.interwiki_shownew: - wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True))) - except wikipedia.NoPage: - wikipedia.output(u"NOTE: %s does not exist" % page.aslink(True)) - if page == self.originPage: - # The page we are working on is the page that does not exist. - # No use in doing any work on it in that case. + # This is a redirect page to the origin. We don't need to + # follow the redirection. 
+ # In this case we can also stop all hints! for site, count in self.todo.siteCounts(): counter.minus(site, count) self.todo = PageTree() - # In some rare cases it might be we already did check some 'automatic' links - self.done = PageTree() - pass + elif not globalvar.followredirect: + wikipedia.output(u"NOTE: not following redirects.") + else: + if not (self.isIgnored(redirectTargetPage) or self.namespaceMismatch(page, redirectTargetPage, counter) or self.wiktionaryMismatch(redirectTargetPage) or (page.site().family != redirectTargetPage.site().family)): + if self.addIfNew(redirectTargetPage, counter, page): + if config.interwiki_shownew: + wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True))) + + elif not page.section(): + # Page exists, isnt a redirect, and is a plain link (no section) + + try: + iw = page.interwiki() except wikipedia.NoSuchSite: wikipedia.output(u"NOTE: site %s does not exist" % page.site()) - #except wikipedia.SectionError: - # wikipedia.output(u"NOTE: section %s does not exist" % page.aslink()) else: (skip, alternativePage) = self.disambigMismatch(page, counter) if skip: @@ -976,6 +976,8 @@ f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename)) f.write("\n") f.close() + # FIXME: What errors are we catching here? + # except: should be avoided!! except: #raise wikipedia.output(u'File autonomous_problem.dat open or corrupted! Try again with -restore.')

1 0

SVN: [6696] branches/rewrite/pywikibot/pagegenerators.py
by russblau＠svn.wikimedia.org 24 Apr '09

24 Apr '09

Revision: 6696
Author: russblau
Date: 2009-04-24 12:44:52 +0000 (Fri, 24 Apr 2009)

Log Message:
-----------
Update DuplicateFilterPageGenerator as per recent changes to trunk

Modified Paths:
--------------
    branches/rewrite/pywikibot/pagegenerators.py

Modified: branches/rewrite/pywikibot/pagegenerators.py
===================================================================
--- branches/rewrite/pywikibot/pagegenerators.py 2009-04-24 06:46:34 UTC (rev 6695)
+++ branches/rewrite/pywikibot/pagegenerators.py 2009-04-24 12:44:52 UTC (rev 6696)
@@ -605,10 +605,13 @@
 
 def DuplicateFilterPageGenerator(generator):
     """Yield all unique pages from another generator, omitting duplicates."""
-    seenPages = set([])
+    seenPages = {}
     for page in generator:
         if page not in seenPages:
-            seenPages.add(page)
+            _page = u"%s:%s:%s" % (page._site.family.name,
+                                   page._site.code,
+                                   page._title)
+            seenPages[_page] = True
             yield page
 
 

1 0

SVN: [6695] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 24 Apr '09

24 Apr '09

Revision: 6695
Author: nicdumz
Date: 2009-04-24 06:46:34 +0000 (Fri, 24 Apr 2009)

Log Message:
-----------
PageTree bugfix: use iteritems()

Modified Paths:
--------------
    trunk/pywikipedia/interwiki.py

Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-24 06:38:24 UTC (rev 6694)
+++ trunk/pywikipedia/interwiki.py 2009-04-24 06:46:34 UTC (rev 6695)
@@ -545,11 +545,11 @@
         """
         Yields (Site, number of pages in site) pairs
         """
-        for site, d in self.tree:
+        for site, d in self.tree.iteritems():
             yield site, len(d)
 
     def __iter__(self):
-        for site, d in self.tree:
+        for site, d in self.tree.iteritems():
             for page in d:
                 yield page
 

1 0

SVN: [6694] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 24 Apr '09

24 Apr '09

Revision: 6694
Author: nicdumz
Date: 2009-04-24 06:38:24 +0000 (Fri, 24 Apr 2009)

Log Message:
-----------
PageTree internals : dict -> list

PageTree is never used to reference individual pages.
Only operations on tree[site] are:
 * len
 * add
No individual Page lookups? Then tree[site] does not need to be a dict. Using a list instead.

Modified Paths:
--------------
    trunk/pywikipedia/interwiki.py

Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-24 06:31:34 UTC (rev 6693)
+++ trunk/pywikipedia/interwiki.py 2009-04-24 06:38:24 UTC (rev 6694)
@@ -527,8 +527,8 @@
     def add(self, page):
         site = page.site()
         if not site in self.tree:
-            self.tree[site] = {}
-        self.tree[site][page] = True
+            self.tree[site] = []
+        self.tree[site].append(page)
         self.size += 1
 
     def removeSite(self, site):

1 0

SVN: [6693] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 24 Apr '09

24 Apr '09

Revision: 6693
Author: nicdumz
Date: 2009-04-24 06:31:34 +0000 (Fri, 24 Apr 2009)

Log Message:
-----------
PageTree.remove -> PageTree.removeSite

And now that remove() is only used once to remove all pages from a single site, use removeSite instead.

Modified Paths:
--------------
    trunk/pywikipedia/interwiki.py

Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-24 06:26:16 UTC (rev 6692)
+++ trunk/pywikipedia/interwiki.py 2009-04-24 06:31:34 UTC (rev 6693)
@@ -531,9 +531,15 @@
         self.tree[site][page] = True
         self.size += 1
 
-    def remove(self, page):
-        del self.tree[site][page]
-        self.size -= 1
+    def removeSite(self, site):
+        """
+        Removes all pages from Site site
+        """
+        try:
+            self.size -= len(self.tree[site])
+            del self.tree[site]
+        except KeyError:
+            pass
 
     def siteCounts(self):
         """
@@ -669,8 +675,8 @@
         for page in self.todo.filter(site):
             self.pending.add(page)
             result.append(page)
-        for page in self.pending.filter(site):
-            self.todo.remove(page)
+
+        self.todo.removeSite(site)
         # If there are any, return them. Otherwise, nothing is in progress.
         return result
 

1 0

SVN: [6692] trunk/pywikipedia/interwiki.py
by nicdumz＠svn.wikimedia.org 24 Apr '09

24 Apr '09

Revision: 6692 Author: nicdumz Date: 2009-04-24 06:26:16 +0000 (Fri, 24 Apr 2009) Log Message: ----------- Adding a PageTree structure to manipulate Page sets instead of using lists. This performs more efficiently when filtering these sets per Site. Modified Paths: -------------- trunk/pywikipedia/interwiki.py Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-04-24 05:38:49 UTC (rev 6691) +++ trunk/pywikipedia/interwiki.py 2009-04-24 06:26:16 UTC (rev 6692) @@ -502,6 +502,51 @@ nobackonly = False hintsareright = False +class PageTree(object): + """ + Structure to manipulate a set of pages. + Allows filtering efficiently by Site. + """ + def __init__(self): + self.tree = {} + self.size = 0 + + def filter(self, site): + """ + Iterates over pages that are in Site site + """ + try: + for page in self.tree[site]: + yield page + except KeyError: + pass + + def __len__(self): + return self.size + + def add(self, page): + site = page.site() + if not site in self.tree: + self.tree[site] = {} + self.tree[site][page] = True + self.size += 1 + + def remove(self, page): + del self.tree[site][page] + self.size -= 1 + + def siteCounts(self): + """ + Yields (Site, number of pages in site) pairs + """ + for site, d in self.tree: + yield site, len(d) + + def __iter__(self): + for site, d in self.tree: + for page in d: + yield page + class Subject(object): """ Class to follow the progress of a single 'subject' (i.e. a page with @@ -515,10 +560,12 @@ self.originPage = originPage # todo is a list of all pages that still need to be analyzed. # Mark the origin page as todo. - self.todo = [originPage] + self.todo = PageTree() + self.todo.add(originPage) + # done is a list of all pages that have been analyzed and that # are known to belong to this subject. - self.done = [] + self.done = PageTree() # foundIn is a dictionary where pages are keys and lists of # pages are values. 
It stores where we found each page. # As we haven't yet found a page that links to the origin page, we @@ -526,7 +573,7 @@ self.foundIn = {self.originPage:[]} # This is a list of all pages that are currently scheduled for # download. - self.pending = [] + self.pending = PageTree() if globalvar.hintsareright: # This is a set of sites that we got hits to self.hintedsites = set() @@ -544,8 +591,8 @@ first one will be returned. Otherwise, None will be returned. """ - for page in self.done + self.pending: - if page.site() == site: + for tree in [self.done, self.pending]: + for page in tree.filter(site): if page.exists() and page.isDisambig(): return page return None @@ -557,8 +604,8 @@ first one will be returned. Otherwise, None will be returned. """ - for page in self.done + self.pending: - if page.site() == site: + for tree in [self.done, self.pending]: + for page in tree.filter(site): if page.exists() and not page.isDisambig() and not page.isRedirectPage(): return page return None @@ -570,8 +617,8 @@ have been found, the first one will be returned. Otherwise, None will be returned. 
""" - for page in self.done + self.pending + self.todo: - if page.site() == site: + for tree in [self.done, self.pending, self.todo]: + for page in tree.filter(site): if page.namespace() == self.originPage.namespace(): if page.exists() and not page.isRedirectPage(): return page @@ -590,7 +637,7 @@ pages = titletranslate.translate(self.originPage, hints = hints, auto = globalvar.auto, removebrackets = globalvar.hintnobracket) for page in pages: - self.todo.append(page) + self.todo.add(page) self.foundIn[page] = [None] if keephintedsites: self.hintedsites.add(page.site) @@ -603,12 +650,8 @@ """ siteCount = {} - for page in self.todo: - site = page.site() - try: - siteCount[site] += 1 - except KeyError: - siteCount[site] = 1 + for site, count in self.todo.siteCounts(): + siteCount[site] = count return siteCount def willWorkOn(self, site): @@ -619,24 +662,25 @@ """ # Bug-check: Isn't there any work still in progress? We can't work on # different sites at a time! - if self.pending != []: + if len(self.pending) > 0: raise 'BUG: Can\'t start to work on %s; still working on %s' % (site, self.pending) # Prepare a list of suitable pages - for page in self.todo: - if page.site() == site: - self.pending.append(page) - for page in self.pending: + result = [] + for page in self.todo.filter(site): + self.pending.add(page) + result.append(page) + for page in self.pending.filter(site): self.todo.remove(page) # If there are any, return them. Otherwise, nothing is in progress. - return self.pending + return result def makeForcedStop(self,counter): """ Ends work on the page before the normal end. 
""" - for page in self.todo: - counter.minus(page.site()) - self.todo = [] + for site, count in self.todo.siteCounts(): + counter.minus(site, count) + self.todo = PageTree() self.forcedStop = True def addIfNew(self, page, counter, linkingPage): @@ -662,7 +706,7 @@ return False else: self.foundIn[page] = [linkingPage] - self.todo.append(page) + self.todo.add(page) counter.plus(page.site()) return True @@ -828,7 +872,7 @@ # Loop over all the pages that should have been taken care of for page in self.pending: # Mark the page as done - self.done.append(page) + self.done.add(page) # make sure that none of the linked items is an auto item if globalvar.skipauto: @@ -852,15 +896,21 @@ if page == self.originPage: if globalvar.initialredirect: self.originPage = redirectTargetPage - self.pending.append(redirectTargetPage) + #XXX might not work if page.site != redirTar.site: + # We are appending an item to + # self.pending[redirTar.site] + # but we are iterating on self.pending at the same + # time. + # On the other hand... crosslanguage redirects? + self.pending.add(redirectTargetPage) counter.plus(redirectTargetPage.site) else: # This is a redirect page to the origin. We don't need to # follow the redirection. # In this case we can also stop all hints! - for page2 in self.todo: - counter.minus(page2.site()) - self.todo = [] + for site, count in self.todo.siteCounts(): + counter.minus(site, count) + self.todo = PageTree() elif not globalvar.followredirect: wikipedia.output(u"NOTE: not following redirects.") else: @@ -873,10 +923,11 @@ if page == self.originPage: # The page we are working on is the page that does not exist. # No use in doing any work on it in that case. 
- for page2 in self.todo: - counter.minus(page2.site()) - self.todo = [] - self.done = [] # In some rare cases it might be we already did check some 'automatic' links + for site, count in self.todo.siteCounts(): + counter.minus(site, count) + self.todo = PageTree() + # In some rare cases it might be we already did check some 'automatic' links + self.done = PageTree() pass except wikipedia.NoSuchSite: wikipedia.output(u"NOTE: site %s does not exist" % page.site()) @@ -886,8 +937,7 @@ (skip, alternativePage) = self.disambigMismatch(page, counter) if skip: wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True)) - if page in self.done: #XXX: Ugly bugfix - the following line has reportedly thrown "ValueError: list.remove(x): x not in list" - self.done.remove(page) + self.done = PageTree() iw = () if alternativePage: # add the page that was entered by the user @@ -898,11 +948,14 @@ if globalvar.untranslatedonly: # Ignore the interwiki links. iw = () - elif globalvar.autonomous and page.site() in [p.site() for p in self.done if p != page and p.exists() and not p.isRedirectPage()]: + # FIXME: the filtered list generated in the condition is + # re-generated the lign after. + # And we only use the first item of that list. 
+ elif globalvar.autonomous and [p for p in self.done.filter(page.site()) if p != page and p.exists() and not p.isRedirectPage()]: - for p in self.done: - if p.site() == page.site() and p != page \ - and p.exists() and not p.isRedirectPage(): + for p in self.done.filter(page.site()): + if p != page and p.exists() and \ + not p.isRedirectPage(): otherpage = p break wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True))) @@ -925,8 +978,7 @@ elif page.isEmpty() and not page.isCategory(): wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True)) # Ignore the interwiki links - if page in self.done: #XXX: Ugly bugfix - the following line has reportedly thrown "ValueError: list.remove(x): x not in list" - self.done.remove(page) + self.done = PageTree() iw = () for linkedPage in iw: if globalvar.hintsareright: @@ -950,7 +1002,7 @@ wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) # These pages are no longer 'in progress' - self.pending = [] + self.pending = PageTree() # Check whether we need hints and the user offered to give them if self.untranslated and not self.hintsAsked: self.reportInterwikilessPage(page) @@ -1581,9 +1633,9 @@ except KeyError: self.counts[site] = count - def minus(self, site): + def minus(self, site, count=1): """This is a routine that the Subject class expects in a counter""" - self.counts[site] -= 1 + self.counts[site] -= count def run(self): """Start the process until finished"""

1 0

Jump to page:

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

Pywikipedia-svn