Revision: 4260 Author: wikipedian Date: 2007-09-12 22:12:11 +0000 (Wed, 12 Sep 2007)
Log Message: ----------- created methods to correctly set hyphens in ISBNs fixed ISBN-10 -> ISBN-13 conversion activated ISBN hyphen setting in cosmetic_changes
Modified Paths: -------------- trunk/pywikipedia/cosmetic_changes.py trunk/pywikipedia/isbn.py
Modified: trunk/pywikipedia/cosmetic_changes.py =================================================================== --- trunk/pywikipedia/cosmetic_changes.py 2007-09-12 19:54:07 UTC (rev 4259) +++ trunk/pywikipedia/cosmetic_changes.py 2007-09-12 22:12:11 UTC (rev 4260) @@ -25,7 +25,7 @@ all of them, but be careful if you do. """ __version__ = '$Id$' -import wikipedia, pagegenerators +import wikipedia, pagegenerators, isbn import sys import re
@@ -91,6 +91,7 @@ text = self.resolveHtmlEntities(text) text = self.validXhtml(text) text = self.removeUselessSpaces(text) + text = isbn.hyphenateIsbnNumbers(text) if self.debug: wikipedia.showDiff(oldText, text) return text @@ -280,6 +281,7 @@
return text
+ def cleanUpSectionHeaders(self, text): """ For better readability of section header source code, puts a space
Modified: trunk/pywikipedia/isbn.py =================================================================== --- trunk/pywikipedia/isbn.py 2007-09-12 19:54:07 UTC (rev 4259) +++ trunk/pywikipedia/isbn.py 2007-09-12 22:12:11 UTC (rev 4260) @@ -2,52 +2,1162 @@ # -*- coding: utf-8 -*-
""" -This bot goes over multiple pages of the home wiki, and converts all ISBN-10 -codes to the ISBN-13 format. +This script goes over multiple pages of the home wiki, and reports invalid +ISBN numbers.
-This script understands various command-line arguments: +Additionally, it can convert all ISBN-10 codes to the ISBN-13 format, and +correct the ISBN format by placing hyphens.
- -start: used as -start:page_name, specifies that the robot should - go alphabetically through all pages on the home wiki, - starting at the named page. +These command line parameters can be used to specify which pages to work on:
- -file: used as -file:file_name, read a list of pages to treat - from the named textfile. Page titles should be enclosed - in [[double-squared brackets]]. +&params;
- -ref: used as -start:page_name, specifies that the robot should - touch all pages referring to the named page. +-namespace:n Number or name of namespace to process. The parameter can be + used multiple times. It works in combination with all other + parameters, except for the -start parameter. If you e.g. + want to iterate over all categories starting at M, use + -start:Category:M.
- -links: used as -links:page_name, specifies that the robot should - touch all pages referred to from the named page. +Furthermore, the following command line parameters are supported:
- -cat: used as -cat:category_name, specifies that the robot should - touch all pages in the named category. +-to13 Converts all ISBN-10 codes to ISBN-13. + NOTE: This needn't be done, as MediaWiki still supports + (and will keep supporting) ISBN-10, and all libraries and + bookstores will most likely do so as well.
-All other parameters will be regarded as a page title; in this case, the bot -will only touch a single page. +-format Corrects the hyphenation. + NOTE: This is in here for testing purposes only. Usually + it's not worth creating an edit for such a minor issue. + The recommended way of doing this is enabling + cosmetic_changes, so that these changes are made on-the-fly + to all pages that are modified.
-##################################################################### -# ATTENTION # -##################################################################### -# The ISBN-13 standard is scheduled to be used as of 2007-01-01. # -# I expect that some libraries and online bookstores will have some # -# problems with the conversion, so we shouldn't start changing # -# ISBNs at new year's eve. # -# As most online bookstores and library catalogs will probably be # -# backwards compatible, we shouldn't change anything before all # -# major sites have managed the conversion process. # -##################################################################### +-always Don't prompt you for each replacement.
-The bot code is unfinished, although the ISBN conversion code seems -to work fine already. --Daniel """
__version__='$Id$'
import wikipedia, pagegenerators, catlib, config -import sys +import sys, re
+docuReplacements = { + '&params;': pagegenerators.parameterHelp, +} + +# Summary messages in different languages +msg = { + 'de': 'Bot: Formatiere ISBN', + 'en': 'Robot: Formatting ISBN', +} + +# Maps each group number to the list of its publisher number ranges. +# Taken from http://www.isbn-international.org/converter/ranges.htm +ranges = { + '0': [ # English speaking area + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '949999'), + ('9500000', '9999999'), + ], + '1': [ # English speaking area + ('00', '09'), + ('100', '399'), + ('4000', '5499'), + ('55000', '86979'), + ('869800', '998999'), + ], + '2': [ # French speaking area + ('00', '19'), + ('200', '349'), + ('35000', '39999'), + ('400', '699'), + ('7000', '8399'), + ('84000', '89999'), + ('900000', '949999'), + ('9500000', '9999999'), + ], + '3': [ # German speaking area + ('00', '02'), + ('030', '033'), + ('0340', '0369'), + ('03700', '03999'), + ('04', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '949999'), + ('9500000', '9999999'), + ], + '4': [ # Japan + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '949999'), + ('9500000', '9999999'), + ], + '5': [ # Russian Federation + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '909999'), + ('91000', '91999'), + ('9200', '9299'), + ('93000', '94999'), + ('9500', '9799'), + ('98000', '98999'), + ('9900000', '9909999'), + ('9910', '9999'), + ], + '600': [ # Iran + ('00', '09'), + ('100', '499'), + ('5000', '8999'), + ('90000', '99999'), + ], + '601': [ # Kazakhstan + ('00', '19'), + ('200', '699'), + ('7000', '7999'), + ('80000', '84999'), + ('85', '99'), + ], + '602': [ # Indonesia + ('00', '19'), + ('200', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '603': [ # Saudi Arabia + ('00', '04'), + ('500', '799'), + ('8000', '8999'), + ('90000', '99999'), + ], + '604': [ # Vietnam + ('0', '4'), + ('50', 
'89'), + ('900', '979'), + ('9800', '9999'), + ], + '605': [ # Turkey + ('00', '09'), + ('100', '399'), + ('4000', '5999'), + ('60000', '89999'), + ], + '7': [ # China, People's Republic + ('00', '09'), + ('100', '499'), + ('5000', '7999'), + ('80000', '89999'), + ('900000', '999999'), + ], + '80': [ # Czech Republic; Slovakia + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '999999'), + ], + '81': [ # India + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '999999'), + ], + '82': [ # Norway + ('00', '19'), + ('200', '699'), + ('7000', '8999'), + ('90000', '98999'), + ('990000', '999999'), + ], + '83': [ # Poland + ('00', '19'), + ('200', '599'), + ('60000', '69999'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '999999'), + ], + '84': [ # Spain + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('9000', '9199'), + ('920000', '923999'), + ('92400', '92999'), + ('930000', '949999'), + ('95000', '96999'), + ('9700', '9999'), + ], + '85': [ # Brazil + ('00', '19'), + ('200', '599'), + ('60000', '69999'), + ('7000', '8499'), + ('85000', '89999'), + ('900000', '979999'), + ('98000', '99999'), + ], + '86': [ # Serbia and Montenegro + ('00', '29'), + ('300', '599'), + ('6000', '7999'), + ('80000', '89999'), + ('900000', '999999'), + ], + '87': [ # Denmark + ('00', '29'), + ('400', '649'), + ('7000', '7999'), + ('85000', '94999'), + ('970000', '999999'), + ], + '88': [ # Italian speaking area + ('00', '19'), + ('200', '599'), + ('6000', '8499'), + ('85000', '89999'), + ('900000', '949999'), + ('95000', '99999'), + ], + '89': [ # Korea + ('00', '24'), + ('250', '549'), + ('5500', '8499'), + ('85000', '94999'), + ('950000', '999999'), + ], + '90': [ # Netherlands, Belgium (Flemish) + ('00', '19'), + ('200', '499'), + ('5000', '6999'), + ('70000', '79999'), + ('800000', '849999'), + ('8500', '8999'), + ('900000', '909999'), + ('940000', '949999'), + ], + '91': [ 
# Sweden + ('0', '1'), + ('20', '49'), + ('500', '649'), + ('7000', '7999'), + ('85000', '94999'), + ('970000', '999999'), + ], + '92': [ # International Publishers (Unesco, EU), European Community Organizations + ('0', '5'), + ('60', '79'), + ('800', '899'), + ('9000', '9499'), + ('95000', '98999'), + ('990000', '999999'), + ], + '93': [ # India - no ranges fixed yet + ], + '950': [ # Argentina + ('00', '49'), + ('500', '899'), + ('9000', '9899'), + ('99000', '99999'), + ], + '951': [ # Finland + ('0', '1'), + ('20', '54'), + ('550', '889'), + ('8900', '9499'), + ('95000', '99999'), + ], + '952': [ # Finland + ('00', '19'), + ('200', '499'), + ('5000', '5999'), + ('60', '65'), + ('6600', '6699'), + ('67000', '69999'), + ('7000', '7999'), + ('80', '94'), + ('9500', '9899'), + ('99000', '99999'), + ], + '953': [ # Croatia + ('0', '0'), + ('10', '14'), + ('150', '549'), + ('55000', '59999'), + ('6000', '9499'), + ('95000', '99999'), + ], + '954': [ # Bulgaria + ('00', '29'), + ('300', '799'), + ('8000', '8999'), + ('90000', '92999'), + ('9300', '9999'), + ], + '955': [ # Sri Lanka + ('0', '0'), + ('1000', '1999'), + ('20', '54'), + ('550', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '956': [ # Chile + ('00', '19'), + ('200', '699'), + ('7000', '9999'), + ], + '957': [ # Taiwan, China + ('00', '02'), + ('0300', '0499'), + ('05', '19'), + ('2000', '2099'), + ('21', '27'), + ('28000', '30999'), + ('31', '43'), + ('440', '819'), + ('8200', '9699'), + ('97000', '99999'), + ], + '958': [ # Colombia + ('00', '59'), + ('600', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '959': [ # Cuba + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ], + '960': [ # Greece + ('00', '19'), + ('200', '659'), + ('6600', '6899'), + ('690', '699'), + ('7000', '8499'), + ('85000', '99999'), + ], + '961': [ # Slovenia + ('00', '19'), + ('200', '599'), + ('6000', '8999'), + ('90000', '94999'), + ], + '962': [ # Hong Kong + ('00', '19'), + ('200', '699'), + ('7000', 
'8499'), + ('85000', '86999'), + ('8700', '8999'), + ('900', '999'), + ], + '963': [ # Hungary + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('9000', '9999'), + ], + '964': [ # Iran + ('00', '14'), + ('150', '249'), + ('2500', '2999'), + ('300', '549'), + ('5500', '8999'), + ('90000', '96999'), + ('970', '989'), + ('9900', '9999'), + ], + '965': [ # Israel + ('00', '19'), + ('200', '599'), + ('7000', '7999'), + ('90000', '99999'), + ], + '966': [ # Ukraine + ('00', '19'), + ('2000', '2999'), + ('300', '699'), + ('7000', '8999'), + ('90000', '99999'), + ], + '967': [ # Malaysia + ('00', '29'), + ('300', '499'), + ('5000', '5999'), + ('60', '89'), + ('900', '989'), + ('9900', '9989'), + ('99900', '99999'), + ], + '968': [ # Mexico + ('01', '39'), + ('400', '499'), + ('5000', '7999'), + ('800', '899'), + ('9000', '9999'), + ], + '969': [ # Pakistan + ('0', '1'), + ('20', '39'), + ('400', '799'), + ('8000', '9999'), + ], + '970': [ # Mexico + ('01', '59'), + ('600', '899'), + ('9000', '9099'), + ('91000', '96999'), + ('9700', '9999'), + ], + '971': [ #Philippines? 
+ ('000', '019'), + ('02', '02'), + ('0300', '0599'), + ('06', '09'), + ('10', '49'), + ('500', '849'), + ('8500', '9099'), + ('91000', '99999'), + ], + '972': [ # Portugal + ('0', '1'), + ('20', '54'), + ('550', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '973': [ # Romania + ('0', '0'), + ('100', '169'), + ('1700', '1999'), + ('20', '54'), + ('550', '759'), + ('7600', '8499'), + ('85000', '88999'), + ('8900', '9499'), + ('95000', '99999'), + ], + '974': [ # Thailand + ('00', '19'), + ('200', '699'), + ('7000', '8499'), + ('85000', '89999'), + ('90000', '94999'), + ('9500', '9999'), + ], + '975': [ # Turkey + ('00000', '00999'), + ('01', '24'), + ('250', '599'), + ('6000', '9199'), + ('92000', '98999'), + ('990', '999'), + ], + '976': [ # Caribbean Community + ('0', '3'), + ('40', '59'), + ('600', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '977': [ # Egypr + ('00', '19'), + ('200', '499'), + ('5000', '6999'), + ('700', '999'), + ], + '978': [ # Nigeria + ('000', '199'), + ('2000', '2999'), + ('30000', '79999'), + ('8000', '8999'), + ('900', '999'), + ], + '979': [ # Indonesia + ('000', '099'), + ('1000', '1499'), + ('15000', '19999'), + ('20', '29'), + ('3000', '3999'), + ('400', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '980': [ # Venezuela + ('00', '19'), + ('200', '599'), + ('6000', '9999'), + ], + '981': [ # Singapore + ('00', '19'), + ('200', '299'), + ('3000', '9999'), + ], + '982': [ # South Pacific + ('00', '09'), + ('100', '699'), + ('70', '89'), + ('9000', '9999'), + ], + '983': [ # Malaysia + ('00', '01'), + ('020', '199'), + ('2000', '3999'), + ('40000', '44999'), + ('45', '49'), + ('50', '79'), + ('800', '899'), + ('9000', '9899'), + ('99000', '99999'), + ], + '984': [ # Bangladesh + ('00', '39'), + ('400', '799'), + ('8000', '8999'), + ('90000', '99999'), + ], + '985': [ # Belarus + ('00', '39'), + ('400', '599'), + ('6000', '8999'), + ('90000', '99999'), + ], + '986': [ # Taiwan, China + ('00', '11'), + 
('120', '559'), + ('5600', '7999'), + ('80000', '99999'), + ], + '987': [ # Argentina + ('00', '09'), + ('1000', '1999'), + ('20000', '29999'), + ('30', '49'), + ('500', '899'), + ('9000', '9499'), + ('95000', '99999'), + ], + '988': [ # Hongkong + ('00', '16'), + ('17000', '19999'), + ('200', '799'), + ('8000', '9699'), + ('97000', '99999'), + ], + '989': [ # Portugal + ('0', '1'), + ('20', '54'), + ('550', '799'), + ('8000', '9499'), + ('95000', '99999'), + ], + '9937': [ # Nepal + ('0', '2'), + ('30', '49'), + ('500', '799'), + ('8000', '9999'), + ], + '9938': [ # Tunisia + ('00', '79'), + ('800', '949'), + ('9500', '9999'), + ], + '9939': [ # Armenia + ('0', '4'), + ('50', '79'), + ('800', '899'), + ('9000', '9999'), + ], + '9940': [ # Montenegro + ('0', '1'), + ('20', '49'), + ('500', '899'), + ('9000', '9999'), + ], + '9941': [ # Georgia + ('0', '0'), + ('10', '39'), + ('400', '899'), + ('9000', '9999'), + ], + '9942': [ # Ecuador + ('00', '89'), + ('900', '994'), + ('9950', '9999'), + ], + '9943': [ # Uzbekistan + ('00', '29'), + ('300', '399'), + ('4000', '9999'), + ], + '9944': [ # Turkey + ('0', '2'), + ('300', '499'), + ('5000', '5999'), + ('60', '89'), + ('900', '999'), + ], + '9945': [ # Dominican Republic + ('00', '00'), + ('010', '079'), + ('08', '39'), + ('400', '569'), + ('57', '57'), + ('580', '849'), + ('8500', '9999'), + ], + '9946': [ # Korea, P.D.R. 
+ ('0', '1'), + ('20', '39'), + ('400', '899'), + ('9000', '9999'), + ], + '9947': [ # Algeria + ('0', '1'), + ('20', '79'), + ('800', '999'), + ], + '9948': [ # United Arab Emirates + ('00', '39'), + ('400', '849'), + ('8500', '9999'), + ], + '9949': [ # Estonia + ('0', '0'), + ('10', '39'), + ('400', '899'), + ('9000', '9999'), + ], + '9950': [ # Palestine + ('00', '29'), + ('300', '840'), + ('8500', '9999'), + ], + '9951': [ # Kosova + ('00', '39'), + ('400', '849'), + ('8500', '9999'), + ], + '9952': [ # Azerbaijan + ('0', '1'), + ('20', '39'), + ('400', '799'), + ('8000', '9999'), + ], + '9953': [ # Lebanon + ('0', '0'), + ('10', '39'), + ('400', '599'), + ('60', '89'), + ('9000', '9999'), + ], + '9954': [ # Morocco + ('0', '1'), + ('20', '39'), + ('400', '799'), + ('8000', '9999'), + ], + '9955': [ # Lithuania + ('00', '39'), + ('400', '929'), + ('9300', '9999'), + ], + '9956': [ # Cameroon + ('0', '0'), + ('10', '39'), + ('400', '899'), + ('9000', '9999'), + ], + '9957': [ # Jordan + ('00', '39'), + ('400', '699'), + ('70', '84'), + ('8500', '9999'), + ], + '9958': [ # Bosnia and Herzegovina + ('0', '0'), + ('10', '49'), + ('500', '899'), + ('9000', '9999'), + ], + '9959': [ # Libya + ('0', '1'), + ('20', '79'), + ('800', '949'), + ('9500', '9999'), + ], + '9960': [ # Saudi Arabia + ('00', '59'), + ('600', '899'), + ('9000', '9999'), + ], + '9961': [ # Algeria + ('0', '2'), + ('30', '69'), + ('700', '949'), + ('9500', '9999'), + ], + '9962': [ # Panama + ('00', '54'), + ('5500', '5599'), + ('56', '59'), + ('600', '849'), + ('8500', '9999'), + ], + '9963': [ # Cyprus + ('0', '2'), + ('30', '54'), + ('550', '749'), + ('7500', '9999'), + ], + '9964': [ # Ghana + ('0', '6'), + ('70', '94'), + ('950', '999'), + ], + '9965': [ # Kazakhstan + ('00', '39'), + ('400', '899'), + ('9000', '9999'), + ], + '9966': [ # Kenya + ('00', '69'), + ('7000', '7499'), + ('750', '959'), + ('9600', '9999'), + ], + '9967': [ # Kyrgyzstan + ('00', '39'), + ('400', '899'), + ('9000', 
'9999'), + ], + '9968': [ # Costa Rica + ('00', '49'), + ('500', '939'), + ('9400', '9999'), + ], + '9970': [ # Uganda + ('00', '39'), + ('400', '899'), + ('9000', '9999'), + ], + '9971': [ # Singapore + ('0', '5'), + ('60', '89'), + ('900', '989'), + ('9900', '9999'), + ], + '9972': [ # Peru + ('00', '09'), + ('1', '1'), + ('200', '249'), + ('2500', '2999'), + ('30', '59'), + ('600', '899'), + ('9000', '9999'), + ], + '9973': [ # Tunisia + ('0', '05'), + ('060', '089'), + ('0900', '0999'), + ('10', '69'), + ('700', '969'), + ('9700', '9999'), + ], + '9974': [ # Uruguay + ('0', '2'), + ('30', '54'), + ('550', '749'), + ('7500', '9499'), + ('95', '99'), + ], + '9975': [ # Moldova + ('0', '0'), + ('100', '399'), + ('4000', '4499'), + ('45', '89'), + ('900', '949'), + ('9500', '9999'), + ], + '9976': [ # Tanzania + ('0', '5'), + ('60', '89'), + ('900', '989'), + ('9990', '9999'), + ], + '9977': [ # Costa Rica + ('00', '89'), + ('900', '989'), + ('9900', '9999'), + ], + '9978': [ # Ecuador + ('00', '29'), + ('300', '399'), + ('40', '94'), + ('950', '989'), + ('9900', '9999'), + ], + '9979': [ # Iceland + ('0', '4'), + ('50', '64'), + ('650', '659'), + ('66', '75'), + ('760', '899'), + ('9000', '9999'), + ], + '9980': [ # Papua New Guinea + ('0', '3'), + ('40', '89'), + ('900', '989'), + ('9900', '9999'), + ], + '9981': [ # Morocco + ('00', '09'), + ('100', '159'), + ('1600', '1999'), + ('20', '79'), + ('800', '949'), + ('9500', '9999'), + ], + '9982': [ # Zambia + ('00', '79'), + ('800', '989'), + ('9900', '9999'), + ], + '9983': [ # Gambia + ('80', '94'), + ('950', '989'), + ('9900', '9999'), + ], + '9984': [ # Latvia + ('00', '49'), + ('500', '899'), + ('9000', '9999'), + ], + '9985': [ # Estonia + ('0', '4'), + ('50', '79'), + ('800', '899'), + ('9000', '9999'), + ], + '9986': [ # Lithuania + ('00', '39'), + ('400', '899'), + ('9000', '9399'), + ('940', '969'), + ('97', '99'), + ], + '9987': [ # Tanzania + ('00', '39'), + ('400', '879'), + ('8800', '9999'), + ], + 
'9988': [ # Ghana + ('0', '2'), + ('30', '54'), + ('550', '749'), + ('7500', '9999'), + ], + '9989': [ # Macedonia + ('0', '0'), + ('100', '199'), + ('2000', '2999'), + ('30', '59'), + ('600', '949'), + ('9500', '9999'), + ], + '99901': [ # Bahrain + ('00', '49'), + ('500', '799'), + ('80', '99'), + ], + '99902': [ # Gabon - no ranges fixed yet + ], + '99903': [ # Mauritius + ('0', '1'), + ('20', '89'), + ('900', '999'), + ], + '99904': [ # Netherlands Antilles; Aruba, Neth. Ant + ('0', '5'), + ('60', '89'), + ('900', '999'), + ], + '99905': [ # Bolivia + ('0', '3'), + ('40', '79'), + ('800', '999'), + ], + '99906': [ # Kuwait + ('0', '2'), + ('30', '59'), + ('600', '699'), + ('70', '89'), + ('9', '9'), + ], + '99908': [ # Malawi + ('0', '0'), + ('10', '89'), + ('900', '999'), + ], + '99909': [ # Malta + ('0', '3'), + ('40', '94'), + ('950', '999'), + ], + '99910': [ # Sierra Leone + ('0', '2'), + ('30', '89'), + ('900', '999'), + ], + '99911': [ # Lesotho + ('00', '59'), + ('600', '999'), + ], + '99912': [ # Botswana + ('0', '3'), + ('400', '599'), + ('60', '89'), + ('900', '999'), + ], + '99913': [ # Andorra + ('0', '2'), + ('30', '35'), + ('600', '604'), + ], + '99914': [ # Suriname + ('0', '4'), + ('50', '89'), + ('900', '949'), + ], + '99915': [ # Maldives + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99916': [ # Namibia + ('0', '2'), + ('30', '69'), + ('700', '999'), + ], + '99917': [ # Brunei Darussalam + ('0', '2'), + ('30', '89'), + ('900', '999'), + ], + '99918': [ # Faroe Islands + ('0', '3'), + ('40', '79'), + ('800', '999'), + ], + '99919': [ # Benin + ('0', '2'), + ('40', '69'), + ('900', '999'), + ], + '99920': [ # Andorra + ('0', '4'), + ('50', '89'), + ('900', '999'), + ], + '99921': [ # Qatar + ('0', '1'), + ('20', '69'), + ('700', '799'), + ('8', '8'), + ('90', '99'), + ], + '99922': [ # Guatemala + ('0', '3'), + ('40', '69'), + ('700', '999'), + ], + '99923': [ # El Salvador + ('0', '1'), + ('20', '79'), + ('800', '999'), + ], + 
'99924': [ # Nicaragua + ('0', '2'), + ('30', '79'), + ('800', '999'), + ], + '99925': [ # Paraguay + ('0', '3'), + ('40', '79'), + ('800', '999'), + ], + '99926': [ # Honduras + ('0', '0'), + ('10', '59'), + ('600', '999'), + ], + '99927': [ # Albania + ('0', '2'), + ('30', '59'), + ('600', '999'), + ], + '99928': [ # Georgia + ('0', '0'), + ('10', '79'), + ('800', '999'), + ], + '99929': [ # Mongolia + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99930': [ # Armenia + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99931': [ # Seychelles + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99932': [ # Malta + ('0', '0'), + ('10', '59'), + ('600', '699'), + ('7', '7'), + ('80', '99'), + ], + '99933': [ # Nepal + ('0', '2'), + ('30', '59'), + ('600', '999'), + ], + '99934': [ # Dominican Republic + ('0', '1'), + ('20', '79'), + ('800', '999'), + ], + '99935': [ # Haiti + ('0', '2'), + ('7', '8'), + ('30', '59'), + ('600', '699'), + ('90', '99'), + ], + '99936': [ # Bhutan + ('0', '0'), + ('10', '59'), + ('600', '999'), + ], + '99937': [ # Macau + ('0', '1'), + ('20', '59'), + ('600', '999'), + ], + '99938': [ # Srpska + ('0', '1'), + ('20', '59'), + ('600', '899'), + ('90', '99'), + ], + '99939': [ # Guatemala + ('0', '5'), + ('60', '89'), + ('900', '999'), + ], + '99940': [ # Georgia + ('0', '0'), + ('10', '69'), + ('700', '999'), + ], + '99941': [ # Armenia + ('0', '2'), + ('30', '79'), + ('800', '999'), + ], + '99942': [ # Sudan + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99943': [ # Alsbania + ('0', '2'), + ('30', '59'), + ('600', '999'), + ], + '99944': [ # Ethiopia + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99945': [ # Namibia + ('0', '5'), + ('60', '89'), + ('900', '999'), + ], + '99946': [ # Nepal + ('0', '2'), + ('30', '59'), + ('600', '999'), + ], + '99947': [ # Tajikistan + ('0', '2'), + ('30', '69'), + ('700', '999'), + ], + '99948': [ # Eritrea + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99949': [ 
# Mauritius + ('0', '1'), + ('20', '89'), + ('900', '999'), + ], + '99950': [ # Cambodia + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99951': [ # Congo - no ranges fixed yet + ], + '99952': [ # Mali + ('0', '4'), + ('50', '79'), + ('800', '999'), + ], + '99953': [ # Paraguay + ('0', '2'), + ('30', '79'), + ('800', '999'), + ], + '99954': [ # Bolivia + ('0', '2'), + ('30', '69'), + ('700', '999'), + ], + '99955': [ # Srpska + ('0', '1'), + ('20', '59'), + ('600', '899'), + ('90', '99'), + ], + '99956': [ # Albania + ('00', '59'), + ('600', '999'), + ], +} + class IsbnBot: def __init__(self, generator): self.generator = generator @@ -65,64 +1175,65 @@ except wikipedia.LockedPage: print "Page %s is locked?!" % page.aslink()
-def mainBackup(): - #page generator - gen = None - # If the user chooses to work on a single page, this temporary array is - # used to read the words from the page title. The words will later be - # joined with spaces to retrieve the full title. - pageTitle = [] - for arg in wikipedia.handleArgs(): - if arg.startswith('-start:'): - page = wikipedia.Page(wikipedia.getSite(), arg[7:]) - gen = pagegenerators.AllpagesPageGenerator(page.titleWithoutNamespace(), namespace = page.namespace()) - elif arg.startswith('-ref:'): - referredPage = wikipedia.Page(wikipedia.getSite(), arg[5:]) - gen = pagegenerators.ReferringPageGenerator(referredPage) - elif arg.startswith('-links:'): - linkingPage = wikipedia.Page(wikipedia.getSite(), arg[7:]) - gen = pagegenerators.LinkedPageGenerator(linkingPage) - elif arg.startswith('-file:'): - gen = pagegenerators.TextfilePageGenerator(arg[6:]) - elif arg.startswith('-cat:'): - cat = catlib.Category(wikipedia.getSite(), arg[5:]) - gen = pagegenerators.CategorizedPageGenerator(cat) - # TODO: add -xml: option - else: - pageTitle.append(arg)
- if pageTitle: - # work on a single page - page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle)) - gen = iter([page]) - if not gen: - wikipedia.showHelp('isbn') - else: - preloadingGen = pagegenerators.PreloadingGenerator(gen) - bot = IsbnBot(preloadingGen) - bot.run() - class InvalidIsbnException(wikipedia.Error): """Invalid ISBN""" + def __init__(self, message): + self.message = message
-class ISBN13: - def __init__(self, isbn10): +class ISBN: + """ + Abstract superclass + """ + + def format(self): """ - Creates a 13-digit ISBN from a 10-digit ISBN by prefixing the GS1 - prefix '978' and recalculating the checksum. - The hyphenation structure is taken from the format of the original - ISBN number. - TODO: Find out if there is a feasible way to fix hyphenation mistakes. - Seems to be difficult, as each country/region can set its own - hyphenation rules. However, the converter at isbn.org knows where to - place hyphens. + Puts hyphens into this ISBN number. """ - self.isbn10 = isbn10 - self.isbn10.checkValidity() - self.code = '978-' + self.isbn10.code[:-1] - cs = self.calculateChecksum() - self.code += str(cs) - + result = '' + rest = '' + for digit in self.digits(): + rest += str(digit) + # Determine the prefix (if any) + for prefix in self.possiblePrefixes(): + if rest.startswith(prefix): + result += prefix + '-' + rest = rest[len(prefix):] + + # Determine the group + for groupNumber in ranges.iterkeys(): + if rest.startswith(groupNumber): + result += groupNumber + '-' + rest = rest[len(groupNumber):] + publisherRanges = ranges[groupNumber] + break + else: + raise InvalidIsbnException('ISBN %s: group number unknown.' % self.code) + + # Determine the publisher + for (start, end) in publisherRanges: + length = len(start) # NOTE: start and end always have equal length + if rest[:length] > start and rest[:length] <= end: + result += rest[:length] + '-' + rest = rest[length:] + break + else: + raise InvalidIsbnException('ISBN %s: publisher number unknown.' % self.code) + + # The rest is the item number and the 1-digit checksum. 
+ result += rest[:-1] + '-' + rest[-1] + self.code = result + +class ISBN13(ISBN): + def __init__(self, code, checksumMissing = False): + self.code = code + if checksumMissing: + self.code += str(self.calculateChecksum()) + self.checkValidity() + + def possiblePrefixes(self): + return ['978', '979'] + def digits(self): """ Returns a list of the digits in the ISBN code. @@ -132,24 +1243,32 @@ if c.isdigit(): result.append(int(c)) elif c != '-': - raise InvalidIsbnException('The ISBN contains invalid characters.') + raise InvalidIsbnException('The ISBN %s contains invalid characters.' % self.code) return result - + + def checkValidity(self): + if len(self.digits()) != 13: + raise InvalidIsbnException('The ISBN %s is not 13 digits long.' % self.code) + if self.calculateChecksum() != self.digits()[-1]: + raise InvalidIsbnException('The ISBN checksum of %s is incorrect.' % self.code) + def calculateChecksum(self): # See http://en.wikipedia.org/wiki/ISBN#Check_digit_in_ISBN_13 sum = 0 - for i in range(0, len(self.digits()), 2): + for i in range(0, 13 - 1, 2): sum += self.digits()[i] - for i in range(1, len(self.digits()), 2): + for i in range(1, 13 - 1, 2): sum += 3 * self.digits()[i] - return 10 - (sum % 10) - - -class ISBN10: + return (10 - (sum % 10)) % 10 + +class ISBN10(ISBN): def __init__(self, code): self.code = code self.checkValidity()
+ def possiblePrefixes(self): + return [] + def digits(self): """ Returns a list of the digits and Xs in the ISBN code. @@ -159,7 +1278,7 @@ if c.isdigit() or c == 'X': result.append(c) elif c != '-': - raise InvalidIsbnException('The ISBN contains invalid characters.') + raise InvalidIsbnException('The ISBN %s contains invalid characters.' % self.code) return result
def checkChecksum(self): @@ -177,24 +1296,192 @@ lastDigit = self.digits()[-1] #print lastDigit if not ((checksum == 10 and lastDigit == 'X') or (lastDigit.isdigit() and checksum == int(lastDigit))): - raise InvalidIsbnException('The checksum is incorrect.') + raise InvalidIsbnException('The ISBN checksum of %s is incorrect.' % self.code)
def checkValidity(self): if len(self.digits()) != 10: - raise InvalidIsbnException('The ISBN is not 10 digits long.') + raise InvalidIsbnException('The ISBN %s is not 10 digits long.' % self.code) if 'X' in self.digits()[:-1]: - raise InvalidIsbnException('X is only allowed at the end of the ISBN.') + raise InvalidIsbnException('ISBN %s: X is only allowed at the end of the ISBN.' % self.code) self.checkChecksum()
+ def toISBN13(self): + """ + Creates a 13-digit ISBN from this 10-digit ISBN by prefixing the GS1 + prefix '978' and recalculating the checksum. + The hyphenation structure is taken from the format of the original + ISBN number. + """ + code = '978-' + self.code[:-1] + + #cs = self.calculateChecksum() + #code += str(cs) + return ISBN13(code, checksumMissing = True) + +def getIsbn(code): + try: + i = ISBN13(code) + except InvalidIsbnException, e13: + try: + i = ISBN10(code) + except InvalidIsbnException, e10: + raise InvalidIsbnException(u'ISBN-13: %s / ISBN-10: %s' % (e13.message, e10.message)) + return i + +def _hyphenateIsbnNumber(match): + """ + Helper function to deal with a single ISBN + """ + code = match.group('code') + try: + i = getIsbn(code) + except InvalidIsbnException: + # don't change + return code + i.format() + return i.code + +def hyphenateIsbnNumbers(text): + isbnR = re.compile(r'(?<=ISBN )(?P<code>[\d-]+X?)') + text = isbnR.sub(_hyphenateIsbnNumber, text) + return text + +def _isbn10toIsbn13(match): + """ + Helper function to deal with a single ISBN + """ + code = match.group('code') + try: + i = getIsbn(code) + except InvalidIsbnException: + # don't change + return code + i.format() + return i.code + +def convertIsbn10toIsbn13(text): + isbnR = re.compile(r'(?<=ISBN )(?P<code>[\d-]+X?)') + text = isbnR.sub(_isbn10toIsbn13, text) + return text + +class IsbnBot: + + def __init__(self, generator, to13 = False, format = False, always = False): + self.generator = generator + self.to13 = to13 + self.format = format + self.always = always + self.isbnR = re.compile(r'(?<=ISBN )(?P<code>[\d-]+X?)') + + def treat(self, page): + try: + oldText = page.get() + for match in self.isbnR.finditer(oldText): + code = match.group('code') + try: + getIsbn(code) + except InvalidIsbnException, e: + wikipedia.output(e.message) + + newText = oldText + if self.to13: + newText = self.isbnR.sub(_isbn10toIsbn13, newText) + if self.format: + newText = 
self.isbnR.sub(_hyphenateIsbnNumber, newText) + self.save(page, newText) + except wikipedia.NoPage: + wikipedia.output(u"Page %s does not exist?!" % page.aslink()) + except wikipedia.IsRedirectPage: + wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink()) + except wikipedia.LockedPage: + wikipedia.output(u"Page %s is locked?!" % page.aslink()) + + def save(self, page, text): + if text != page.get(): + # Show the title of the page we're working on. + # Highlight the title in purple. + wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) + wikipedia.showDiff(page.get(), text) + if not self.always: + choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'Always yes'], ['y', 'N', 'a'], 'N') + if choice == 'n': + return + elif choice == 'a': + self.always = True + + if self.always: + try: + page.put(text) + except wikipedia.EditConflict: + wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),)) + except wikipedia.SpamfilterError, e: + wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) + except wikipedia.LockedPage: + wikipedia.output(u'Skipping %s (locked page)' % (page.title(),)) + else: + # Save the page in the background. No need to catch exceptions. + page.put_async(text) + + + def run(self): + comment = wikipedia.translate(wikipedia.getSite(), msg) + wikipedia.setAction(comment) + + for page in self.generator: + self.treat(page) + + def main(): - # test code - isbn10 = ISBN10(u'3-86640-001-2') - isbn13 = ISBN13(isbn10) - print isbn13.code + #page generator + gen = None + # This temporary array is used to read the page title if one single + # page to work on is specified by the arguments. + pageTitle = [] + # Which namespaces should be processed? 
+ # default to [] which means all namespaces will be processed + namespaces = [] + # This factory is responsible for processing command line arguments + # that are also used by other scripts and that determine on which pages + # to work on. + genFactory = pagegenerators.GeneratorFactory() + # Never ask before changing a page + always = False + to13 = False + format = False
+ for arg in wikipedia.handleArgs(): + if arg.startswith('-namespace:'): + try: + namespaces.append(int(arg[11:])) + except ValueError: + namespaces.append(arg[11:]) + elif arg == '-always': + always = True + elif arg == '-to13': + to13 = True + elif arg == '-format': + format = True + else: + generator = genFactory.handleArg(arg) + if generator: + gen = generator + else: + pageTitle.append(arg) + + if pageTitle: + page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle)) + gen = iter([page]) + if not gen: + wikipedia.showHelp('isbn') + else: + if namespaces != []: + gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) + preloadingGen = pagegenerators.PreloadingGenerator(gen) + bot = IsbnBot(preloadingGen, to13 = to13, format = format, always = always) + bot.run() + if __name__ == "__main__": try: main() finally: wikipedia.stopme() -