Hi,
I am new to the list. I work on huwiki and have the following problem:
When I write the command
solve_disambiguation.py -just -pos:"39-es főút" "39-es számú főút"
the bot keeps answering:
Possibility 39-es fńút does not actually exist. Use it anyway? ([y]es, [N]o)
Getting references to [[39-es számú fńút]]
Found 0 references.
Note that ő has become ń.
What can I do? Ő is a Hungarian letter, and I really-really need it. :-)
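I suspect the encoding used when the command line is decoded. A minimal check I can think of (assuming config.console_encoding exists, as it seems to in current pywikipedia):

    import wikipedia, config
    # Which encoding does the framework assume for the console?
    print config.console_encoding
    # Round trip: this should print "39-es főút" if the encoding is right
    wikipedia.output(u'39-es f\u0151\u00fat')

Setting console_encoding = 'utf-8' in user-config.py might already help, but I am not sure.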
Bináris
Revision: 4957
Author: filnik
Date: 2008-01-31 17:23:45 +0000 (Thu, 31 Jan 2008)
Log Message:
-----------
Some bugfixes, some rewrite, adding some comments (but the script still needs a better docu)
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2008-01-31 16:32:39 UTC (rev 4956)
+++ trunk/pywikipedia/checkimages.py 2008-01-31 17:23:45 UTC (rev 4957)
@@ -315,6 +315,10 @@
def __init__(self, site, logFulNumber = 25000):
self.site = site
self.logFulNumber = logFulNumber
+ self.settings = wikipedia.translate(site, page_with_settings)
+ self.rep_page = wikipedia.translate(site, report_page)
+ self.rep_text = wikipedia.translate(site, report_text)
+ self.com = wikipedia.translate(site, comm10)
def general(self, newtext, image, notification, head, botolist):
""" This class can be called for two reason. So I need two different __init__, one with common data
and another with the data that I required... maybe it can be added on the other function, but in this way
@@ -349,7 +353,7 @@
if imagedata == list():
wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image)
repme = "\n*[[:Image:%s]] seems to have problems ('''no data found in the image''')"
- self.report_image(rep_page, self.image, com, repme)
+ self.report_image(self.image, self.rep_page, self.com, repme)
# We have a problem! Report and exit!
return False
try:
@@ -358,7 +362,7 @@
wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image)
repme = "\n*[[:Image:%s]] seems to have problems ('''no data found in the image''')"
# We have a problem! Report and exit!
- self.report_image(rep_page, self.image, com, repme)
+ self.report_image(self.image, self.rep_page, self.com, repme)
return False
luser = wikipedia.url2link(nick, self.site, self.site)
pagina_discussione = "%s:%s" % (self.site.namespace(3), luser)
@@ -420,7 +424,7 @@
elif second_text == False:
talk_page.put(testoattuale + head + notification, comment = commentox, minorEdit = False)
- def untaggedGenerator(self, untaggedProject, rep_page, com):
+ def untaggedGenerator(self, untaggedProject):
lang = untaggedProject.split('.', 1)[0]
project = '.%s' % untaggedProject.split('.', 1)[1]
if lang == 'commons':
@@ -457,6 +461,7 @@
#continue
def checkImage(self, image):
+ self.image = image
# Search regular expression to find links like this (and the class attribute is optional too)
# title="Immagine:Nvidia.jpg"
wikipedia.output(u'Checking if %s is on commons...' % image)
@@ -472,17 +477,22 @@
return False
elif 'stemma' in image.lower() and self.site.lang == 'it':
wikipedia.output(u'%s has "stemma" inside, means that it\'s ok.' % image)
- return False
+ return True # Problems? No, it's just not on Commons, but the image still needs a check
else:
repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]"
- self.report_image(rep_page, image, com, repme)
+ self.report_image(self.image, self.rep_page, self.com, repme)
# Problems? No, return True
return True
else:
# Problems? No, return True
return True
-
- def report_image(self, rep_page, image, com, rep):
+ def report_image(self, image, rep_page = None, com = None, rep_text = None):
+ if rep_page == None:
+ rep_page = self.rep_page
+ if com == None:
+ com = self.com
+ if rep_text == None:
+ rep_text = self.rep_text
another_page = wikipedia.Page(self.site, rep_page)
if another_page.exists():
@@ -498,10 +508,10 @@
y = n.search(text_get, pos)
if y == None:
# Adding the log :)
- if "\'\'\'Commons\'\'\'" in rep:
- rep_text = rep % (image, image)
+ if "\'\'\'Commons\'\'\'" in rep_text:
+ rep_text = rep_text % (image, image)
else:
- rep_text = rep % image
+ rep_text = rep_text % image
another_page.put(text_get + rep_text, comment = com, minorEdit = False)
wikipedia.output(u"...Reported...")
reported = True
@@ -511,11 +521,11 @@
reported = False
return reported
- def takesettings(self, settings):
+ def takesettings(self):
pos = 0
- if settings == None: lista = None
+ if self.settings == None: lista = None
else:
- x = wikipedia.Page(self.site, settings)
+ x = wikipedia.Page(self.site, self.settings)
lista = list()
try:
testo = x.get()
@@ -724,10 +734,6 @@
nn = wikipedia.translate(site, nothing_notification)
dels = wikipedia.translate(site, del_comm)
smwl = wikipedia.translate(site, second_message_without_license)
- settings = wikipedia.translate(site, page_with_settings)
- rep_page = wikipedia.translate(site, report_page)
- rep_text = wikipedia.translate(site, report_text)
- com = wikipedia.translate(site, comm10)
TextFind = wikipedia.translate(site, txt_find)
hiddentemplate = wikipedia.translate(site, HiddenTemplate)
# A template as {{en is not a license! Adding also them in the whitelist template...
@@ -768,7 +774,7 @@
mainClass = main(site)
# Untagged is True? Let's take that generator
if untagged == True:
- generator = mainClass.untaggedGenerator(projectUntagged, rep_page, com)
+ generator = mainClass.untaggedGenerator(projectUntagged)
normal = False # Ensure that normal is False
# Normal True? Take the default generator
if normal == True:
@@ -790,7 +796,7 @@
# Ok, We (should) have a generator, so let's go on.
try:
# Take the additional settings for the Project
- tupla_written = mainClass.takesettings(settings)
+ tupla_written = mainClass.takesettings()
except wikipedia.Error:
# Error? Settings = None
wikipedia.output(u'Problems with loading the settigs, run without them.')
@@ -798,19 +804,12 @@
some_problem = False
# Ensure that if the list given is empty it will be converted to "None"
# (but it should be already done in the takesettings() function)
- if tupla_written == []:
- tupla_written = None
- if tupla_written != None:
- wikipedia.output(u'\t >> Loaded the real-time page... <<')
- # Save the settings not to lose them (FixMe: Make that part better)
- # The name is to avoid mistakes when the same bot is run in multiple projects.
- filename = "settings-%s.data" % str(site).replace(':', '-')
- f = file(filename, 'w')
- cPickle.dump(tupla_written, f)
- f.close()
- else:
- # No settings found, No problem, continue.
- wikipedia.output(u'\t >> No additional settings found! <<')
+ if tupla_written == []: tupla_written = None
+ # Real-Time page loaded
+ if tupla_written != None: wikipedia.output(u'\t >> Loaded the real-time page... <<')
+ # No settings found, No problem, continue.
+ else: wikipedia.output(u'\t >> No additional settings found! <<')
+ # Not the main, but the most important loop.
for image in generator:
# If I don't inizialize the generator, wait part and skip part are useless
if wait:
@@ -830,11 +829,13 @@
if skip == True:
# If the images to skip are more the images to check, make them the same number
if skip_number > limit: skip_number = limit
+ # Print a starting message only if no images have been skipped
if skip_list == []:
if skip_number == 1:
wikipedia.output(u'Skipping the first image:\n')
else:
wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
+ # If we still have pages to skip:
if len(skip_list) < skip_number:
wikipedia.output(u'Skipping %s...' % imageName)
skip_list.append(imageName)
@@ -843,25 +844,24 @@
skip = False
continue
else:
- wikipedia.output('1\n')
+ wikipedia.output('') # Print a blank line.
skip = False
- elif skip_list == []:
+ elif skip_list == []: # skip must be False if we get here but
+ # the user has set 0 images to skip
wikipedia.output(u'\t\t>> No images to skip...<<')
skip_list.append('skip = Off') # Only to print it once
+ # Check on commons if there's already an image with the same name
if commonsActive == True:
response = mainClass.checkImage(imageName)
if response == False:
continue
- if tupla_written != None:
- f = file(filename)
- tuplaList = cPickle.load(f)
- parentesi = False
+ parentesi = False # 'parentesi' is Italian for brackets: { ( ) } []
delete = False
tagged = False
- extension = imageName.split('.')[-1]
+ extension = imageName.split('.')[-1] # get the extension from the image's name
# Page => ImagePage
p = wikipedia.ImagePage(site, image.title())
- # Skip deleted images
+ # Get the text of the image description page (called g)
try:
g = p.get()
except wikipedia.NoPage:
@@ -870,33 +870,40 @@
except wikipedia.IsRedirectPage:
wikipedia.output(u"The file description for %s is a redirect?!" % imageName )
continue
+ # Is the image already tagged? If yes, no need to double-check, skip
for i in TextFind:
+ # If '{{' is present use a regex, otherwise don't (without the '{{' it may
+ # not be a template and the regex would be wrong)
if '{{' in i:
regexP = re.compile('\{\{(?:template|)%s ?(?:\||\n|\}) ?' % i.split('{{')[1].replace(' ', '[ _]'), re.I)
result = regexP.findall(g)
if result != []:
tagged = True
elif i.lower() in g:
- tagged = True
+ tagged = True
+ # Delete the useless template from the description (don't worry: before
+ # anything is added to the image page the original text will be reloaded).
for l in hiddentemplate:
if tagged == False:
res = re.findall(r'\{\{(?:[Tt]emplate:|)%s(?: \n|\||\n|\})' % l.lower(), g.lower())
if res != []:
- #print res
wikipedia.output(u'A white template found, skipping the template...')
- # I don't delete the template, because if there is something to change the image page
- # will be reloaded. I delete it only for the next check part.
- if l != '' and l != ' ':
+ if l != '' and l != ' ': # Check that l is not empty or a single space
+ # Deleting! (replace the template with nothing)
g = g.lower().replace('{{%s' % l, '')
- for a_word in something:
+ for a_word in something: # something is the array with {{, MIT License and so on.
if a_word in g:
+ # There's a template, probably a license (or I hope so)
parentesi = True
+ # Is the extension allowed? (is it an image or e.g. an .xls file?)
for parl in notallowed:
if parl.lower() in extension.lower():
delete = True
- some_problem = False
+ some_problem = False # If it has "some_problem" it must check
+ # the additional settings.
+ # if tupla_written is set, use the additional settings
if tupla_written != None:
- for tupla in tuplaList:
+ for tupla in tupla_written:
name = tupla[1]
find_tipe = tupla[2]
find = tupla[3]
@@ -916,7 +923,6 @@
text = text % imageName
mexCatched = tupla[8]
wikipedia.setAction(summary)
- del tupla[0:8]
for k in find_list:
if find_tipe.lower() == 'findonly':
if k.lower() == g.lower():
@@ -938,9 +944,12 @@
summary_used = summary
mex_used = mexCatched
continue
+ # If the image exists (maybe it has been deleted during the other
+ # checking parts or something, who knows? ;-))
if p.exists():
# Here begins the check block.
if tagged == True:
+ # Tagged? Yes, skip.
printWithTimeZone(u'%s is already tagged...' % imageName)
continue
if some_problem == True:
@@ -951,7 +960,7 @@
if mex_used.lower() == 'default':
mex_used = unvertext
if imagestatus_used == False:
- reported = mainClass.report_image(rep_page, imageName, com, rep_text)
+ reported = mainClass.report_image(imageName)
else:
reported = True
if reported == True:
@@ -1014,4 +1023,3 @@
wikipedia.stopme()
finally:
wikipedia.stopme()
- sys.exit() # Be sure that the Bot will stop
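The core of this rewrite, condensed into a self-contained sketch (the names are illustrative, not the actual checkimages.py API): the per-run values move into __init__, and report_image() falls back to them whenever a caller passes None.

    class Reporter(object):
        def __init__(self, rep_page, com, rep_text):
            # Resolved once per run instead of being threaded through
            # every call site.
            self.rep_page = rep_page
            self.com = com
            self.rep_text = rep_text

        def report_image(self, image, rep_page=None, com=None, rep_text=None):
            # Fall back to the instance defaults set in __init__.
            if rep_page is None:
                rep_page = self.rep_page
            if com is None:
                com = self.com
            if rep_text is None:
                rep_text = self.rep_text
            return '* %s -> [[%s]] (%s)' % (rep_text % image, rep_page, com)

    r = Reporter('Project:Bad images', 'Bot: report', '[[:Image:%s]] has problems')
    print r.report_image('Example.jpg')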
Revision: 4956
Author: filnik
Date: 2008-01-31 16:32:39 +0000 (Thu, 31 Jan 2008)
Log Message:
-----------
I Will Never Comment again -.-' it breaks the code. bugfixing
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-01-31 16:15:16 UTC (rev 4955)
+++ trunk/pywikipedia/wikipedia.py 2008-01-31 16:32:39 UTC (rev 4956)
@@ -700,7 +700,7 @@
# non-existant pages
# Check also the div class because if the language is not english
# the bot can not seeing that the page is blocked.
- elif text.find(self.site().mediawiki_message('badaccess')) != -1 or \ # continue below
+ elif text.find(self.site().mediawiki_message('badaccess')) != -1 or \
text.find("<div class=\"permissions-errors\">") != -1:
raise NoPage(self.site(), self.aslink(forceInterwiki = True))
else:
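The bug in miniature (strings are illustrative): Python rejects a comment after a backslash line continuation, so the "# continue below" comment had to go.

    text = '<div class="permissions-errors">'
    # SyntaxError - a comment may not follow the continuation backslash:
    #   blocked = text.find('badaccess') != -1 or \  # continue below
    #             text.find('permissions-errors') != -1
    # The backslash must end the physical line:
    blocked = text.find('badaccess') != -1 or \
              text.find('permissions-errors') != -1
    print blocked  # True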
Revision: 4955
Author: filnik
Date: 2008-01-31 16:15:16 +0000 (Thu, 31 Jan 2008)
Log Message:
-----------
Forgot: I had another fix to do, the pid issue continues, fixing as already fixed above
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-01-31 16:10:17 UTC (rev 4954)
+++ trunk/pywikipedia/wikipedia.py 2008-01-31 16:15:16 UTC (rev 4955)
@@ -2773,14 +2773,10 @@
line = line.split(' ')
pid = int(line[0])
ptime = int(line[1].split('.')[0])
- except (IndexError, ValueError):
- # I go a lot of crontab errors because line is not a number.
- # Better to prevent that. If you find out the error, feel free
- # to fix it better.
- pid = 1
- ptime = time.time()
- if now - ptime <= self.releasepid and pid != self.pid:
- processes[pid] = ptime
+ if now - ptime <= self.releasepid and pid != self.pid:
+ processes[pid] = ptime
+ except (IndexError,ValueError):
+ pass # Sometimes the file gets corrupted - ignore that line
f = open(self.logfn(), 'w')
for p in processes.keys():
f.write(str(p)+' '+str(processes[p])+'\n')
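The shape of the fix as a self-contained sketch (the sample lines are made up): the test moves inside the try block, so a corrupted line is simply skipped instead of being replaced by fake pid/ptime values.

    processes = {}
    now, releasepid, my_pid = 1000, 600, 42
    for line in ['17 980.5', 'garbage', '42 999.0', '7 100.0']:
        try:
            fields = line.split(' ')
            pid = int(fields[0])
            ptime = int(fields[1].split('.')[0])
            if now - ptime <= releasepid and pid != my_pid:
                processes[pid] = ptime
        except (IndexError, ValueError):
            pass  # a corrupted line - ignore it
    print processes  # {17: 980}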
Revision: 4954
Author: filnik
Date: 2008-01-31 16:10:17 +0000 (Thu, 31 Jan 2008)
Log Message:
-----------
Adding another little check for cascading protection. Make sure that if the language in preferences changes, the bot won't block.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-01-31 15:58:27 UTC (rev 4953)
+++ trunk/pywikipedia/wikipedia.py 2008-01-31 16:10:17 UTC (rev 4954)
@@ -698,7 +698,10 @@
raise NoPage(self.site(), self.aslink(forceInterwiki = True))
# Some of the newest versions don't have a "view source" tag for
# non-existant pages
- elif text.find(self.site().mediawiki_message('badaccess')) != -1:
+ # Check also the div class because if the language is not english
+ # the bot can not seeing that the page is blocked.
+ elif text.find(self.site().mediawiki_message('badaccess')) != -1 or \ # continue below
+ text.find("<div class=\"permissions-errors\">") != -1:
raise NoPage(self.site(), self.aslink(forceInterwiki = True))
else:
if text.find( "<title>Wikimedia Error</title>") > -1:
Feature Requests item #1500288, was opened at 2006-06-04 05:09
Message generated for change (Comment added) made by wikipedian
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1500288&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Priority: 5
Private: No
Submitted By: Nobody/Anonymous (nobody)
Assigned to: Nobody/Anonymous (nobody)
Summary: Have weblinkchecker.py check the Internet Archive for backup
Initial Comment:
weblinkchecker.py apparently has an option to take action on finding a broken link (currently only to add something to a talk page; I haven't been able to get this to work, though). But it would be even better if it could insert, in a comment or perhaps an addendum after the broken link, a link to backups of that page in the Internet Archive/Wayback Machine.
I don't think this enhancement would be backbreakingly difficult and troublesome. The script would have to prepend "http://web.archive.org/web/" to the original URL, check whether the string "Not in Archive." (or whatever the current error message is) appears in the Internet Archive page. If it does, then simply carry on with the rest of the links to be checked; if not, if the Archive *does* have something backed up, then take some boilerplate like "The preceding URL appeared to be invalid to weblinkchecker.py; however, backups of the URL can be found in the [[Internet Archive]] $HERE. You may want to consider amending the original link to point to the archived copies and not the live one.", replace $HERE with the URL prepended with the Archive bit, and insert in a comment.
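A rough sketch of what I have in mind (the helper name is made up, and the "Not in Archive." marker may have changed since I checked):

    import urllib2

    def archive_backup(url):
        # Return the Wayback Machine copy of url, or None if unarchived.
        archive_url = 'http://web.archive.org/web/' + url
        try:
            page = urllib2.urlopen(archive_url).read()
        except urllib2.HTTPError:
            return None  # the Archive answered with an error page
        if 'Not in Archive' in page:
            return None
        return archive_url

    print archive_backup('http://example.org/some-dead-page')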
-maru
----------------------------------------------------------------------
>Comment By: Daniel Herding (wikipedian)
Date: 2008-01-31 02:03
Message:
Logged In: YES
user_id=880694
Originator: NO
By the way, I implemented the Internet Archive lookup long ago.
webcitation.org is not supported yet, though.
----------------------------------------------------------------------
Comment By: Nobody/Anonymous (nobody)
Date: 2008-01-30 21:09
Message:
Logged In: NO
Isn't it possible to create a bot that checks whether the external links
work again? It would use the category with inaccessible external links.
When an external link is accessible again, the bot removes the message from
the talk page and marks the talk page with the template for speedy
deletion.
My apologies if I'm adding this message on the wrong page.
Regards,
Kenny (from the Dutch Wikipedia
http://nl.wikipedia.org/wiki/Gebruiker:Ken123 )
----------------------------------------------------------------------
Comment By: Nobody/Anonymous (nobody)
Date: 2007-06-24 20:42
Message:
Logged In: NO
In the same vein, it would be good if WebCite
<http://www.webcitation.org/> archived pages were included as well. There's
apparently some nice programmatic ways of looking for archived URLs
according to
<http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf>.
While I'm writing, it'd also be good if the bot would proactively archive
pages when they disappear and come back. Variable uptime to me bespeaks a
page that is likely to disappear permanently. It isn't hard either - it's
just
"www.webcitation.org/archive?url=" ++ url ++ "&email=foo(a)bar.com"
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1500288&group_…
Feature Requests item #1882959, was opened at 2008-01-30 12:27
Message generated for change (Tracker Item Submitted) made by Item Submitter
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1882959&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Priority: 5
Private: No
Submitted By: Nobody/Anonymous (nobody)
Assigned to: Nobody/Anonymous (nobody)
Summary: inaccessible external links
Initial Comment:
Isn't it possible to create a bot that checks whether the external links
work again?
The bot would use the category with inaccessible external links.
When an external link is accessible again, the bot removes the message from
the talk page and marks the talk page with the template for speedy
deletion.
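A bare-bones sketch of the recheck (the URLs are placeholders; finding them through the category and editing the talk pages is left out):

    import urllib2

    dead_links = ['http://example.org/gone', 'http://example.com/']
    for url in dead_links:
        try:
            urllib2.urlopen(url)
        except (urllib2.URLError, IOError):
            continue  # still inaccessible
        print '%s works again - the talk page notice can be removed' % url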
Regards,
Kenny (from the Dutch Wikipedia
http://nl.wikipedia.org/wiki/Gebruiker:Ken123 )
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1882959&group_…