http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9532
Revision: 9532
Author: xqt
Date: 2011-09-18 17:10:08 +0000 (Sun, 18 Sep 2011)
Log Message:
-----------
do not localize #REDIRECT tag if the target page is the same
Modified Paths:
--------------
trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py
===================================================================
--- trunk/pywikipedia/redirect.py 2011-09-18 16:53:51 UTC (rev 9531)
+++ trunk/pywikipedia/redirect.py 2011-09-18 17:10:08 UTC (rev 9532)
@@ -659,7 +659,7 @@
'#%s %s' % (self.site.redirect(),
targetPage.title(asLink=True, textlink=True)),
oldText)
- if text == oldText:
+ if redir.title() == targetPage.title() or text == oldText:
pywikibot.output(u"Note: Nothing left to do on %s"
% redir.title(asLink=True))
break
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9529
Revision: 9529
Author: valhallasw
Date: 2011-09-18 15:26:53 +0000 (Sun, 18 Sep 2011)
Log Message:
-----------
Two bugfixes for r9528:
- added support for version numbers differing from 'x.y.z', e.g. '1.17wmf1'
- fixed error message when there is no sign of the API
Modified Paths:
--------------
trunk/pywikipedia/generate_family_file.py
Modified: trunk/pywikipedia/generate_family_file.py
===================================================================
--- trunk/pywikipedia/generate_family_file.py 2011-09-18 15:19:02 UTC (rev 9528)
+++ trunk/pywikipedia/generate_family_file.py 2011-09-18 15:26:53 UTC (rev 9529)
@@ -17,7 +17,7 @@
import urllib2
from BeautifulSoup import BeautifulSoup
-from distutils.version import StrictVersion as V
+from distutils.version import LooseVersion as V
def urlopen(url):
req = urllib2.Request(url, headers = {'User-agent': 'Pywikipedia family generator 0.1 - pywikipediabot.sf.net'})
@@ -219,6 +219,7 @@
REwgVersion = re.compile(ur'wgVersion ?= ?"([^"]*)"')
def __init__(self, fromurl):
+ self.fromurl = fromurl
if fromurl.endswith("$1"):
fromurl = fromurl[:-2]
try:
@@ -242,7 +243,7 @@
def _parse_pre_117(self, data):
if not self.REwgEnableApi.search(data):
- print "*** WARNING: Api does not seem to be enabled on %s" % fromurl
+ print "*** WARNING: Api does not seem to be enabled on %s" % self.fromurl
try:
self.version = self.REwgVersion.search(data).groups()[0]
except AttributeError:
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9525
Revision: 9525
Author: saper
Date: 2011-09-14 22:54:59 +0000 (Wed, 14 Sep 2011)
Log Message:
-----------
Use BeautifulSoup for getting HTML links and images.
Removed simplistic regular expression based guessing
of contents of src="" and href="" attributes.
Still, treating all URLs ending with '.jpeg' or a similar
extension as images is unsuitable for fetching images from
MediaWiki installations, since /wiki/File:Picture.jpg links
point to the description pages, not to the pictures
themselves.
Modified Paths:
--------------
trunk/pywikipedia/imageharvest.py
Modified: trunk/pywikipedia/imageharvest.py
===================================================================
--- trunk/pywikipedia/imageharvest.py 2011-09-13 15:58:36 UTC (rev 9524)
+++ trunk/pywikipedia/imageharvest.py 2011-09-14 22:54:59 UTC (rev 9525)
@@ -20,36 +20,31 @@
import re, sys, os
import wikipedia as pywikibot
+import urllib
+import BeautifulSoup
import upload
def get_imagelinks(url):
- # Given a URL, get all images linked to by the page at that URL.
- # First, we get the location for relative links from the URL.
- relativepath = url.split("/")
- if len(relativepath) == 1:
- relativepath=relativepath[0]
- else:
- relativepath=relativepath[:len(relativepath)-1]
- relativepath="/".join(relativepath)
+ """Given a URL, get all images linked to by the page at that URL."""
+
links = []
uo = pywikibot.MyURLopener
file = uo.open(url)
- text = file.read()
+ soup = BeautifulSoup.BeautifulSoup(file.read())
file.close()
- text = text.lower()
if not shown:
- R=re.compile("href\s*=\s*[\"'](.*?)[\"']")
+ tagname = "a"
elif shown == "just":
- R=re.compile("src\s*=s*[\"'](.*?)[\"']")
+ tagname = "img"
else:
- R=re.compile("[\"'](.*?)[\"']")
- for link in R.findall(text):
- ext = os.path.splitext(link)[1].lower().strip('.')
- if ext in fileformats:
- if re.compile("://").match(text):
- links += [link]
- else:
- links += [relativepath+"/"+link]
+ tagname = ["a", "img"]
+
+ for tag in soup.findAll(tagname):
+ link = tag.get("src", tag.get("href", None))
+ if link:
+ ext = os.path.splitext(link)[1].lower().strip('.')
+ if ext in fileformats:
+ links.append(urllib.basejoin(url, link))
return links
def main(give_url, image_url, desc):