http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9525
Revision: 9525
Author:   saper
Date:     2011-09-14 22:54:59 +0000 (Wed, 14 Sep 2011)

Log Message:
-----------
Use BeautifulSoup for getting HTML links and images.
Removed the simplistic regular-expression-based guessing of the contents of src="" and href="" attributes. Still, treating every URL that ends in '.jpeg' or similar as an image remains unsuitable for fetching images from MediaWiki installations, since /wiki/File:Picture.jpg links point to the description pages, not to the pictures themselves.
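For illustration only (not part of this revision), the file behind such a description page can be resolved through the MediaWiki imageinfo API instead of guessing from the link's extension. A minimal sketch, assuming a hypothetical wiki URL and file title:

    # Sketch: resolve a File: description page to the direct image URL
    # via the MediaWiki imageinfo API (Python 2 standard library only).
    import json
    import urllib

    api = "http://example.org/w/api.php"       # hypothetical wiki
    params = urllib.urlencode({
        'action': 'query',
        'titles': 'File:Picture.jpg',          # hypothetical file title
        'prop': 'imageinfo',
        'iiprop': 'url',
        'format': 'json',
    })
    data = json.load(urllib.urlopen(api + "?" + params))
    for page in data['query']['pages'].values():
        if 'imageinfo' in page:
            # 'url' is the direct URL of the uploaded file, not the
            # /wiki/File:... description page.
            print page['imageinfo'][0]['url']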
Modified Paths:
--------------
    trunk/pywikipedia/imageharvest.py
Modified: trunk/pywikipedia/imageharvest.py
===================================================================
--- trunk/pywikipedia/imageharvest.py	2011-09-13 15:58:36 UTC (rev 9524)
+++ trunk/pywikipedia/imageharvest.py	2011-09-14 22:54:59 UTC (rev 9525)
@@ -20,36 +20,31 @@
 import re, sys, os
 import wikipedia as pywikibot
+import urllib
+import BeautifulSoup
 import upload
 
 def get_imagelinks(url):
-    # Given a URL, get all images linked to by the page at that URL.
-    # First, we get the location for relative links from the URL.
-    relativepath = url.split("/")
-    if len(relativepath) == 1:
-        relativepath=relativepath[0]
-    else:
-        relativepath=relativepath[:len(relativepath)-1]
-    relativepath="/".join(relativepath)
+    """Given a URL, get all images linked to by the page at that URL."""
+
     links = []
     uo = pywikibot.MyURLopener
     file = uo.open(url)
-    text = file.read()
+    soup = BeautifulSoup.BeautifulSoup(file.read())
     file.close()
-    text = text.lower()
     if not shown:
-        R=re.compile("href\s*=\s*[\"'](.*?)[\"']")
+        tagname = "a"
     elif shown == "just":
-        R=re.compile("src\s*=s*[\"'](.*?)[\"']")
+        tagname = "img"
     else:
-        R=re.compile("[\"'](.*?)[\"']")
-    for link in R.findall(text):
-        ext = os.path.splitext(link)[1].lower().strip('.')
-        if ext in fileformats:
-            if re.compile("://").match(text):
-                links += [link]
-            else:
-                links += [relativepath+"/"+link]
+        tagname = ["a", "img"]
+
+    for tag in soup.findAll(tagname):
+        link = tag.get("src", tag.get("href", None))
+        if link:
+            ext = os.path.splitext(link)[1].lower().strip('.')
+            if ext in fileformats:
+                links.append(urllib.basejoin(url, link))
     return links
 def main(give_url, image_url, desc):
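For reference, urllib.basejoin (an alias for urlparse.urljoin in Python 2) now performs the relative-link resolution that the removed relativepath code only approximated. A small sketch with made-up URLs:

    import urllib

    base = "http://example.org/gallery/index.html"          # hypothetical page URL
    print urllib.basejoin(base, "pics/photo.jpg")            # relative link
    # -> http://example.org/gallery/pics/photo.jpg
    print urllib.basejoin(base, "/images/logo.png")          # root-relative link
    # -> http://example.org/images/logo.png
    print urllib.basejoin(base, "http://other.example/a.gif")  # already absolute
    # -> http://other.example/a.gif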