http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9525
Revision: 9525
Author:   saper
Date:     2011-09-14 22:54:59 +0000 (Wed, 14 Sep 2011)

Log Message:
-----------
Use BeautifulSoup for getting HTML links and images.
Removed the simplistic regular-expression-based guessing of the contents of src="" and href="" attributes. Still, treating every URL that ends in '.jpeg' or similar as an image remains unsuitable for fetching images from MediaWiki installations, since /wiki/File:Picture.jpg links point to the description pages, not to the pictures themselves.
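For illustration only (not part of this revision), the file behind such a description page can be resolved through the MediaWiki imageinfo API instead of guessing from the link's extension. A minimal sketch, assuming a hypothetical wiki URL and file title:

    # Sketch: resolve a File: description page to the direct image URL
    # via the MediaWiki imageinfo API (Python 2 standard library only).
    import json
    import urllib

    api = "http://example.org/w/api.php"       # hypothetical wiki
    params = urllib.urlencode({
        'action': 'query',
        'titles': 'File:Picture.jpg',          # hypothetical file title
        'prop': 'imageinfo',
        'iiprop': 'url',
        'format': 'json',
    })
    data = json.load(urllib.urlopen(api + "?" + params))
    for page in data['query']['pages'].values():
        if 'imageinfo' in page:
            # 'url' is the direct URL of the uploaded file, not the
            # /wiki/File:... description page.
            print page['imageinfo'][0]['url']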
Modified Paths:
--------------
    trunk/pywikipedia/imageharvest.py
Modified: trunk/pywikipedia/imageharvest.py
===================================================================
--- trunk/pywikipedia/imageharvest.py	2011-09-13 15:58:36 UTC (rev 9524)
+++ trunk/pywikipedia/imageharvest.py	2011-09-14 22:54:59 UTC (rev 9525)
@@ -20,36 +20,31 @@
 import re, sys, os
 import wikipedia as pywikibot
+import urllib
+import BeautifulSoup
 import upload
 
 def get_imagelinks(url):
-    # Given a URL, get all images linked to by the page at that URL.
-    # First, we get the location for relative links from the URL.
-    relativepath = url.split("/")
-    if len(relativepath) == 1:
-        relativepath=relativepath[0]
-    else:
-        relativepath=relativepath[:len(relativepath)-1]
-    relativepath="/".join(relativepath)
+    """Given a URL, get all images linked to by the page at that URL."""
+
     links = []
     uo = pywikibot.MyURLopener
     file = uo.open(url)
-    text = file.read()
+    soup = BeautifulSoup.BeautifulSoup(file.read())
     file.close()
-    text = text.lower()
     if not shown:
-        R=re.compile("href\s*=\s*[\"'](.*?)[\"']")
+        tagname = "a"
     elif shown == "just":
-        R=re.compile("src\s*=s*[\"'](.*?)[\"']")
+        tagname = "img"
     else:
-        R=re.compile("[\"'](.*?)[\"']")
-    for link in R.findall(text):
-        ext = os.path.splitext(link)[1].lower().strip('.')
-        if ext in fileformats:
-            if re.compile("://").match(text):
-                links += [link]
-            else:
-                links += [relativepath+"/"+link]
+        tagname = ["a", "img"]
+
+    for tag in soup.findAll(tagname):
+        link = tag.get("src", tag.get("href", None))
+        if link:
+            ext = os.path.splitext(link)[1].lower().strip('.')
+            if ext in fileformats:
+                links.append(urllib.basejoin(url, link))
     return links
 def main(give_url, image_url, desc):
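For reference, urllib.basejoin (an alias for urlparse.urljoin in Python 2) now performs the relative-link resolution that the removed relativepath code only approximated. A small sketch with made-up URLs:

    import urllib

    base = "http://example.org/gallery/index.html"          # hypothetical page URL
    print urllib.basejoin(base, "pics/photo.jpg")            # relative link
    # -> http://example.org/gallery/pics/photo.jpg
    print urllib.basejoin(base, "/images/logo.png")          # root-relative link
    # -> http://example.org/images/logo.png
    print urllib.basejoin(base, "http://other.example/a.gif")  # already absolute
    # -> http://other.example/a.gif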