SVN: [11622] trunk/pywikipedia/catimages.py - Pywikipedia-svn

7 Jun 2013

http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11622
Revision: 11622
Author:   drtrigon
Date:     2013-06-07 18:21:31 +0000 (Fri, 07 Jun 2013)
Log Message:
-----------
improvement; format conversion in downloadImage and _detect_Properties_PIL re-aranged
Modified Paths:
--------------
    trunk/pywikipedia/catimages.py
Modified: trunk/pywikipedia/catimages.py
===================================================================

--- trunk/pywikipedia/catimages.py	2013-06-07 14:53:04 UTC (rev 11621)
+++ trunk/pywikipedia/catimages.py	2013-06-07 18:21:31 UTC (rev 11622)
@@ -146,124 +146,36 @@
class UnknownFile(object):
     def __init__(self, filename, *args, **kwargs):
-        self.filename = filename
+        self.filename   = filename
+        self.image_size = (None, None)
+
         self._info = {}
-    def getProperties(self):
-        # check/look into the file by midnight commander (mc)
-        # https://pypi.python.org/pypi/hachoir-metadata
-        #self._detect_HeaderAndMetadata()
+    def __enter__(self):
+        return self
-        # get mime-type file-size, ...
+    def __exit__(self, type, value, traceback):
         pass
+    def getProperties(self):
+        result = {}
+        result.update( self._detect_HeaderAndMetadata() )   # Metadata
+        result.update( self._detect_Properties_PIL() )      # Properties
+        return result
+
     def getFeatures(self):
-        pass
+        pywikibot.warning(u"File format '%s/%s' not supported (yet)!" % tuple(self.image_mime[:2]))
+    def _detect_HeaderAndMetadata(self):
+        # check/look into the file by midnight commander (mc)
+        # https://pypi.python.org/pypi/hachoir-metadata
+        return {}
+
     def _detect_Properties_PIL(self):
-        """Retrieve as much file property info possible, especially the same
-           as commons does in order to compare if those libraries (ImageMagick,
-           ...) are buggy (thus explicitely use other software for independence)"""
-        #self.image_size = (None, None)
-        result = {'Format': u'-', 'Pages': 0}
-        if self.image_fileext == u'.svg':   # MIME: 'application/xml; charset=utf-8'
-            # similar to PDF page count OR use BeautifulSoup
-            svgcountpages = re.compile("<page>")
-            pc = len(svgcountpages.findall( file(self.image_path,"r").read() ))
+        # get mime-type file-size, ...
+        return {}
-            #svg = rsvg.Handle(self.image_path)
-            # http://validator.w3.org/docs/api.html#libs
-            # http://pypi.python.org/pypi/py_w3c/
-            vld = HTMLValidator()
-            valid = u'SVG'
-            try:
-                vld.validate(self.image.fileUrl())
-                valid = (u'Valid SVG' if vld.result.validity == 'true' else u'Invalid SVG')
-            except urllib2.URLError:
-                pass
-            except ValidationFault:
-                pass
-            #print vld.errors, vld.warnings
-
-            #self.image_size = (svg.props.width, svg.props.height)
-
-            result = { 'Format':     valid,
-                       'Mode':       u'-',
-                       'Palette':    u'-',
-                       'Pages':      pc, }
-            # may be set {{validSVG}} also or do something in bot template to
-            # recognize 'Format=SVG (valid)' ...
-        elif self.image_mime[1] == 'pdf':   # MIME: 'application/pdf; charset=binary'
-            # http://code.activestate.com/recipes/496837-count-pdf-pages/
-            #rxcountpages = re.compile(r"$\s*/Type\s*/Page[/\s]", re.MULTILINE|re.DOTALL)
-            rxcountpages = re.compile(r"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)    # PDF v. 1.3,1.4,1.5,1.6
-            pc = len(rxcountpages.findall( file(self.image_path,"rb").read() ))
-
-            result = { 'Format':     u'PDF',
-                       'Mode':       u'-',
-                       'Palette':    u'-',
-                       'Pages':      pc, }
-        elif self.image_mime[1] == 'x-xcf': # MIME: 'image/x-xcf; charset=binary'
-            result = { 'Format': u'%s' % self.image_mime[1].upper() }
-            # DO NOT use ImageMagick (identify) instead of PIL to get these info !!
-        elif (self.image_mime[0] == 'image') and \
-             (self.image_mime[1] != 'vnd.djvu'):   # MIME: 'image/jpeg; charset=binary', ...
-            try:
-                i = Image.open(self.image_path)
-            except IOError:
-                pywikibot.warning(u'unknown (image) file type [_detect_Properties_PIL]')
-                return {'Properties': [result]}
-
-            # http://mail.python.org/pipermail/image-sig/1999-May/000740.html
-            pc=0         # count number of pages
-            while True:
-                try:
-                    i.seek(pc)
-                except EOFError:
-                    break
-                pc+=1
-            i.seek(0)    # restore default
-
-            # http://grokbase.com/t/python/image-sig/082psaxt6k/embedded-icc-profiles
-            # python-lcms (littlecms) may be freeimage library
-            #icc = i.app['APP2']     # jpeg
-            #icc = i.tag[34675]      # tiff
-            #icc = re.sub('[^%s]'%string.printable, ' ', icc)
-            ## more image formats and more post-processing needed...
-
-            #self.image_size = i.size
-
-            result = { #'bands':      i.getbands(),
-                       #'bbox':       i.getbbox(),
-                       'Format':     i.format,
-                       'Mode':       i.mode,
-                       #'info':       i.info,
-                       #'stat':       os.stat(self.image_path),
-                       'Palette':    str(len(i.palette.palette)) if i.palette else u'-',
-                       'Pages':      pc, }
-        elif self.image_mime[1] == 'ogg':   # MIME: 'application/ogg; charset=binary'
-            # 'ffprobe' (ffmpeg); audio and video streams files (ogv, oga, ...)
-            d = self._util_get_DataStreams_FFMPEG()
-            #print d
-            result['Format'] = u'%s' % d['format']['format_name'].upper()
-        elif self.image_mime[1] == 'midi': # MIME: 'audio/midi; charset=binary'
-            result['Format'] = u'%s' % self.image_mime[1].upper()
-        # djvu: python-djvulibre or python-djvu for djvu support
-        # http://pypi.python.org/pypi/python-djvulibre/0.3.9
-        else:
-            pywikibot.warning(u'unknown (generic) file type [_detect_Properties_PIL]')
-            return {'Properties': [result]}
-
-        result['Dimensions'] = self.image_size
-        result['Filesize']   = os.path.getsize(self.image_path)
-        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
-
-        #self._info['Properties'] = [result]
-        self._info['Properties'][0].update(result)
-        return {'Properties': [result]}
-
-
 class JpegFile(UnknownFile):
     # for '_detect_Trained_CV'
     cascade_files = [(u'Legs', 'haarcascade_lowerbody.xml'),
@@ -280,12 +192,25 @@
                      # ('Hands' does not behave very well, in fact it detects any kind of skin and other things...)
                      #(u'Aeroplanes', 'haarcascade_aeroplane.xml'),]      # e.g. for 'Category:Unidentified aircraft'
-    def getProperties(self):
-        pass
+    def __init__(self, filename, *args, **kwargs):
+        UnknownFile.__init__(self, filename)
-        # Image size
-        self._detect_Properties_PIL()
+        self.image_filename  = os.path.split(self.filename)[-1]
+        self.image_fileext   = os.path.splitext(self.image_filename)[1]
+        self.image_path      = os.path.join(scriptdir, ('cache/' + self.image_filename[-128:]))
+        self.image_path_JPEG = self.image_path + '.jpg'
+        self._convert()
+
+    def __exit__(self, type, value, traceback):
+        if os.path.exists(self.image_path):
+            os.remove( self.image_path )
+        if os.path.exists(self.image_path_JPEG):
+            os.remove( self.image_path_JPEG )
+        #image_path_new = self.image_path_JPEG.replace(u"cache/", u"cache/0_DETECTED_")
+        #if os.path.exists(image_path_new):
+        #    os.remove( image_path_new )
+
     def getFeatures(self):
         # Faces (extract EXIF data)
         self._detect_Faces_EXIF()
@@ -336,6 +261,87 @@
         # general file EXIF history information
         self._detect_History_EXIF()
+    # supports a lot of different file types thanks to PIL
+    def _convert(self):
+        try:
+            im = Image.open(self.image_path) # might be png, gif etc, for instance
+            #im.thumbnail(size, Image.ANTIALIAS) # size is 640x480
+            im.convert('RGB').save(self.image_path_JPEG, "JPEG")
+
+            self.image_size = im.size
+        except IOError, e:
+            if 'image file is truncated' in str(e):
+                # im object has changed due to exception raised
+                im.convert('RGB').save(self.image_path_JPEG, "JPEG")
+
+                self.image_size = im.size
+            else:
+                try:
+                    # since opencv might still work, try this as fall-back
+                    img = cv2.imread( self.image_path, cv.CV_LOAD_IMAGE_COLOR )
+                    cv2.imwrite(self.image_path_JPEG, img)
+
+                    self.image_size = (img.shape[1], img.shape[0])
+                except:
+                    if os.path.exists(self.image_path_JPEG):
+                        os.remove(self.image_path_JPEG)
+                    self.image_path_JPEG = self.image_path
+        except:
+            self.image_path_JPEG = self.image_path
+
+        # FULL TIFF support (e.g. group4)
+        # http://code.google.com/p/pylibtiff/
+
+    # MIME: 'image/jpeg; charset=binary', ...
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitely use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+
+        try:
+            i = Image.open(self.image_path)
+        except IOError:
+            pywikibot.warning(u'unknown file type [JpegFile]')
+            return {'Properties': [result]}
+
+        # http://mail.python.org/pipermail/image-sig/1999-May/000740.html
+        pc=0         # count number of pages
+        while True:
+            try:
+                i.seek(pc)
+            except EOFError:
+                break
+            pc+=1
+        i.seek(0)    # restore default
+
+        # http://grokbase.com/t/python/image-sig/082psaxt6k/embedded-icc-profiles
+        # python-lcms (littlecms) may be freeimage library
+        #icc = i.app['APP2']     # jpeg
+        #icc = i.tag[34675]      # tiff
+        #icc = re.sub('[^%s]'%string.printable, ' ', icc)
+        ## more image formats and more post-processing needed...
+
+        #self.image_size = i.size
+
+        result = { #'bands':      i.getbands(),
+                   #'bbox':       i.getbbox(),
+                   'Format':     i.format,
+                   'Mode':       i.mode,
+                   #'info':       i.info,
+                   #'stat':       os.stat(self.image_path),
+                   'Palette':    str(len(i.palette.palette)) if i.palette else u'-',
+                   'Pages':      pc, }
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
     # .../opencv/samples/c/facedetect.cpp
     # http://opencv.willowgarage.com/documentation/python/genindex.html
     def _detect_Faces_CV(self):
@@ -2477,15 +2483,162 @@
 class TiffFile(JpegFile):
     pass
+
 class XcfFile(JpegFile):
-    pass
+    def _convert(self):
+        # Very few programs other than GIMP read XCF files. This is by design
+        # from the GIMP developers, the format is not really documented or
+        # supported as a general-purpose file format.
+        # Commons uses ImageMagick, thus we have EXACTLY THE SAME support!
+        # (can also be a drawback, e.g. when the library is buggy...)
+        proc = Popen("convert %s %s" % (self.image_path, self.image_path_JPEG),
+                     shell=True, stderr=PIPE)#.stderr.read()
+        proc.wait()
+        if   proc.returncode != 0:
+            raise ImportError("convert (ImageMagick) not found (may be other error occured)!")
+        elif proc.returncode:
+            self.image_path_JPEG = self.image_path
+        #data = Popen("identify -verbose info: %s" % self.image_path,
+        #             shell=True, stderr=PIPE).stderr.read()
+        #print data
+        if not os.path.exists(self.image_path_JPEG):
+            # xcf can have more than 1 layer/page like gif, tiff, and movies...
+            self.image_path_JPEG = self.image_path_JPEG.replace('.jpg', '-0.jpg')
+        self.image_size = Image.open(self.image_path_JPEG).size
+
+    # MIME: 'image/x-xcf; charset=binary'
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitely use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+
+        result = { 'Format': u'%s' % self.image_mime[1].upper() }
+        # DO NOT use ImageMagick (identify) instead of PIL to get these info !!
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
+
 class SvgFile(JpegFile):
-    pass
+    def _convert(self):
+        # SVG: rasterize the SVG to bitmap (MAY BE GET FROM WIKI BY DOWNLOAD?...)
+        # (Mediawiki uses librsvg too: http://commons.wikimedia.org/wiki/SVG#SVGs_in_MediaWiki)
+        # http://stackoverflow.com/questions/6589358/convert-svg-to-png-in-python
+        # http://cairographics.org/pythoncairopil/
+        # http://cairographics.org/pyrsvg/
+        # http://stackoverflow.com/questions/9166400/convert-rgba-png-to-rgb-with-pil
+        try:
+            svg = rsvg.Handle(self.image_path)
+            img = cairo.ImageSurface(cairo.FORMAT_ARGB32, svg.props.width, svg.props.height)
+            ctx = cairo.Context(img)
+            svg.render_cairo(ctx)
+            #img.write_to_png("svg.png")
+            #Image.frombuffer("RGBA",( img.get_width(),img.get_height() ),
+            #             img.get_data(),"raw","RGBA",0,1).save(self.image_path_JPEG, "JPEG")
+            png = Image.frombuffer("RGBA",( img.get_width(),img.get_height() ),
+                               img.get_data(),"raw","RGBA",0,1)
+            background = Image.new("RGB", png.size, (255, 255, 255))
+            background.paste(png, mask=png.split()[3]) # 3 is the alpha channel
+            background.save(self.image_path_JPEG, "JPEG")
+            self.image_size = (svg.props.width, svg.props.height)
+        except MemoryError:
+            self.image_path_JPEG = self.image_path
+        except SystemError:
+            self.image_path_JPEG = self.image_path
+
+    # MIME: 'application/xml; charset=utf-8'
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitely use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+
+        # similar to PDF page count OR use BeautifulSoup
+        svgcountpages = re.compile("<page>")
+        pc = len(svgcountpages.findall( file(self.image_path,"r").read() ))
+
+        #svg = rsvg.Handle(self.image_path)
+
+        # http://validator.w3.org/docs/api.html#libs
+        # http://pypi.python.org/pypi/py_w3c/
+        vld = HTMLValidator()
+        valid = u'SVG'
+        try:
+            vld.validate(self.image.fileUrl())
+            valid = (u'Valid SVG' if vld.result.validity == 'true' else u'Invalid SVG')
+        except urllib2.URLError:
+            pass
+        except ValidationFault:
+            pass
+        #print vld.errors, vld.warnings
+
+        #self.image_size = (svg.props.width, svg.props.height)
+
+        result = { 'Format':     valid,
+                   'Mode':       u'-',
+                   'Palette':    u'-',
+                   'Pages':      pc, }
+        # may be set {{validSVG}} also or do something in bot template to
+        # recognize 'Format=SVG (valid)' ...
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
+
 class PdfFile(JpegFile):
-    pass
+    def _convert(self):
+#        self._wikidata = self.image._latestInfo # all info wikimedia got from content (mime, sha1, ...)
+        # PDF: support extract text and images
+        # (Mediawiki uses ghostscript: https://www.mediawiki.org/wiki/Extension:PdfHandler#Pre-requisites)
+        # http://vermeulen.ca/python-pdf.html
+        # http://code.activestate.com/recipes/511465-pure-python-pdf-to-text-converter...
+        # http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to...
+        if self.image_fileext == u'.pdf':
+            pass
+
+    # MIME: 'application/pdf; charset=binary'
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitely use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+
+        # http://code.activestate.com/recipes/496837-count-pdf-pages/
+        #rxcountpages = re.compile(r"$\s*/Type\s*/Page[/\s]", re.MULTILINE|re.DOTALL)
+        rxcountpages = re.compile(r"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)    # PDF v. 1.3,1.4,1.5,1.6
+        pc = len(rxcountpages.findall( file(self.image_path,"rb").read() ))
+
+        result = { 'Format':     u'PDF',
+                   'Mode':       u'-',
+                   'Palette':    u'-',
+                   'Pages':      pc, }
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
+
 #class DjvuFile(JpegFile):
 #    pass
@@ -2498,6 +2651,27 @@
         # general audio feature extraction
 #        self._detect_AudioFeatures_YAAFE()
+    # MIME: 'application/ogg; charset=binary'
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitely use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+
+        # 'ffprobe' (ffmpeg); audio and video streams files (ogv, oga, ...)
+        d = self._util_get_DataStreams_FFMPEG()
+        #print d
+        result['Format'] = u'%s' % d['format']['format_name'].upper()
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
     def _detect_Streams_FFMPEG(self):
         self._info['Streams'] = []
@@ -2696,12 +2870,6 @@
class MidiFile(UnknownFile):
-    def getProperties(self):
-        result = {}
-        result.update( self._detect_HeaderAndMetadata() )   # Metadata
-        result.update( self._detect_Properties_PIL() )      # Properties
-        return result
-
     def getFeatures(self):
         result = {}
         result.update( self._detect_AudioFeatures_MUSIC21() )   # Audio
@@ -2739,6 +2907,24 @@
         self._info['Metadata'] = [result]
         return {'Metadata': [result]}
+    # MIME: 'audio/midi; charset=binary'
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitely use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+
+        result['Format'] = u'%s' % self.image_mime[1].upper()
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
     # midi audio feature extraction
     def _detect_AudioFeatures_MUSIC21(self):
         # skip file formats not supported
@@ -2805,6 +2991,8 @@
              (      'image',    'x-xcf'): XcfFile,
              (      'image',  'svg+xml'): SvgFile,
              ('application',      'pdf'): PdfFile,
+# djvu: python-djvulibre or python-djvu for djvu support
+# http://pypi.python.org/pypi/python-djvulibre/0.3.9
 #             (      'image', 'vnd.djvu'): DjvuFile,
              ('application',      'ogg'): OggFile,
              (      'audio',     'midi'): MidiFile,}
@@ -3257,12 +3445,10 @@
         #print self.image_path
         pywikibot.output(u'Processing media %s ...' % self.image.title(asLink=True))
-        self.image_filename  = os.path.split(self.image.fileUrl())[-1]
-        self.image_fileext   = os.path.splitext(self.image_filename)[1]
-        self.image_path      = urllib2.quote(os.path.join(scriptdir, ('cache/' + self.image_filename[-128:])))
-        
-        self.image_path_JPEG = self.image_path + u'.jpg'
-        
+        image_filename  = os.path.split(self.image.fileUrl())[-1]
+        image_fileext   = os.path.splitext(image_filename)[1]
+        self.image_path = urllib2.quote(os.path.join(scriptdir, ('cache/' + image_filename[-128:])))
+
         self._wikidata = self.image._latestInfo # all info wikimedia got from content (mime, sha1, ...)
         #print self._wikidata
         #print self._wikidata['mime']
@@ -3270,7 +3456,7 @@
         #print self._wikidata['metadata']
         #for item in self._wikidata['metadata']:
         #    print item['name'], item['value']
-        
+
         if not os.path.exists(self.image_path):
             pywikibot.get_throttle()
             f_url, data = self.site.getUrl(self.image.fileUrl(), no_hostname=True, 
@@ -3279,7 +3465,7 @@
             # (allows to re-read from back_response)
             data = f_url.read()
             del f_url   # free some memory (no need to keep a copy...)
-    
+
             f = open(self.image_path, 'wb')
             f.write( data )
             f.close()
@@ -3290,95 +3476,9 @@
         self.image_mime = re.split('[/;\s]', m.file(self.image_path))
         #self.image_size = (None, None)
         mime = mimetypes.guess_all_extensions('%s/%s' % tuple(self.image_mime[0:2]))
-        if mime and (self.image_fileext.lower() not in mime):
+        if mime and (image_fileext.lower() not in mime):
             pywikibot.warning(u'File extension does not match MIME type! File extension should be %s.' % mime)
-        # SVG: rasterize the SVG to bitmap (MAY BE GET FROM WIKI BY DOWNLOAD?...)
-        # (Mediawiki uses librsvg too: http://commons.wikimedia.org/wiki/SVG#SVGs_in_MediaWiki)
-        # http://stackoverflow.com/questions/6589358/convert-svg-to-png-in-python
-        # http://cairographics.org/pythoncairopil/
-        # http://cairographics.org/pyrsvg/
-        # http://stackoverflow.com/questions/9166400/convert-rgba-png-to-rgb-with-pil
-        self.image_size = (None, None)
-        if   self.image_fileext == u'.svg':
-            try:
-                svg = rsvg.Handle(self.image_path)
-                img = cairo.ImageSurface(cairo.FORMAT_ARGB32, svg.props.width, svg.props.height)
-                ctx = cairo.Context(img)
-                svg.render_cairo(ctx)
-                #img.write_to_png("svg.png")
-                #Image.frombuffer("RGBA",( img.get_width(),img.get_height() ),
-                #             img.get_data(),"raw","RGBA",0,1).save(self.image_path_JPEG, "JPEG")
-                png = Image.frombuffer("RGBA",( img.get_width(),img.get_height() ),
-                                   img.get_data(),"raw","RGBA",0,1)
-                background = Image.new("RGB", png.size, (255, 255, 255))
-                background.paste(png, mask=png.split()[3]) # 3 is the alpha channel
-                background.save(self.image_path_JPEG, "JPEG")
-
-                self.image_size = (svg.props.width, svg.props.height)
-            except MemoryError:
-                self.image_path_JPEG = self.image_path
-            except SystemError:
-                self.image_path_JPEG = self.image_path
-        elif self.image_fileext == u'.xcf':
-            # Very few programs other than GIMP read XCF files. This is by design
-            # from the GIMP developers, the format is not really documented or
-            # supported as a general-purpose file format.
-            # Commons uses ImageMagick, thus we have EXACTLY THE SAME support!
-            # (can also be a drawback, e.g. when the library is buggy...)
-            proc = Popen("convert %s %s" % (self.image_path, self.image_path_JPEG),
-                         shell=True, stderr=PIPE)#.stderr.read()
-            proc.wait()
-            if   proc.returncode == 127:
-                raise ImportError("convert (ImageMagick) not found!")
-            elif proc.returncode:
-                self.image_path_JPEG = self.image_path
-
-            #data = Popen("identify -verbose info: %s" % self.image_path,
-            #             shell=True, stderr=PIPE).stderr.read()
-            #print data
-            if not os.path.exists(self.image_path_JPEG):
-                # xcf can have more than 1 layer/page like gif, tiff, and movies...
-                self.image_path_JPEG = self.image_path_JPEG.replace('.jpg', '-0.jpg')
-            self.image_size = Image.open(self.image_path_JPEG).size
-        else:
-            try:
-                im = Image.open(self.image_path) # might be png, gif etc, for instance
-                #im.thumbnail(size, Image.ANTIALIAS) # size is 640x480
-                im.convert('RGB').save(self.image_path_JPEG, "JPEG")
-
-                self.image_size = im.size
-            except IOError, e:
-                if 'image file is truncated' in str(e):
-                    # im object has changed due to exception raised
-                    im.convert('RGB').save(self.image_path_JPEG, "JPEG")
-
-                    self.image_size = im.size
-                else:
-                    try:
-                        # since opencv might still work, try this as fall-back
-                        img = cv2.imread( self.image_path, cv.CV_LOAD_IMAGE_COLOR )
-                        cv2.imwrite(self.image_path_JPEG, img)
-
-                        self.image_size = (img.shape[1], img.shape[0])
-                    except:
-                        if os.path.exists(self.image_path_JPEG):
-                            os.remove(self.image_path_JPEG)
-                        self.image_path_JPEG = self.image_path
-            except:
-                self.image_path_JPEG = self.image_path
-
-        # PDF: support extract text and images
-        # (Mediawiki uses ghostscript: https://www.mediawiki.org/wiki/Extension:PdfHandler#Pre-requisites)
-        # http://vermeulen.ca/python-pdf.html
-        # http://code.activestate.com/recipes/511465-pure-python-pdf-to-text-converter...
-        # http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to...
-        if self.image_fileext == u'.pdf':
-            pass
-        
-        # FULL TIFF support (e.g. group4)
-        # http://code.google.com/p/pylibtiff/
-
     # LOOK ALSO AT: checkimages.CatImagesBot.checkStep
     # (and category scripts/bots too...)
     def checkStep(self):
@@ -3564,13 +3664,14 @@
         return u"\n".join( ret )
def clean_cache(self):
-        if os.path.exists(self.image_path):
-            os.remove( self.image_path )
-        if os.path.exists(self.image_path_JPEG):
-            os.remove( self.image_path_JPEG )
-        #image_path_new = self.image_path_JPEG.replace(u"cache/", u"cache/0_DETECTED_")
-        #if os.path.exists(image_path_new):
-        #    os.remove( image_path_new )
+#        if os.path.exists(self.image_path):
+#            os.remove( self.image_path )
+#        if os.path.exists(self.image_path_JPEG):
+#            os.remove( self.image_path_JPEG )
+#        #image_path_new = self.image_path_JPEG.replace(u"cache/", u"cache/0_DETECTED_")
+#        #if os.path.exists(image_path_new):
+#        #    os.remove( image_path_new )
+        pass
# LOOK ALSO AT: checkimages.CatImagesBot.report
     def report(self):
@@ -3718,17 +3819,18 @@
# split detection and extraction according to file types; JpegFile, ...
         TypeFile = FILETYPES.get(tuple(self.image_mime[:2]), FILETYPES['*'])
-        tf = TypeFile(self.image_path)
-        import copy
-        tf.__dict__.update(copy.deepcopy(self.__dict__))
-        for func in ['getProperties', 'getFeatures']:
-            result = getattr(tf, func)()
-            if result:
-                self._info.update(result)
-#        print self._info
-#        print tf._info
-        #print tf.__dict__
-        self._info = tf._info
+        with TypeFile(self.image_path, self.image_mime) as tf:
+            import copy
+            tf.__dict__.update(copy.deepcopy(self.__dict__))
+            for func in ['getProperties', 'getFeatures']:
+                result = getattr(tf, func)()
+                if result:
+                    self._info.update(result)
+            print self._info
+            print tf._info
+            #print tf.__dict__
+            self._info      = tf._info
+            self.image_size = tf.image_size
def _existInformation(self, info, ignore = ['Properties', 'ColorAverage']):
         result = []