SVN: [11621] trunk/pywikipedia/catimages.py - Pywikipedia-svn

7 Jun 2013

http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11621
Revision: 11621
Author:   drtrigon
Date:     2013-06-07 14:53:04 +0000 (Fri, 07 Jun 2013)
Log Message:
-----------
improvement; start of code structure clean-up
Modified Paths:
--------------
    trunk/pywikipedia/catimages.py
Modified: trunk/pywikipedia/catimages.py
===================================================================

--- trunk/pywikipedia/catimages.py	2013-06-06 21:19:06 UTC (rev 11620)
+++ trunk/pywikipedia/catimages.py	2013-06-07 14:53:04 UTC (rev 11621)
@@ -143,7 +143,199 @@
# all detection and recognition methods - bindings to other classes, modules and libs
-class FileData(object):
+
+class UnknownFile(object):
+    def __init__(self, filename, *args, **kwargs):
+        self.filename = filename
+        self._info = {}
+
+    def getProperties(self):
+        # check/look into the file by midnight commander (mc)
+        # https://pypi.python.org/pypi/hachoir-metadata
+        #self._detect_HeaderAndMetadata()
+
+        # get mime-type file-size, ...
+        pass
+
+    def getFeatures(self):
+        pass
+
+    def _detect_Properties_PIL(self):
+        """Retrieve as much file property info possible, especially the same
+           as commons does in order to compare if those libraries (ImageMagick,
+           ...) are buggy (thus explicitly use other software for independence)"""
+        #self.image_size = (None, None)
+        result = {'Format': u'-', 'Pages': 0}
+        if self.image_fileext == u'.svg':   # MIME: 'application/xml; charset=utf-8'
+            # similar to PDF page count OR use BeautifulSoup
+            svgcountpages = re.compile("<page>")
+            pc = len(svgcountpages.findall( file(self.image_path,"r").read() ))
+
+            #svg = rsvg.Handle(self.image_path)
+
+            # http://validator.w3.org/docs/api.html#libs
+            # http://pypi.python.org/pypi/py_w3c/
+            vld = HTMLValidator()
+            valid = u'SVG'
+            try:
+                vld.validate(self.image.fileUrl())
+                valid = (u'Valid SVG' if vld.result.validity == 'true' else u'Invalid SVG')
+            except urllib2.URLError:
+                pass
+            except ValidationFault:
+                pass
+            #print vld.errors, vld.warnings
+
+            #self.image_size = (svg.props.width, svg.props.height)
+
+            result = { 'Format':     valid,
+                       'Mode':       u'-',
+                       'Palette':    u'-',
+                       'Pages':      pc, }
+            # may be set {{validSVG}} also or do something in bot template to
+            # recognize 'Format=SVG (valid)' ...
+        elif self.image_mime[1] == 'pdf':   # MIME: 'application/pdf; charset=binary'
+            # http://code.activestate.com/recipes/496837-count-pdf-pages/
+            #rxcountpages = re.compile(r"$\s*/Type\s*/Page[/\s]", re.MULTILINE|re.DOTALL)
+            rxcountpages = re.compile(r"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)    # PDF v. 1.3,1.4,1.5,1.6
+            pc = len(rxcountpages.findall( file(self.image_path,"rb").read() ))
+
+            result = { 'Format':     u'PDF',
+                       'Mode':       u'-',
+                       'Palette':    u'-',
+                       'Pages':      pc, }
+        elif self.image_mime[1] == 'x-xcf': # MIME: 'image/x-xcf; charset=binary'
+            result = { 'Format': u'%s' % self.image_mime[1].upper() }
+            # DO NOT use ImageMagick (identify) instead of PIL to get these info !!
+        elif (self.image_mime[0] == 'image') and \
+             (self.image_mime[1] != 'vnd.djvu'):   # MIME: 'image/jpeg; charset=binary', ...
+            try:
+                i = Image.open(self.image_path)
+            except IOError:
+                pywikibot.warning(u'unknown (image) file type [_detect_Properties_PIL]')
+                return {'Properties': [result]}
+
+            # http://mail.python.org/pipermail/image-sig/1999-May/000740.html
+            pc=0         # count number of pages
+            while True:
+                try:
+                    i.seek(pc)
+                except EOFError:
+                    break
+                pc+=1
+            i.seek(0)    # restore default
+
+            # http://grokbase.com/t/python/image-sig/082psaxt6k/embedded-icc-profiles
+            # python-lcms (littlecms) may be freeimage library
+            #icc = i.app['APP2']     # jpeg
+            #icc = i.tag[34675]      # tiff
+            #icc = re.sub('[^%s]'%string.printable, ' ', icc)
+            ## more image formats and more post-processing needed...
+
+            #self.image_size = i.size
+
+            result = { #'bands':      i.getbands(),
+                       #'bbox':       i.getbbox(),
+                       'Format':     i.format,
+                       'Mode':       i.mode,
+                       #'info':       i.info,
+                       #'stat':       os.stat(self.image_path),
+                       'Palette':    str(len(i.palette.palette)) if i.palette else u'-',
+                       'Pages':      pc, }
+        elif self.image_mime[1] == 'ogg':   # MIME: 'application/ogg; charset=binary'
+            # 'ffprobe' (ffmpeg); audio and video streams files (ogv, oga, ...)
+            d = self._util_get_DataStreams_FFMPEG()
+            #print d
+            result['Format'] = u'%s' % d['format']['format_name'].upper()
+        elif self.image_mime[1] == 'midi': # MIME: 'audio/midi; charset=binary'
+            result['Format'] = u'%s' % self.image_mime[1].upper()
+        # djvu: python-djvulibre or python-djvu for djvu support
+        # http://pypi.python.org/pypi/python-djvulibre/0.3.9
+        else:
+            pywikibot.warning(u'unknown (generic) file type [_detect_Properties_PIL]')
+            return {'Properties': [result]}
+
+        result['Dimensions'] = self.image_size
+        result['Filesize']   = os.path.getsize(self.image_path)
+        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
+
+        #self._info['Properties'] = [result]
+        self._info['Properties'][0].update(result)
+        return {'Properties': [result]}
+
+
+class JpegFile(UnknownFile):
+    # for '_detect_Trained_CV'
+    cascade_files = [(u'Legs', 'haarcascade_lowerbody.xml'),
+                     (u'Torsos', 'haarcascade_upperbody.xml'),
+                     (u'Ears', 'haarcascade_mcs_leftear.xml'),
+                     (u'Ears', 'haarcascade_mcs_rightear.xml'),
+                     (u'Eyes', 'haarcascade_lefteye_2splits.xml'),        # (http://yushiqi.cn/research/eyedetection)
+                     (u'Eyes', 'haarcascade_righteye_2splits.xml'),       # (http://yushiqi.cn/research/eyedetection)
+                     #externals/opencv/haarcascades/haarcascade_mcs_lefteye.xml
+                     #externals/opencv/haarcascades/haarcascade_mcs_righteye.xml
+                     # (others include indifferent (left and/or right) and pair)
+                     (u'Automobiles', 'cars3.xml'),                       # http://www.youtube.com/watch?v=c4LobbqeKZc
+                     (u'Hands', '1256617233-2-haarcascade-hand.xml', 300.),]    # http://www.andol.info/
+                     # ('Hands' does not behave very well, in fact it detects any kind of skin and other things...)
+                     #(u'Aeroplanes', 'haarcascade_aeroplane.xml'),]      # e.g. for 'Category:Unidentified aircraft'
+
+    def getProperties(self):
+        pass
+
+        # Image size
+        self._detect_Properties_PIL()
+
+    def getFeatures(self):
+        # Faces (extract EXIF data)
+        self._detect_Faces_EXIF()
+        # Faces and eyes (opencv pre-trained haar)
+        self._detect_Faces_CV()
+# TODO: test and use or switch off
+        # Face via Landmark(s)
+#        self._detect_FaceLandmark_xBOB()
+        # exclude duplicates (CV and EXIF)
+        faces = [item['Position'] for item in self._info['Faces']]
+        for i in self._util_merge_Regions(faces)[1]:
+            del self._info['Faces'][i]
+
+        # Segments and colors
+        self._detect_SegmentColors_JSEGnPIL()
+        # Average color
+        self._detect_AverageColor_PILnCV()
+
+        # People/Pedestrian (opencv pre-trained hog and haarcascade)
+        self._detect_People_CV()
+
+        # Geometric object (opencv hough line, circle, edges, corner, ...)
+        self._detect_Geometry_CV()
+
+        # general (opencv pre-trained, third-party and self-trained haar
+        # and cascade) classification
+        # http://www.computer-vision-software.com/blog/2009/11/faq-opencv-haartraining...
+        for cf in self.cascade_files:
+            self._detect_Trained_CV(*cf)
+
+        # optical and other text recognition (tesseract & ocropus, ...)
+        self._detect_EmbeddedText_poppler()
+#        self._recognize_OpticalText_ocropus()
+        # (may be just classify as 'contains text', may be store text, e.g. to wikisource)
+
+        # barcode and Data Matrix recognition (libdmtx/pydmtx, zbar, gocr?)
+        self._recognize_OpticalCodes_dmtxNzbar()
+
+        # Chessboard (opencv reference detector)
+        self._detect_Chessboard_CV()
+
+        # general (self-trained) detection WITH classification
+        # BoW: uses feature detection (SIFT, SURF, ...) AND classification (SVM, ...)
+#        self._detectclassify_ObjectAll_CV()
+        # Wavelet: uses wavelet transformation AND classification (machine learning)
+#        self._detectclassify_ObjectAll_PYWT()
+
+        # general file EXIF history information
+        self._detect_History_EXIF()
+
     # .../opencv/samples/c/facedetect.cpp
     # http://opencv.willowgarage.com/documentation/python/genindex.html
     def _detect_Faces_CV(self):
@@ -157,7 +349,8 @@
         # http://www.cognotics.com/opencv/servo_2007_series/part_4/index.html
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
# https://code.ros.org/trac/opencv/browser/trunk/opencv_extra/testdata/gpu/haa...
@@ -699,7 +892,8 @@
         self._info['Geometry'] = []
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
result = self._util_get_Geometry_CVnSCIPY()
@@ -970,7 +1164,8 @@
         self._info['ColorRegions'] = []
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
try:
@@ -1032,7 +1227,8 @@
         self._info['ColorAverage'] = []
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
try:
@@ -1050,109 +1246,6 @@
         self._info['ColorAverage'] = [result]
         return
-    def _detect_Properties_PIL(self):
-        """Retrieve as much file property info possible, especially the same
-           as commons does in order to compare if those libraries (ImageMagick,
-           ...) are buggy (thus explicitly use other software for independence)"""
-        #self.image_size = (None, None)
-        self._info['Properties'] = [{'Format': u'-', 'Pages': 0}]
-        if self.image_fileext == u'.svg':   # MIME: 'application/xml; charset=utf-8'
-            # similar to PDF page count OR use BeautifulSoup
-            svgcountpages = re.compile("<page>")
-            pc = len(svgcountpages.findall( file(self.image_path,"r").read() ))
-
-            #svg = rsvg.Handle(self.image_path)
-
-            # http://validator.w3.org/docs/api.html#libs
-            # http://pypi.python.org/pypi/py_w3c/
-            vld = HTMLValidator()
-            valid = u'SVG'
-            try:
-                vld.validate(self.image.fileUrl())
-                valid = (u'Valid SVG' if vld.result.validity == 'true' else u'Invalid SVG')
-            except urllib2.URLError:
-                pass
-            except ValidationFault:
-                pass
-            #print vld.errors, vld.warnings
-
-            #self.image_size = (svg.props.width, svg.props.height)
-
-            result = { 'Format':     valid,
-                       'Mode':       u'-',
-                       'Palette':    u'-',
-                       'Pages':      pc, }
-            # may be set {{validSVG}} also or do something in bot template to
-            # recognize 'Format=SVG (valid)' ...
-        elif self.image_mime[1] == 'pdf':   # MIME: 'application/pdf; charset=binary'
-            # http://code.activestate.com/recipes/496837-count-pdf-pages/
-            #rxcountpages = re.compile(r"$\s*/Type\s*/Page[/\s]", re.MULTILINE|re.DOTALL)
-            rxcountpages = re.compile(r"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL)    # PDF v. 1.3,1.4,1.5,1.6
-            pc = len(rxcountpages.findall( file(self.image_path,"rb").read() ))
-
-            result = { 'Format':     u'PDF',
-                       'Mode':       u'-',
-                       'Palette':    u'-',
-                       'Pages':      pc, }
-        elif (self.image_mime[0] == 'image') and \
-             (self.image_mime[1] != 'vnd.djvu'):   # MIME: 'image/jpeg; charset=binary', ...
-            try:
-                i = Image.open(self.image_path)
-            except IOError:
-                pywikibot.warning(u'unknown (image) file type [_detect_Properties_PIL]')
-                return
-
-            # http://mail.python.org/pipermail/image-sig/1999-May/000740.html
-            pc=0         # count number of pages
-            while True:
-                try:
-                    i.seek(pc)
-                except EOFError:
-                    break
-                pc+=1
-            i.seek(0)    # restore default
-
-            # http://grokbase.com/t/python/image-sig/082psaxt6k/embedded-icc-profiles
-            # python-lcms (littlecms) may be freeimage library
-            #icc = i.app['APP2']     # jpeg
-            #icc = i.tag[34675]      # tiff
-            #icc = re.sub('[^%s]'%string.printable, ' ', icc)
-            ## more image formats and more post-processing needed...
-
-            #self.image_size = i.size
-
-            result = { #'bands':      i.getbands(),
-                       #'bbox':       i.getbbox(),
-                       'Format':     i.format,
-                       'Mode':       i.mode,
-                       #'info':       i.info,
-                       #'stat':       os.stat(self.image_path),
-                       'Palette':    str(len(i.palette.palette)) if i.palette else u'-',
-                       'Pages':      pc, }
-        elif self.image_mime[1] == 'ogg':   # MIME: 'application/ogg; charset=binary'
-            # 'ffprobe' (ffmpeg); audio and video streams files (ogv, oga, ...)
-            d = self._util_get_DataStreams_FFMPEG()
-            #print d
-            result = { 'Format': u'%s' % d['format']['format_name'].upper() }
-        elif self.image_mime[0] == 'audio': # MIME: 'audio/midi; charset=binary'
-            result = {}
-        # djvu: python-djvulibre or python-djvu for djvu support
-        # http://pypi.python.org/pypi/python-djvulibre/0.3.9
-        #elif self.image_fileext == u'.xcf'
-        #    result = {}
-        #    # DO NOT use ImageMagick (identify) instead of PIL to get these info !!
-        else:
-            pywikibot.warning(u'unknown (generic) file type [_detect_Properties_PIL]')
-            return
-
-        result['Dimensions'] = self.image_size
-        result['Filesize']   = os.path.getsize(self.image_path)
-        result['MIME']       = u'%s/%s' % tuple(self.image_mime[:2])
-
-        #self._info['Properties'] = [result]
-        self._info['Properties'][0].update(result)
-        return
-
     # http://stackoverflow.com/questions/2270874/image-color-detection-using-pytho...
     # https://gist.github.com/1246268
     # colormath-1.0.8/examples/delta_e.py, colormath-1.0.8/examples/conversions.py
@@ -1413,7 +1506,8 @@
         self._info[info_desc] = []
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
# http://tutorial-haartraining.googlecode.com/svn/trunk/data/haarcascades/
@@ -1634,7 +1728,8 @@
         self._info['OpticalCodes'] = []
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
# DataMatrix
@@ -1734,7 +1829,8 @@
         self._info['Chessboard'] = []
# skip file formats not supported (yet?)
-        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']):
+        if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \
+           (self.image_mime[0] in ['audio']):
             return
scale = 1.
@@ -2306,45 +2402,6 @@
return self._buffer_FFMPEG
-    def _detect_Streams_FFMPEG(self):
-        self._info['Streams'] = []
-
-        # skip file formats that interfere and can cause strange results (pdf is oga?!)
-        # or file formats not supported (yet?)
-        if (self.image_mime[1] in ['pdf']) or (self.image_fileext in [u'.svg']):
-            return
-
-        # audio and video streams files (ogv, oga, ...)
-        d = self._util_get_DataStreams_FFMPEG()
-        if not d:
-            return
-
-        result = []
-        for s in d['streams']:
-            #print s
-            if   (s["codec_type"] == "video"):
-                rate = s["avg_frame_rate"]
-                dim = (int(s["width"]), int(s["height"]))
-                #asp  = s["display_aspect_ratio"]
-            elif (s["codec_type"] == "audio"):
-# switch this part off since 'ffprobe' (on toolserver) is too old
-#                rate = u'%s/%s/%s' % (s["channels"], s["sample_fmt"], s["sample_rate"])
-                rate = u'%s/%s/%s' % (s["channels"], u'-', int(float(s["sample_rate"])))
-                dim  = None
-            elif (s["codec_type"] == "data"):
-                rate = None
-                dim  = None
-
-            result.append({ 'ID':         int(s["index"]) + 1,
-                            'Format':     u'%s/%s' % (s["codec_type"], s.get("codec_name",u'?')),
-                            'Rate':       rate or u'-',
-                            'Dimensions': dim or (None, None),
-                            })
-
-        if 'image' not in d["format"]["format_name"]:
-            self._info['Streams'] = result
-        return
-
     def _util_merge_Regions(self, regs, sub=False, overlap=False, close=False):
         # sub=False, overlap=False, close=False ; level 0 ; similar regions, similar position (default)
         # sub=True,  overlap=False, close=False ; level 1 ; region contained in other, any shape/size
@@ -2410,70 +2467,74 @@
return (regs, drop)
-    def _detect_AudioFeatures_MUSIC21(self):
-        # skip file formats not supported
-        if (self.image_mime[1] not in ['midi']):
-            return
-        self._info['Audio'] = []
+class PngFile(JpegFile):
+    pass
-        import _music21 as music21
+class GifFile(JpegFile):
+    pass
-        #audiofile = '/home/ursin/Desktop/3_Ships.mid'
-        audiofile = self.image_path
+class TiffFile(JpegFile):
+    pass
-        #music21.features.jSymbolic.getCompletionStats()
-        try:
-            #s = music21.converter.ConverterMidi()
-            #s.parseFile(audiofile)
-            #s = s.stream
-            s = music21.midi.translate.midiFilePathToStream(audiofile)
-        except music21.midi.base.MidiException:
-            pywikibot.warning(u'unknown file type [_detect_AudioFeatures_MUSIC21]')
+class XcfFile(JpegFile):
+    pass
+
+class SvgFile(JpegFile):
+    pass
+
+class PdfFile(JpegFile):
+    pass
+
+#class DjvuFile(JpegFile):
+#    pass
+
+
+class OggFile(JpegFile):
+    def getFeatures(self):
+        # general handling of all audio and video formats
+        self._detect_Streams_FFMPEG()
+
+        # general audio feature extraction
+#        self._detect_AudioFeatures_YAAFE()
+
+    def _detect_Streams_FFMPEG(self):
+        self._info['Streams'] = []
+
+        # skip file formats that interfere and can cause strange results (pdf is oga?!)
+        # or file formats not supported (yet?)
+        if (self.image_mime[1] in ['pdf']) or (self.image_fileext in [u'.svg']):
             return
-        #fs = music21.features.jSymbolic.extractorsById
-        #for k in fs:
-        #    for i in range(len(fs[k])):
-        #        if fs[k][i] is not None:
-        #            n = fs[k][i].__name__
-        #            if fs[k][i] not in music21.features.jSymbolic.featureExtractors:
-        #                n += " (not implemented)"
-        #                print k, i, n
-        #            else:
-        #                fe = fs[k][i](s)
-        #                print k, i, n,
-        #                try:
-        #                    f = fe.extract()
-        #                    print f.name, f.vector
-        #                except AttributeError:
-        #                    print "ERROR"
-        data = {}
-        for item in ['MostCommonPitchFeature',
-                     'ImportanceOfBassRegisterFeature',
-                     'ImportanceOfMiddleRegisterFeature',
-                     'ImportanceOfHighRegisterFeature',
-                     'AverageNoteDurationFeature',
-                     'MaximumNoteDurationFeature',
-                     #'DurationFeature',
-                     'InitialTempoFeature',
-                     'MaximumNumberOfIndependentVoicesFeature',
-                     'AverageNumberOfIndependentVoicesFeature',]:
-            fe = getattr(music21.features.jSymbolic, item)(s)
-            f = fe.extract()
-            #data[f.name] = f.vector[0]
-            data[item.replace('Feature', '')] = f.vector[0]
-        #print s.duration
-        data['Duration'] = s.highestTime
-        #print s.offsetMap
-        #print s.measureOffsetMap()
-        data['Metadata'] = s.metadata
-        data['Lyrics']   = s.lyrics()
-        #print s.seconds
-        #print s.secondsMap
+        # audio and video streams files (ogv, oga, ...)
+        d = self._util_get_DataStreams_FFMPEG()
+        if not d:
+            return
-        self._info['Audio'] = [data]
-#        print self._info['Audio']
+        result = []
+        for s in d['streams']:
+            #print s
+            if   (s["codec_type"] == "video"):
+                rate = s["avg_frame_rate"]
+                dim = (int(s["width"]), int(s["height"]))
+                #asp  = s["display_aspect_ratio"]
+            elif (s["codec_type"] == "audio"):
+# switch this part off since 'ffprobe' (on toolserver) is too old
+#                rate = u'%s/%s/%s' % (s["channels"], s["sample_fmt"], s["sample_rate"])
+                rate = u'%s/%s/%s' % (s["channels"], u'-', int(float(s["sample_rate"])))
+                dim  = None
+            elif (s["codec_type"] == "data"):
+                rate = None
+                dim  = None
+
+            result.append({ 'ID':         int(s["index"]) + 1,
+                            'Format':     u'%s/%s' % (s["codec_type"], s.get("codec_name",u'?')),
+                            'Rate':       rate or u'-',
+                            'Dimensions': dim or (None, None),
+                            })
+
+        if 'image' not in d["format"]["format_name"]:
+            self._info['Streams'] = result
         return
def _detect_AudioFeatures_YAAFE(self):
@@ -2634,10 +2695,125 @@
         return
+class MidiFile(UnknownFile):
+    def getProperties(self):
+        result = {}
+        result.update( self._detect_HeaderAndMetadata() )   # Metadata
+        result.update( self._detect_Properties_PIL() )      # Properties
+        return result
+
+    def getFeatures(self):
+        result = {}
+        result.update( self._detect_AudioFeatures_MUSIC21() )   # Audio
+        return result
+
+    def _detect_HeaderAndMetadata(self):
+        result = {}
+
+        # extract data from midi file
+        # http://valentin.dasdeck.com/midi/midifile.htm
+        # http://stackoverflow.com/questions/3943149/reading-and-interpreting-data-fro...
+        ba = bytearray(open(self.filename, 'rb').read())
+        i = -1
+        for key, data in [('Text', '\x01'), ('Copyright', '\x02'), ('Lyrics', '\x05')]:
+            result[key] = []
+            while True:
+                i = ba.find('\xff%s' % data, i+1)
+                if i < 0:       # something found?
+                    break
+                e = (i+3+ba[i+2])
+                if ba[e] != 0:  # length match with string end (00)?
+                    e = ba.find('\x00', (i+3+ba[i+2]))
+                result[key].append(ba[i+3:e].decode('latin-1').strip())
+            result[key] = u'\n'.join(result[key])
+
+        import _music21 as music21
+        try:
+            s = music21.converter.parse(self.filename)
+            if s.metadata:
+                pywikibot.output(unicode(s.metadata))
+                result['Metadata'].update(s.metadata)
+        except music21.midi.base.MidiException:
+            pass
+
+        self._info['Metadata'] = [result]
+        return {'Metadata': [result]}
+
+    # midi audio feature extraction
+    def _detect_AudioFeatures_MUSIC21(self):
+        # skip file formats not supported
+        if (self.image_mime[1] not in ['midi']):
+            return
+
+        import _music21 as music21
+
+        #audiofile = '/home/ursin/Desktop/3_Ships.mid'
+        audiofile = self.image_path
+
+        #music21.features.jSymbolic.getCompletionStats()
+        try:
+            #s = music21.midi.translate.midiFilePathToStream(audiofile)
+            s = music21.converter.parse(audiofile)
+        except music21.midi.base.MidiException:
+            pywikibot.warning(u'unknown file type [_detect_AudioFeatures_MUSIC21]')
+            return
+
+        #fs = music21.features.jSymbolic.extractorsById
+        #for k in fs:
+        #    for i in range(len(fs[k])):
+        #        if fs[k][i] is not None:
+        #            n = fs[k][i].__name__
+        #            if fs[k][i] not in music21.features.jSymbolic.featureExtractors:
+        #                n += " (not implemented)"
+        #                print k, i, n
+        #            else:
+        #                fe = fs[k][i](s)
+        #                print k, i, n,
+        #                try:
+        #                    f = fe.extract()
+        #                    print f.name, f.vector
+        #                except AttributeError:
+        #                    print "ERROR"
+        data = {'RegisterImportance': (music21.features.jSymbolic.ImportanceOfBassRegisterFeature(s).extract().vector[0],
+                                       music21.features.jSymbolic.ImportanceOfMiddleRegisterFeature(s).extract().vector[0],
+                                       music21.features.jSymbolic.ImportanceOfHighRegisterFeature(s).extract().vector[0],),
+                      'NoteDuration': (music21.features.jSymbolic.AverageNoteDurationFeature(s).extract().vector[0],
+                                       music21.features.jSymbolic.MaximumNoteDurationFeature(s).extract().vector[0],),
+                 'IndependentVoices': (music21.features.jSymbolic.AverageNumberOfIndependentVoicesFeature(s).extract().vector[0],
+                                       music21.features.jSymbolic.MaximumNumberOfIndependentVoicesFeature(s).extract().vector[0],),
+                   'MostCommonPitch': music21.features.jSymbolic.MostCommonPitchFeature(s).extract().vector[0],
+                             'Tempo': music21.features.jSymbolic.InitialTempoFeature(s).extract().vector[0],
+                          'Duration': s.highestTime,
+                          #'Metadata': s.metadata if s.metadata else u'',
+                            'Lyrics': s.lyrics(recurse=True) if s.lyrics(recurse=True) else u'',}
+        #print music21.text.assembleLyrics(s)
+        #print s.duration
+        #print s.offsetMap
+        #print s.measureOffsetMap()
+        #print s.seconds
+        #print s.secondsMap
+
+        self._info['Audio'] += [data]
+        return {'Audio': [data]}
+
+
+FILETYPES = {                        '*': UnknownFile,
+             (      'image',     'jpeg'): JpegFile,
+             (      'image',      'png'): PngFile,
+             (      'image',      'gif'): GifFile,
+             (      'image',     'tiff'): TiffFile,
+             (      'image',    'x-xcf'): XcfFile,
+             (      'image',  'svg+xml'): SvgFile,
+             ('application',      'pdf'): PdfFile,
+#             (      'image', 'vnd.djvu'): DjvuFile,
+             ('application',      'ogg'): OggFile,
+             (      'audio',     'midi'): MidiFile,}
+
+
 # all classification and categorization methods and definitions - default variation
 #  use simplest classification I can think of (self-made) and do categorization
 #  mostly based on filtered/reported features
-class CatImages_Default(FileData):
+class CatImages_Default(object):
     #ignore = []
     ignore = ['color']
@@ -2854,6 +3030,21 @@
return (u'Graphics', bool(relevance))
+#    # Category:MIDI files created with GNU LilyPond
+#    def _cat_audio_MIDIfilescreatedwithGNULilyPond(self):
+#        # find metadata in extracted data
+#        print [item.strip() for item in re.findall('Generated .*?\n', u'\n'.join(result['Text']))]
+#        #u"Cr'eateur: GNU LilyPond 2.0.1"
+#        import dateutil.parser
+#        dates = []
+#        for line in result['Text']:
+#            # http://stackoverflow.com/questions/3276180/extracting-date-from-a-string-in-...
+#            try:
+#                dates.append(dateutil.parser.parse(line, fuzzy=True).isoformat(' ').decode('utf-8'))
+#            except ValueError:
+#                pass
+#        print dates
+
     # Category:Categorized by DrTrigonBot
     def _addcat_BOT(self):
         # - ALWAYS -
@@ -3099,7 +3290,7 @@
         self.image_mime = re.split('[/;\s]', m.file(self.image_path))
         #self.image_size = (None, None)
         mime = mimetypes.guess_all_extensions('%s/%s' % tuple(self.image_mime[0:2]))
-        if self.image_fileext.lower() not in mime:
+        if mime and (self.image_fileext.lower() not in mime):
             pywikibot.warning(u'File extension does not match MIME type! File extension should be %s.' % mime)
# SVG: rasterize the SVG to bitmap (MAY BE GET FROM WIKI BY DOWNLOAD?...)
@@ -3146,6 +3337,9 @@
             #data = Popen("identify -verbose info: %s" % self.image_path,
             #             shell=True, stderr=PIPE).stderr.read()
             #print data
+            if not os.path.exists(self.image_path_JPEG):
+                # xcf can have more than 1 layer/page like gif, tiff, and movies...
+                self.image_path_JPEG = self.image_path_JPEG.replace('.jpg', '-0.jpg')
             self.image_size = Image.open(self.image_path_JPEG).size
         else:
             try:
@@ -3168,6 +3362,8 @@
self.image_size = (img.shape[1], img.shape[0])
                     except:
+                        if os.path.exists(self.image_path_JPEG):
+                            os.remove(self.image_path_JPEG)
                         self.image_path_JPEG = self.image_path
             except:
                 self.image_path_JPEG = self.image_path
@@ -3502,69 +3698,38 @@
# gather data from all information interfaces
     def gatherFeatures(self):
-        # Image size
-        self._detect_Properties_PIL()
-        
+        self._info['Properties'] = [{'Format': u'-', 'Pages': 0}]
+        self._info['Metadata'] = []
+        self._info['ColorAverage'] = []
+        self._info['ColorRegions'] = []
         self._info['Faces'] = []
-        # Faces (extract EXIF data)
-        self._detect_Faces_EXIF()
-        # Faces and eyes (opencv pre-trained haar)
-        self._detect_Faces_CV()
-# TODO: test and use or switch off
-        # Face via Landmark(s)
-#        self._detect_FaceLandmark_xBOB()
-        # exclude duplicates (CV and EXIF)
-        faces = [item['Position'] for item in self._info['Faces']]
-        for i in self._util_merge_Regions(faces)[1]:
-            del self._info['Faces'][i]
+        self._info['OpticalCodes'] = []
+        self._info['People'] = []
+        self._info['Chessboard'] = []
+        self._info['Text'] = []
+        self._info['Streams'] = []
+        self._info['Audio'] = []
+        self._info['Legs'] = []
+        self._info['Hands'] = []
+        self._info['Torsos'] = []
+        self._info['Ears'] = []
+        self._info['Eyes'] = []
+        self._info['Automobiles'] = []
-        # Segments and colors
-        self._detect_SegmentColors_JSEGnPIL()
-        # Average color
-        self._detect_AverageColor_PILnCV()
+        # split detection and extraction by file type: pick the class (JpegFile, ...) matching the MIME type, else FILETYPES['*']
+        TypeFile = FILETYPES.get(tuple(self.image_mime[:2]), FILETYPES['*'])
+        tf = TypeFile(self.image_path)
+        import copy
+        tf.__dict__.update(copy.deepcopy(self.__dict__))
+        for func in ['getProperties', 'getFeatures']:
+            result = getattr(tf, func)()
+            if result:
+                self._info.update(result)
+#        print self._info
+#        print tf._info
+        #print tf.__dict__
+        self._info = tf._info
-        # People/Pedestrian (opencv pre-trained hog and haarcascade)
-        self._detect_People_CV()
-
-        # Geometric object (opencv hough line, circle, edges, corner, ...)
-        self._detect_Geometry_CV()
-
-        # general (opencv pre-trained, third-party and self-trained haar
-        # and cascade) classification
-        # http://www.computer-vision-software.com/blog/2009/11/faq-opencv-haartraining...
-        for cf in self.cascade_files:
-            self._detect_Trained_CV(*cf)
-
-        # optical and other text recognition (tesseract & ocropus, ...)
-        self._detect_EmbeddedText_poppler()
-#        self._recognize_OpticalText_ocropus()
-        # (may be just classify as 'contains text', may be store text, e.g. to wikisource)
-
-        # barcode and Data Matrix recognition (libdmtx/pydmtx, zbar, gocr?)
-        self._recognize_OpticalCodes_dmtxNzbar()
-
-        # Chessboard (opencv reference detector)
-        self._detect_Chessboard_CV()
-
-        # general (self-trained) detection WITH classification
-        # BoW: uses feature detection (SIFT, SURF, ...) AND classification (SVM, ...)
-#        self._detectclassify_ObjectAll_CV()
-        # Wavelet: uses wavelet transformation AND classification (machine learning)
-#        self._detectclassify_ObjectAll_PYWT()
-
-        # general handling of all audio and video formats
-        self._detect_Streams_FFMPEG()
-
-        # general file EXIF history information
-        self._detect_History_EXIF()
-
-        # general audio feature extraction
-#        self._detect_AudioFeatures_YAAFE()
-
-        # midi audio feature extraction
-# TODO: improve midi/audio stuff ...
-        self._detect_AudioFeatures_MUSIC21()
-
     def _existInformation(self, info, ignore = ['Properties', 'ColorAverage']):
         result = []
         for item in info:
@@ -3684,6 +3849,11 @@
         result = self._info['Streams']
         return {'Streams': result}
+#    def _filter_Audio(self):
+#        # use all, (should be reliable)
+#        result = self._info['Audio']
+#        return {'Audio': result}
+
     #def _filter_Geometry(self):
     #    result = []
     #    for item in self._info['Geometry']: