http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11623
Revision: 11623
Author:   drtrigon
Date:     2013-06-07 22:29:18 +0000 (Fri, 07 Jun 2013)

Log Message:
-----------
improvement; code cleanup and class conversion more or less finished (method renaming not done yet)
new feature; initial audio midi support (music21) and own metadata part (Category:MIDI files created with GNU LilyPond)
Modified Paths:
--------------
    trunk/pywikipedia/catimages.py
Modified: trunk/pywikipedia/catimages.py
===================================================================
--- trunk/pywikipedia/catimages.py	2013-06-07 18:21:31 UTC (rev 11622)
+++ trunk/pywikipedia/catimages.py	2013-06-07 22:29:18 UTC (rev 11623)
@@ -149,7 +149,27 @@
         self.filename = filename
         self.image_size = (None, None)
-        self._info = {}
+        # available file properties and metadata
+        self._properties = { 'Properties': [{'Format': u'-', 'Pages': 0}],
+                             'Metadata': [], }
+        # available feature to extract
+        self._features = { 'ColorAverage': [],
+                           'ColorRegions': [],
+                           'Faces': [],
+                           'People': [],
+                           'OpticalCodes': [],
+                           'Chessboard': [],
+                           'History': [],
+                           'Text': [],
+                           'Streams': [],
+                           'Audio': [],
+                           'Legs': [],
+                           'Hands': [],
+                           'Torsos': [],
+                           'Ears': [],
+                           'Eyes': [],
+                           'Automobiles': [],
+                           'Classify': [], }
     def __enter__(self):
         return self
@@ -158,22 +178,22 @@
         pass
     def getProperties(self):
-        result = {}
-        result.update( self._detect_HeaderAndMetadata() )    # Metadata
-        result.update( self._detect_Properties_PIL() )       # Properties
-        return result
+        self._detect_HeaderAndMetadata()    # Metadata
+        self._detect_Properties()           # Properties
+        return self._properties
     def getFeatures(self):
         pywikibot.warning(u"File format '%s/%s' not supported (yet)!" % tuple(self.image_mime[:2]))
+        return self._features
     def _detect_HeaderAndMetadata(self):
         # check/look into the file by midnight commander (mc)
         # https://pypi.python.org/pypi/hachoir-metadata
-        return {}
+        pass
-    def _detect_Properties_PIL(self):
+    def _detect_Properties(self):
         # get mime-type file-size, ...
-        return {}
+        pass
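A minimal sketch of the new base-class contract described in the log message (an illustrative stand-in only; the real UnknownFile carries more state): every detector now fills the shared _properties/_features dicts in place, and the two getters simply return them instead of each _detect_* method returning its own dict.

    # Illustrative stand-in for the restructured UnknownFile contract
    # (not the real catimages.py code): detectors fill shared dicts in place.
    class _FileBase(object):
        def __init__(self, filename):
            self.filename = filename
            self._properties = {'Properties': [{'Format': u'-', 'Pages': 0}],
                                'Metadata': []}
            self._features = dict((key, []) for key in
                                  ('ColorAverage', 'ColorRegions', 'Faces', 'People',
                                   'OpticalCodes', 'Chessboard', 'History', 'Text',
                                   'Streams', 'Audio', 'Legs', 'Hands', 'Torsos',
                                   'Ears', 'Eyes', 'Automobiles', 'Classify'))

        def getProperties(self):
            self._detect_HeaderAndMetadata()   # fills self._properties['Metadata']
            self._detect_Properties()          # fills self._properties['Properties']
            return self._properties

        def getFeatures(self):
            return self._features              # subclasses fill it before returning

        def _detect_HeaderAndMetadata(self):
            pass

        def _detect_Properties(self):
            pass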
 class JpegFile(UnknownFile):
@@ -197,14 +217,14 @@
         self.image_filename = os.path.split(self.filename)[-1]
         self.image_fileext = os.path.splitext(self.image_filename)[1]
-        self.image_path = os.path.join(scriptdir, ('cache/' + self.image_filename[-128:]))
+        self.image_path = self.filename
         self.image_path_JPEG = self.image_path + '.jpg'
self._convert()
     def __exit__(self, type, value, traceback):
-        if os.path.exists(self.image_path):
-            os.remove( self.image_path )
+        #if os.path.exists(self.image_path):
+        #    os.remove( self.image_path )
         if os.path.exists(self.image_path_JPEG):
             os.remove( self.image_path_JPEG )
         #image_path_new = self.image_path_JPEG.replace(u"cache/", u"cache/0_DETECTED_")
@@ -220,9 +240,9 @@
         # Face via Landmark(s)
#        self._detect_FaceLandmark_xBOB()
         # exclude duplicates (CV and EXIF)
-        faces = [item['Position'] for item in self._info['Faces']]
+        faces = [item['Position'] for item in self._features['Faces']]
         for i in self._util_merge_Regions(faces)[1]:
-            del self._info['Faces'][i]
+            del self._features['Faces'][i]
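The duplicate exclusion above relies on _util_merge_Regions (unchanged elsewhere in the file); as a rough, hypothetical illustration of the idea only, faces reported twice (once by OpenCV, once from EXIF tags) can be dropped when their bounding boxes overlap strongly.

    # Hypothetical sketch of the duplicate criterion; the real logic lives in
    # _util_merge_Regions.  Boxes are (x, y, w, h).
    def overlap_ratio(a, b):
        """Intersection area over the smaller box area."""
        ax, ay, aw, ah = a
        bx, by, bw, bh = b
        iw = max(0, min(ax + aw, bx + bw) - max(ax, bx))
        ih = max(0, min(ay + ah, by + bh) - max(ay, by))
        return (iw * ih) / float(min(aw * ah, bw * bh)) if iw * ih else 0.0

    def duplicate_indices(boxes, threshold=0.8):
        """Indices of boxes that largely overlap an earlier box."""
        return [i for i, box in enumerate(boxes)
                if any(overlap_ratio(box, boxes[j]) >= threshold for j in range(i))]

    # deleting back-to-front keeps the remaining indices valid:
    #   for i in sorted(duplicate_indices(faces), reverse=True):
    #       del features['Faces'][i]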
# Segments and colors self._detect_SegmentColors_JSEGnPIL() @@ -241,11 +261,6 @@ for cf in self.cascade_files: self._detect_Trained_CV(*cf)
- # optical and other text recognition (tesseract & ocropus, ...) - self._detect_EmbeddedText_poppler() -# self._recognize_OpticalText_ocropus() - # (may be just classify as 'contains text', may be store text, e.g. to wikisource) - # barcode and Data Matrix recognition (libdmtx/pydmtx, zbar, gocr?) self._recognize_OpticalCodes_dmtxNzbar()
@@ -260,6 +275,8 @@
         # general file EXIF history information
         self._detect_History_EXIF()
+
+        return self._features
# supports a lot of different file types thanks to PIL def _convert(self): @@ -293,18 +310,18 @@ # http://code.google.com/p/pylibtiff/
# MIME: 'image/jpeg; charset=binary', ... - def _detect_Properties_PIL(self): + def _detect_Properties(self): """Retrieve as much file property info possible, especially the same as commons does in order to compare if those libraries (ImageMagick, ...) are buggy (thus explicitely use other software for independence)""" - #self.image_size = (None, None) + result = {'Format': u'-', 'Pages': 0}
try: i = Image.open(self.image_path) except IOError: pywikibot.warning(u'unknown file type [JpegFile]') - return {'Properties': [result]} + return
# http://mail.python.org/pipermail/image-sig/1999-May/000740.html pc=0 # count number of pages @@ -325,23 +342,22 @@
#self.image_size = i.size
- result = { #'bands': i.getbands(), - #'bbox': i.getbbox(), - 'Format': i.format, - 'Mode': i.mode, - #'info': i.info, - #'stat': os.stat(self.image_path), - 'Palette': str(len(i.palette.palette)) if i.palette else u'-', - 'Pages': pc, } + result.update({ #'bands': i.getbands(), + #'bbox': i.getbbox(), + 'Format': i.format, + 'Mode': i.mode, + #'info': i.info, + #'stat': os.stat(self.image_path), + 'Palette': str(len(i.palette.palette)) if i.palette else u'-', + 'Pages': pc, + 'Dimensions': self.image_size, + 'Filesize': os.path.getsize(self.filename), + 'MIME': u'%s/%s' % tuple(self.image_mime[:2]), })
-        result['Dimensions'] = self.image_size
-        result['Filesize'] = os.path.getsize(self.image_path)
-        result['MIME'] = u'%s/%s' % tuple(self.image_mime[:2])
+        #self._properties['Properties'] = [result]
+        self._properties['Properties'][0].update(result)
+        return
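A condensed, standalone sketch of the PIL-based property probe assembled above (assumes PIL/Pillow; page counting by seeking frames follows the approach referenced in the linked image-sig post):

    import os
    from PIL import Image

    def probe_image(path):
        info = {'Format': u'-', 'Pages': 0}
        try:
            im = Image.open(path)
        except IOError:
            return info                       # unknown or unreadable file type
        pages = 0
        try:
            while True:                       # count frames/pages of multi-page formats
                im.seek(pages)
                pages += 1
        except EOFError:
            pass
        info.update({'Format': im.format,
                     'Mode': im.mode,
                     'Palette': str(len(im.palette.palette)) if im.palette else u'-',
                     'Pages': pages,
                     'Dimensions': im.size,
                     'Filesize': os.path.getsize(path)})
        return info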
- #self._info['Properties'] = [result] - self._info['Properties'][0].update(result) - return {'Properties': [result]} - # .../opencv/samples/c/facedetect.cpp # http://opencv.willowgarage.com/documentation/python/genindex.html def _detect_Faces_CV(self): @@ -354,11 +370,6 @@ # http://blog.jozilla.net/2008/06/27/fun-with-python-opencv-and-face-detection... # http://www.cognotics.com/opencv/servo_2007_series/part_4/index.html
- # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - # https://code.ros.org/trac/opencv/browser/trunk/opencv_extra/testdata/gpu/haa... xml = os.path.join(scriptdir, 'externals/opencv/haarcascades/haarcascade_eye_tree_eyeglasses.xml') #xml = os.path.join(scriptdir, 'externals/opencv/haarcascades/haarcascade_eye.xml') @@ -402,7 +413,6 @@ raise IOError(u"No such file: '%s'" % xml) cascaderightear = cv2.CascadeClassifier(xml)
- #self._info['Faces'] = [] scale = 1. # So, to find an object of an unknown size in the image the scan # procedure should be done several times at different scales. @@ -651,7 +661,7 @@ # cv2.imwrite( image_path_new, img )
         #return faces.tolist()
-        self._info['Faces'] += result
+        self._features['Faces'] += result
         return
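For orientation, the detection pattern behind _detect_Faces_CV boils down to the usual cv2 Haar-cascade call; a minimal sketch (the cascade path is an assumption, any frontal-face cascade shipped with OpenCV will do):

    import cv2

    def detect_faces(jpeg_path, cascade_xml='haarcascade_frontalface_alt.xml'):
        img = cv2.imread(jpeg_path)
        if img is None:
            return []
        gray = cv2.equalizeHist(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
        cascade = cv2.CascadeClassifier(cascade_xml)
        if cascade.empty():
            raise IOError(u"No such file: '%s'" % cascade_xml)
        # list of (x, y, w, h) bounding boxes
        return [tuple(r) for r in cascade.detectMultiScale(gray, 1.1, 3)]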
def _util_get_Pose_solvePnP(self, D3points, D2points, shape): @@ -728,7 +738,6 @@ """Prints the locations of any face landmark(s) found, respective converts them to usual face position data"""
- #self._info['Faces'] = [] scale = 1. try: #video = bob.io.VideoReader(self.image_path_JPEG.encode('utf-8')) @@ -794,7 +803,7 @@ #cv2.imshow("people detector", img) #cv2.waitKey()
-        self._info['Faces'] += result
+        self._features['Faces'] += result
         return
# .../opencv/samples/cpp/peopledetect.cpp @@ -806,7 +815,6 @@ # http://opencv.willowgarage.com/documentation/cpp/basic_structures.html # http://www.pygtk.org/docs/pygtk/class-gdkrectangle.html
- self._info['People'] = [] scale = 1. try: img = cv2.imread(self.image_path_JPEG, cv.CV_LOAD_IMAGE_COLOR) @@ -891,22 +899,15 @@ #cv2.imshow("people detector", img) #c = cv2.waitKey(0) & 255
-        self._info['People'] = result
+        self._features['People'] = result
         return
def _detect_Geometry_CV(self): - self._info['Geometry'] = [] - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - result = self._util_get_Geometry_CVnSCIPY()
-        self._info['Geometry'] = [{'Lines': result['Lines'],
-                                   'Circles': result['Circles'],
-                                   'Corners': result['Corners'],}]
+        self._features['Geometry'] = [{'Lines': result['Lines'],
+                                       'Circles': result['Circles'],
+                                       'Corners': result['Corners'],}]
         return
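A hedged sketch of the kind of probe behind _util_get_Geometry_CVnSCIPY that feeds the 'Geometry' feature (Hough lines/circles plus corner points; the parameters and the OpenCV 2.x constant are assumptions, not the values catimages.py uses):

    import cv2
    import numpy as np

    def probe_geometry(jpeg_path):
        img = cv2.imread(jpeg_path, 0)                      # grayscale
        if img is None:
            return {'Lines': 0, 'Circles': 0, 'Corners': 0}
        edges = cv2.Canny(img, 50, 150)
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 80,
                                minLineLength=30, maxLineGap=10)
        circles = cv2.HoughCircles(img, cv2.cv.CV_HOUGH_GRADIENT, 1.2, 20)
        corners = cv2.goodFeaturesToTrack(img, 100, 0.01, 10)
        return {'Lines':   0 if lines   is None else len(lines.reshape(-1, 4)),
                'Circles': 0 if circles is None else len(circles.reshape(-1, 3)),
                'Corners': 0 if corners is None else len(corners)}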
# https://code.ros.org/trac/opencv/browser/trunk/opencv/samples/python/houghli... @@ -1083,8 +1084,6 @@ # parts of code here should/have to be placed into e.g. a own # class in 'dtbext/opencv/__init__.py' script/module
- self._info['Classify'] = [] - trained = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', @@ -1123,7 +1122,7 @@ # http://www.xrce.xerox.com/layout/set/print/content/download/18763/134049/fil... # http://people.csail.mit.edu/torralba/shortCourseRLOC/index.html
-        self._info['Classify'] = [dict([ (trained[i], r) for i, r in enumerate(result) ])]
+        self._features['Classify'] = [dict([ (trained[i], r) for i, r in enumerate(result) ])]
         return
def _detectclassify_ObjectAll_PYWT(self): @@ -1167,13 +1166,6 @@ # http://code.google.com/p/pymeanshift/wiki/Examples # (http://pythonvision.org/basic-tutorial, http://luispedro.org/software/mahotas, http://packages.python.org/pymorph/) def _detect_SegmentColors_JSEGnPIL(self): # may be SLIC other other too... - self._info['ColorRegions'] = [] - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - try: #im = Image.open(self.image_path).convert(mode = 'RGB') im = Image.open(self.image_path_JPEG) @@ -1220,7 +1212,7 @@ result.append( data ) i += 1
-        self._info['ColorRegions'] = result
+        self._features['ColorRegions'] = result
         return
# http://stackoverflow.com/questions/2270874/image-color-detection-using-pytho... @@ -1230,13 +1222,6 @@ # http://en.wikipedia.org/wiki/Color_difference # http://www.farb-tabelle.de/en/table-of-color.htm def _detect_AverageColor_PILnCV(self): - self._info['ColorAverage'] = [] - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - try: # we need to have 3 channels (but e.g. grayscale 'P' has only 1) #i = Image.open(self.image_path).convert(mode = 'RGB') @@ -1249,7 +1234,7 @@ result = self._util_average_Color_colormath(h) result['Gradient'] = self._util_get_Geometry_CVnSCIPY().get('Edge_Ratio', None) or '-' result['FFT_Peaks'] = self._util_get_Geometry_CVnSCIPY().get('FFT_Peaks', None) or '-' - self._info['ColorAverage'] = [result] + self._features['ColorAverage'] = [result] return
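The average-colour computation itself is straightforward; a small sketch in the spirit of _detect_AverageColor_PILnCV (Image.histogram() returns 256 bins per band, and the weighted mean of each band is the average RGB value):

    from PIL import Image

    def average_color(path):
        im = Image.open(path).convert('RGB')
        hist = im.histogram()                 # 768 values: R, G and B bins
        means = []
        for band in range(3):
            bins = hist[band * 256:(band + 1) * 256]
            total = float(sum(bins)) or 1.0
            means.append(sum(i * n for i, n in enumerate(bins)) / total)
        return tuple(means)                   # e.g. (123.4, 98.7, 210.0)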
# http://stackoverflow.com/questions/2270874/image-color-detection-using-pytho... @@ -1509,13 +1494,6 @@
# analogue to face detection:
- self._info[info_desc] = [] - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - # http://tutorial-haartraining.googlecode.com/svn/trunk/data/haarcascades/ # or own xml files trained onto specific file database/set xml = os.path.join(scriptdir, ('externals/opencv/haarcascades/' + cascade_file)) @@ -1560,170 +1538,9 @@
# generic detection ...
-        self._info[info_desc] = result
+        self._features[info_desc] = result
         return
- # ./run-test (ocropus/ocropy) - # (in fact all scripts/executables used here are pure python scripts!!!) - def _recognize_OpticalText_ocropus(self): - # optical text recognition (tesseract & ocropus, ...) - # (no full recognition but - at least - just classify as 'contains text') - # http://www.claraocr.org/de/ocr/ocr-software/open-source-ocr.html - # https://github.com/edsu/ocropy - # http://de.wikipedia.org/wiki/Benutzer:DrTrigonBot/Doku#Categorization - # Usage:tesseract imagename outputbase [-l lang] [configfile [[+|-]varfile]...] - # tesseract imagename.tif output - - # (it's simpler to run the scripts/executables in own environment/interpreter...) - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']): - return - - path = os.path.join(scriptdir, 'dtbext/_ocropus/ocropy') - - curdir = os.path.abspath(os.curdir) - os.chdir(path) - - # binarization - if os.path.exists(os.path.join(path, "temp")): - shutil.rmtree(os.path.join(path, "temp")) - if os.system("ocropus-nlbin %s -o %s" % (self.image_path_JPEG, os.path.join(path, "temp"))): - raise ImportError("ocropus not found!") - - # page level segmentation - if os.system("ocropus-gpageseg --minscale 6.0 '%s'" % os.path.join(path, "temp/????.bin.png")): - # detection error - return - - # raw text line recognition - if os.system("ocropus-lattices --writebestpath '%s'" % os.path.join(path, "temp/????/??????.bin.png")): - # detection error - return - - # language model application - # (optional - improve the raw results by applying a pretrained model) - os.environ['OCROPUS_DATA'] = os.path.join(path, "models/") - if os.system("ocropus-ngraphs '%s'" % os.path.join(path, "temp/????/??????.lattice")): - # detection error - return - - # create hOCR output - if os.system("ocropus-hocr '%s' -o %s" % (os.path.join(path, "temp/????.bin.png"), os.path.join(path, "temp.html"))): - # detection error - return - - ## 'create HTML for debugging (use "firefox temp/index.html" to view)' - ## (optional - generate human readable debug output) - #if os.system("ocropus-visualize-results %s" % os.path.join(path, "temp")): - # # detection error - # return - - # "to see recognition results, type: firefox temp.html" - # "to see details on the recognition process, type: firefox temp/index.html" - tmpfile = open(os.path.join(path, "temp.html"), 'r') - data = tmpfile.read() - tmpfile.close() - - shutil.rmtree(os.path.join(path, "temp")) - os.remove(os.path.join(path, "temp.html")) - - os.chdir(curdir) - - #print data - pywikibot.output(data) - - def _detect_EmbeddedText_poppler(self): - # may be also: http://www.reportlab.com/software/opensource/rl-toolkit/ - - self._info['Text'] = [] - - if not (self.image_mime[1] == 'pdf'): - return - - # poppler pdftotext/pdfimages - # (similar as in '_util_get_DataTags_EXIF' but with stderr and no json output) - # http://poppler.freedesktop.org/ - # http://www.izzycode.com/bash/how-to-install-pdf2text-on-centos-fedora-redhat... - # MIGHT BE BETTER TO USE AS PYTHON MODULE: - # https://launchpad.net/poppler-python/ - # http://stackoverflow.com/questions/2732178/extracting-text-from-pdf-with-pop... - # http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to... 
- #proc = Popen("pdftotext -layout %s %s" % (self.image_path, self.image_path+'.txt'), - proc = Popen("pdftotext %s %s" % (self.image_path, self.image_path+'.txt'), - shell=True, stderr=PIPE)#.stderr.readlines() - proc.wait() - if proc.returncode: - raise ImportError("pdftotext not found!") - data = open(self.image_path+'.txt', 'r').readlines() - os.remove( self.image_path+'.txt' ) - -# self._content_text = data - (s1, l1) = (len(u''.join(data)), len(data)) - - tmp_path = os.path.join(os.environ.get('TMP', '/tmp'), 'DrTrigonBot/') - os.mkdir( tmp_path ) -# switch this part off since 'pdfimages' (on toolserver) is too old; TS-1449 -# proc = Popen("pdfimages -p %s %s/" % (self.image_path, tmp_path), - proc = Popen("pdfimages %s %s/" % (self.image_path, tmp_path), - shell=True, stderr=PIPE)#.stderr.readlines() - proc.wait() - if proc.returncode: - raise ImportError("pdfimages not found!") - images = os.listdir( tmp_path ) -# pages = set() - for f in images: -# pages.add( int(f.split('-')[1]) ) - os.remove( os.path.join(tmp_path, f) ) - os.rmdir( tmp_path ) - - ## pdfminer (tools/pdf2txt.py) - ## http://denis.papathanasiou.org/?p=343 (for layout and images) - #debug = 0 - #laparams = layout.LAParams() - ## - #pdfparser.PDFDocument.debug = debug - #pdfparser.PDFParser.debug = debug - #cmapdb.CMapDB.debug = debug - #pdfinterp.PDFResourceManager.debug = debug - #pdfinterp.PDFPageInterpreter.debug = debug - #pdfdevice.PDFDevice.debug = debug - ## - #rsrcmgr = pdfinterp.PDFResourceManager(caching=True) - #outfp = StringIO.StringIO() - #device = converter.TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams) - ##device = converter.XMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams, outdir=None) - ##device = converter.HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1, - ## layoutmode='normal', laparams=laparams, outdir=None) - ##device = pdfdevice.TagExtractor(rsrcmgr, outfp, codec='utf-8') - #fp = file(self.image_path, 'rb') - #try: - # pdfinterp.process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', - # caching=True, check_extractable=False) - #except AssertionError: - # pywikibot.warning(u'pdfminer missed, may be corrupt [_detect_EmbeddedText_poppler]') - # return - #except TypeError: - # pywikibot.warning(u'pdfminer missed, may be corrupt [_detect_EmbeddedText_poppler]') - # return - #fp.close() - #device.close() - #data = outfp.getvalue().splitlines(True) - # - #(s2, l2) = (len(u''.join(data)), len(data)) - - result = { 'Size': s1, - 'Lines': l1, - #'Data': data, - #'Position': pos, -# 'Images': u'%s (on %s page(s))' % (len(images), len(list(pages))), # pages containing images - 'Images': u'%s' % len(images), - 'Type': u'-', } # 'Type' could be u'OCR' above... - - self._info['Text'] = [result] - - return - def _recognize_OpticalCodes_dmtxNzbar(self): # barcode and Data Matrix recognition (libdmtx/pydmtx, zbar, gocr?) # http://libdmtx.wikidot.com/libdmtx-python-wrapper @@ -1731,13 +1548,6 @@ # http://zbar.sourceforge.net/ # http://pypi.python.org/pypi/zbar
- self._info['OpticalCodes'] = [] - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - # DataMatrix from pydmtx import DataMatrix # linux distro package (fedora) / TS (debian)
@@ -1786,7 +1596,7 @@ 'Type': u'DataMatrix', 'Quality': 10, })
-        self._info['OpticalCodes'] = result
+        self._features['OpticalCodes'] = result
# supports many popular symbologies try: @@ -1823,7 +1633,7 @@
# further detection ?
-        self._info['OpticalCodes'] = result
+        self._features['OpticalCodes'] = result
         return
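For the zbar half of this detector, a minimal sketch of the classic Python 2 bindings (assumes old PIL where Image.tostring() exists; the result keys are illustrative, not necessarily the ones stored in 'OpticalCodes'):

    import zbar
    from PIL import Image

    def scan_barcodes(jpeg_path):
        pil = Image.open(jpeg_path).convert('L')       # zbar wants 8-bit grayscale
        width, height = pil.size
        scanner = zbar.ImageScanner()
        scanner.parse_config('enable')                 # enable all symbologies
        image = zbar.Image(width, height, 'Y800', pil.tostring())
        scanner.scan(image)
        return [{'Type': str(s.type), 'Data': s.data, 'Position': list(s.location)}
                for s in image]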
def _detect_Chessboard_CV(self): @@ -1832,13 +1642,6 @@ # http://www.youtube.com/watch?v=bV-jAnQ-tvw # http://nullege.com/codes/show/src%40o%40p%40opencvpython-HEAD%40samples%40ch...
- self._info['Chessboard'] = [] - - # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg', 'pdf', 'vnd.djvu']) or \ - (self.image_mime[0] in ['audio']): - return - scale = 1. try: #im = cv.LoadImage(self.image_path_JPEG, cv.CV_LOAD_IMAGE_COLOR) @@ -1877,9 +1680,10 @@ ##cv2.imshow("win", im) ##cv2.waitKey()
+        result = {}
         if corners is not None:
-            self._info['Chessboard'] = [{ 'Corners': [tuple(item[0])
-                                                      for item in corners], }]
+            result = { 'Corners': [tuple(item[0]) for item in corners], }
+        self._features['Chessboard'] = [result]
# TODO: improve chessboard detection (make it more tolerant) # ## http://stackoverflow.com/questions/7624765/converting-an-opencv-image-to-bla... @@ -2019,10 +1823,10 @@ pywikibot.output(u'result for calibrated camera:\n rot=%s\n perp=%s\n perp2D=%s' % (rot.transpose()[0], perp[:,2], ortho)) pywikibot.output(u'nice would be to do the same for uncalibrated/default cam settings')
-            self._info['Chessboard'][0].update({
-                'Rotation': tuple(rot.transpose()[0]),
-                'Perp_Dir' : tuple(perp[:,2]),
-                'Perp_Dir_2D': tuple(ortho), })
+            result.update({ 'Rotation': tuple(rot.transpose()[0]),
+                            'Perp_Dir' : tuple(perp[:,2]),
+                            'Perp_Dir_2D': tuple(ortho), })
+            self._features['Chessboard'] = [result]
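The corner detection that the refactored code stores under 'Chessboard' reduces to a single cv2 call; a sketch (the pattern size is an assumption taken from the usual OpenCV samples):

    import cv2

    def find_chessboard(jpeg_path, pattern=(8, 5)):
        img = cv2.imread(jpeg_path, 0)
        if img is None:
            return []
        found, corners = cv2.findChessboardCorners(img, pattern)
        if not found or corners is None:
            return []
        return [tuple(pt[0]) for pt in corners]        # list of (x, y) corner points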
#cv2.imshow("win", im) #cv2.waitKey() @@ -2331,12 +2135,10 @@
# (exclusion of duplicates is done later by '_util_merge_Regions')
-        self._info['Faces'] += data
+        self._features['Faces'] += data
         return
     def _detect_History_EXIF(self):
-        self._info['History'] = []
-
         res = self._util_get_DataTags_EXIF()
#a = [] @@ -2372,42 +2174,9 @@ pass i += 1
-        self._info['History'] = result
+        self._features['History'] = result
         return
- def _util_get_DataStreams_FFMPEG(self): - if hasattr(self, '_buffer_FFMPEG'): - return self._buffer_FFMPEG - - # (similar as in '_util_get_DataTags_EXIF') -# switch this part off since 'ffprobe' (on toolserver) is too old; TS-1449 -# data = Popen("ffprobe -v quiet -print_format json -show_format -show_streams %s" % self.image_path, - proc = Popen("ffprobe -v quiet -show_format -show_streams %s" % self.image_path,#.replace('%', '%%'), - shell=True, stdout=PIPE)#.stdout.read() - proc.wait() - if proc.returncode == 127: - raise ImportError("ffprobe (ffmpeg) not found!") - data = proc.stdout.read().strip() -# self._buffer_FFMPEG = json.loads(data) - res, key, cur = {}, '', {} - for item in data.splitlines(): - if (item[0] == '['): - if not (item[1] == '/'): - key = item[1:-1] - cur = {} - if key not in res: - res[key] = [] - else: - res[key].append( cur ) - else: - val = item.split('=') - cur[val[0].strip()] = val[1].strip() - if res: - res = { 'streams': res['STREAM'], 'format': res['FORMAT'][0] } - self._buffer_FFMPEG = res - - return self._buffer_FFMPEG - def _util_merge_Regions(self, regs, sub=False, overlap=False, close=False): # sub=False, overlap=False, close=False ; level 0 ; similar regions, similar position (default) # sub=True, overlap=False, close=False ; level 1 ; region contained in other, any shape/size @@ -2508,25 +2277,23 @@ self.image_size = Image.open(self.image_path_JPEG).size
# MIME: 'image/x-xcf; charset=binary' - def _detect_Properties_PIL(self): + def _detect_Properties(self): """Retrieve as much file property info possible, especially the same as commons does in order to compare if those libraries (ImageMagick, ...) are buggy (thus explicitely use other software for independence)""" - #self.image_size = (None, None) - result = {'Format': u'-', 'Pages': 0}
- result = { 'Format': u'%s' % self.image_mime[1].upper() } + result = { 'Format': u'%s' % self.image_mime[1].upper(), # DO NOT use ImageMagick (identify) instead of PIL to get these info !! + 'Pages': 0, + 'Dimensions': self.image_size, + 'Filesize': os.path.getsize(self.filename), + 'MIME': u'%s/%s' % tuple(self.image_mime[:2]), }
- result['Dimensions'] = self.image_size - result['Filesize'] = os.path.getsize(self.image_path) - result['MIME'] = u'%s/%s' % tuple(self.image_mime[:2]) + #self._properties['Properties'] = [result] + self._properties['Properties'][0].update(result) + return
- #self._info['Properties'] = [result] - self._info['Properties'][0].update(result) - return {'Properties': [result]}
- class SvgFile(JpegFile): def _convert(self): # SVG: rasterize the SVG to bitmap (MAY BE GET FROM WIKI BY DOWNLOAD?...) @@ -2556,11 +2323,11 @@ self.image_path_JPEG = self.image_path
# MIME: 'application/xml; charset=utf-8' - def _detect_Properties_PIL(self): + def _detect_Properties(self): """Retrieve as much file property info possible, especially the same as commons does in order to compare if those libraries (ImageMagick, ...) are buggy (thus explicitely use other software for independence)""" - #self.image_size = (None, None) + result = {'Format': u'-', 'Pages': 0}
# similar to PDF page count OR use BeautifulSoup @@ -2584,23 +2351,30 @@
#self.image_size = (svg.props.width, svg.props.height)
- result = { 'Format': valid, - 'Mode': u'-', - 'Palette': u'-', - 'Pages': pc, } + result.update({ 'Format': valid, + 'Mode': u'-', + 'Palette': u'-', + 'Pages': pc, # may be set {{validSVG}} also or do something in bot template to # recognize 'Format=SVG (valid)' ... + 'Dimensions': self.image_size, + 'Filesize': os.path.getsize(self.filename), + 'MIME': u'%s/%s' % tuple(self.image_mime[:2]), })
- result['Dimensions'] = self.image_size - result['Filesize'] = os.path.getsize(self.image_path) - result['MIME'] = u'%s/%s' % tuple(self.image_mime[:2]) + #self._properties['Properties'] = [result] + self._properties['Properties'][0].update(result) + return
- #self._info['Properties'] = [result] - self._info['Properties'][0].update(result) - return {'Properties': [result]}
- class PdfFile(JpegFile): + def getFeatures(self): + # optical and other text recognition (tesseract & ocropus, ...) + self._detect_EmbeddedText_poppler() +# self._recognize_OpticalText_ocropus() + # (may be just classify as 'contains text', may be store text, e.g. to wikisource) + + return self._features + def _convert(self): # self._wikidata = self.image._latestInfo # all info wikimedia got from content (mime, sha1, ...)
@@ -2613,32 +2387,180 @@ pass
# MIME: 'application/pdf; charset=binary' - def _detect_Properties_PIL(self): + def _detect_Properties(self): """Retrieve as much file property info possible, especially the same as commons does in order to compare if those libraries (ImageMagick, ...) are buggy (thus explicitely use other software for independence)""" - #self.image_size = (None, None) - result = {'Format': u'-', 'Pages': 0}
# http://code.activestate.com/recipes/496837-count-pdf-pages/ #rxcountpages = re.compile(r"$\s*/Type\s*/Page[/\s]", re.MULTILINE|re.DOTALL) rxcountpages = re.compile(r"/Type\s*/Page([^s]|$)", re.MULTILINE|re.DOTALL) # PDF v. 1.3,1.4,1.5,1.6 pc = len(rxcountpages.findall( file(self.image_path,"rb").read() ))
- result = { 'Format': u'PDF', - 'Mode': u'-', - 'Palette': u'-', - 'Pages': pc, } + result = { 'Format': u'PDF', + 'Mode': u'-', + 'Palette': u'-', + 'Pages': pc, + 'Dimensions': self.image_size, + 'Filesize': os.path.getsize(self.filename), + 'MIME': u'%s/%s' % tuple(self.image_mime[:2]), }
-        result['Dimensions'] = self.image_size
-        result['Filesize'] = os.path.getsize(self.image_path)
-        result['MIME'] = u'%s/%s' % tuple(self.image_mime[:2])
+        #self._properties['Properties'] = [result]
+        self._properties['Properties'][0].update(result)
+        return
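The page counter used above also works standalone; the regex is the one from the diff and matches the /Type /Page objects of PDF 1.3-1.6:

    import re

    rxcountpages = re.compile(r"/Type\s*/Page([^s]|$)", re.MULTILINE | re.DOTALL)

    def count_pdf_pages(path):
        with open(path, 'rb') as f:
            return len(rxcountpages.findall(f.read()))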
- #self._info['Properties'] = [result] - self._info['Properties'][0].update(result) - return {'Properties': [result]} + # ./run-test (ocropus/ocropy) + # (in fact all scripts/executables used here are pure python scripts!!!) + def _recognize_OpticalText_ocropus(self): + # optical text recognition (tesseract & ocropus, ...) + # (no full recognition but - at least - just classify as 'contains text') + # http://www.claraocr.org/de/ocr/ocr-software/open-source-ocr.html + # https://github.com/edsu/ocropy + # http://de.wikipedia.org/wiki/Benutzer:DrTrigonBot/Doku#Categorization + # Usage:tesseract imagename outputbase [-l lang] [configfile [[+|-]varfile]...] + # tesseract imagename.tif output
+ # (it's simpler to run the scripts/executables in own environment/interpreter...)
+ path = os.path.join(scriptdir, 'dtbext/_ocropus/ocropy') + + curdir = os.path.abspath(os.curdir) + os.chdir(path) + + # binarization + if os.path.exists(os.path.join(path, "temp")): + shutil.rmtree(os.path.join(path, "temp")) + if os.system("ocropus-nlbin %s -o %s" % (self.image_path_JPEG, os.path.join(path, "temp"))): + raise ImportError("ocropus not found!") + + # page level segmentation + if os.system("ocropus-gpageseg --minscale 6.0 '%s'" % os.path.join(path, "temp/????.bin.png")): + # detection error + return + + # raw text line recognition + if os.system("ocropus-lattices --writebestpath '%s'" % os.path.join(path, "temp/????/??????.bin.png")): + # detection error + return + + # language model application + # (optional - improve the raw results by applying a pretrained model) + os.environ['OCROPUS_DATA'] = os.path.join(path, "models/") + if os.system("ocropus-ngraphs '%s'" % os.path.join(path, "temp/????/??????.lattice")): + # detection error + return + + # create hOCR output + if os.system("ocropus-hocr '%s' -o %s" % (os.path.join(path, "temp/????.bin.png"), os.path.join(path, "temp.html"))): + # detection error + return + + ## 'create HTML for debugging (use "firefox temp/index.html" to view)' + ## (optional - generate human readable debug output) + #if os.system("ocropus-visualize-results %s" % os.path.join(path, "temp")): + # # detection error + # return + + # "to see recognition results, type: firefox temp.html" + # "to see details on the recognition process, type: firefox temp/index.html" + tmpfile = open(os.path.join(path, "temp.html"), 'r') + data = tmpfile.read() + tmpfile.close() + + shutil.rmtree(os.path.join(path, "temp")) + os.remove(os.path.join(path, "temp.html")) + + os.chdir(curdir) + + #print data + pywikibot.output(data) + + def _detect_EmbeddedText_poppler(self): + # may be also: http://www.reportlab.com/software/opensource/rl-toolkit/ + + # poppler pdftotext/pdfimages + # (similar as in '_util_get_DataTags_EXIF' but with stderr and no json output) + # http://poppler.freedesktop.org/ + # http://www.izzycode.com/bash/how-to-install-pdf2text-on-centos-fedora-redhat... + # MIGHT BE BETTER TO USE AS PYTHON MODULE: + # https://launchpad.net/poppler-python/ + # http://stackoverflow.com/questions/2732178/extracting-text-from-pdf-with-pop... + # http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to... 
+ #proc = Popen("pdftotext -layout %s %s" % (self.image_path, self.image_path+'.txt'), + proc = Popen("pdftotext %s %s" % (self.image_path, self.image_path+'.txt'), + shell=True, stderr=PIPE)#.stderr.readlines() + proc.wait() + if proc.returncode: + raise ImportError("pdftotext not found!") + data = open(self.image_path+'.txt', 'r').readlines() + os.remove( self.image_path+'.txt' ) + +# self._content_text = data + (s1, l1) = (len(u''.join(data)), len(data)) + + tmp_path = os.path.join(os.environ.get('TMP', '/tmp'), 'DrTrigonBot/') + os.mkdir( tmp_path ) +# switch this part off since 'pdfimages' (on toolserver) is too old; TS-1449 +# proc = Popen("pdfimages -p %s %s/" % (self.image_path, tmp_path), + proc = Popen("pdfimages %s %s/" % (self.image_path, tmp_path), + shell=True, stderr=PIPE)#.stderr.readlines() + proc.wait() + if proc.returncode: + raise ImportError("pdfimages not found!") + images = os.listdir( tmp_path ) +# pages = set() + for f in images: +# pages.add( int(f.split('-')[1]) ) + os.remove( os.path.join(tmp_path, f) ) + os.rmdir( tmp_path ) + + ## pdfminer (tools/pdf2txt.py) + ## http://denis.papathanasiou.org/?p=343 (for layout and images) + #debug = 0 + #laparams = layout.LAParams() + ## + #pdfparser.PDFDocument.debug = debug + #pdfparser.PDFParser.debug = debug + #cmapdb.CMapDB.debug = debug + #pdfinterp.PDFResourceManager.debug = debug + #pdfinterp.PDFPageInterpreter.debug = debug + #pdfdevice.PDFDevice.debug = debug + ## + #rsrcmgr = pdfinterp.PDFResourceManager(caching=True) + #outfp = StringIO.StringIO() + #device = converter.TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams) + ##device = converter.XMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams, outdir=None) + ##device = converter.HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1, + ## layoutmode='normal', laparams=laparams, outdir=None) + ##device = pdfdevice.TagExtractor(rsrcmgr, outfp, codec='utf-8') + #fp = file(self.image_path, 'rb') + #try: + # pdfinterp.process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', + # caching=True, check_extractable=False) + #except AssertionError: + # pywikibot.warning(u'pdfminer missed, may be corrupt [_detect_EmbeddedText_poppler]') + # return + #except TypeError: + # pywikibot.warning(u'pdfminer missed, may be corrupt [_detect_EmbeddedText_poppler]') + # return + #fp.close() + #device.close() + #data = outfp.getvalue().splitlines(True) + # + #(s2, l2) = (len(u''.join(data)), len(data)) + + result = { 'Size': s1, + 'Lines': l1, + #'Data': data, + #'Position': pos, +# 'Images': u'%s (on %s page(s))' % (len(images), len(list(pages))), # pages containing images + 'Images': u'%s' % len(images), + 'Type': u'-', } # 'Type' could be u'OCR' above... + + self._features['Text'] = [result] + return + + #class DjvuFile(JpegFile): # pass
@@ -2651,35 +2573,29 @@ # general audio feature extraction # self._detect_AudioFeatures_YAAFE()
+ return self._features + # MIME: 'application/ogg; charset=binary' - def _detect_Properties_PIL(self): + def _detect_Properties(self): """Retrieve as much file property info possible, especially the same as commons does in order to compare if those libraries (ImageMagick, ...) are buggy (thus explicitely use other software for independence)""" - #self.image_size = (None, None) - result = {'Format': u'-', 'Pages': 0}
# 'ffprobe' (ffmpeg); audio and video streams files (ogv, oga, ...) d = self._util_get_DataStreams_FFMPEG() #print d - result['Format'] = u'%s' % d['format']['format_name'].upper()
- result['Dimensions'] = self.image_size - result['Filesize'] = os.path.getsize(self.image_path) - result['MIME'] = u'%s/%s' % tuple(self.image_mime[:2]) + result = { 'Format': u'%s' % d['format']['format_name'].upper(), + 'Pages': 0, + 'Dimensions': self.image_size, + 'Filesize': os.path.getsize(self.filename), + 'MIME': u'%s/%s' % tuple(self.image_mime[:2]), }
- #self._info['Properties'] = [result] - self._info['Properties'][0].update(result) - return {'Properties': [result]} + #self._properties['Properties'] = [result] + self._properties['Properties'][0].update(result) + return
def _detect_Streams_FFMPEG(self): - self._info['Streams'] = [] - - # skip file formats that interfere and can cause strange results (pdf is oga?!) - # or file formats not supported (yet?) - if (self.image_mime[1] in ['pdf']) or (self.image_fileext in [u'.svg']): - return - # audio and video streams files (ogv, oga, ...) d = self._util_get_DataStreams_FFMPEG() if not d: @@ -2707,10 +2623,44 @@ 'Dimensions': dim or (None, None), })
- if 'image' not in d["format"]["format_name"]: - self._info['Streams'] = result + if 'image' in d["format"]["format_name"]: + result = [] + self._features['Streams'] = result return
+ def _util_get_DataStreams_FFMPEG(self): + if hasattr(self, '_buffer_FFMPEG'): + return self._buffer_FFMPEG + + # (similar as in '_util_get_DataTags_EXIF') +# switch this part off since 'ffprobe' (on toolserver) is too old; TS-1449 +# data = Popen("ffprobe -v quiet -print_format json -show_format -show_streams %s" % self.image_path, + proc = Popen("ffprobe -v quiet -show_format -show_streams %s" % self.image_path,#.replace('%', '%%'), + shell=True, stdout=PIPE)#.stdout.read() + proc.wait() + if proc.returncode == 127: + raise ImportError("ffprobe (ffmpeg) not found!") + data = proc.stdout.read().strip() +# self._buffer_FFMPEG = json.loads(data) + res, key, cur = {}, '', {} + for item in data.splitlines(): + if (item[0] == '['): + if not (item[1] == '/'): + key = item[1:-1] + cur = {} + if key not in res: + res[key] = [] + else: + res[key].append( cur ) + else: + val = item.split('=') + cur[val[0].strip()] = val[1].strip() + if res: + res = { 'streams': res['STREAM'], 'format': res['FORMAT'][0] } + self._buffer_FFMPEG = res + + return self._buffer_FFMPEG + def _detect_AudioFeatures_YAAFE(self): # http://yaafe.sourceforge.net/manual/tools.html # http://yaafe.sourceforge.net/manual/quickstart.html - yaafe.py @@ -2763,12 +2713,6 @@ # $ export YAAFE_PATH=/home/ursin/Desktop/yaafe-v0.64/src_python/ # $ export PYTHONPATH=/home/ursin/Desktop/yaafe-v0.64/src_python
- # skip file formats not supported (yet?) - if (self.image_mime[1] in ['ogg']):#, 'midi']): - return - - self._info['Audio'] = [] - import yaafelib as yaafe
# use WAV, OGG, MP3 (and others) audio file formats @@ -2865,15 +2809,14 @@ os.remove(fn) # remove folder too...
-        self._info['Audio'] = [data]
+        self._features['Audio'] = [data]
         return
 class MidiFile(UnknownFile):
     def getFeatures(self):
-        result = {}
-        result.update( self._detect_AudioFeatures_MUSIC21() )    # Audio
-        return result
+        self._detect_AudioFeatures_MUSIC21()    # Audio
+        return self._features
def _detect_HeaderAndMetadata(self): result = {} @@ -2895,51 +2838,56 @@ result[key].append(ba[i+3:e].decode('latin-1').strip()) result[key] = u'\n'.join(result[key])
+ ## find specific info in extracted data + #print [item.strip() for item in re.findall('Generated .*?\n', result['Text'])] + ##u"Cr'eateur: GNU LilyPond 2.0.1" + #import dateutil.parser + #dates = [] + #for line in result['Text'].splitlines(): + # # http://stackoverflow.com/questions/3276180/extracting-date-from-a-string-in-... + # try: + # dates.append(dateutil.parser.parse(line, fuzzy=True).isoformat(' ').decode('utf-8')) + # except ValueError: + # pass + #print dates + import _music21 as music21 try: s = music21.converter.parse(self.filename) if s.metadata: pywikibot.output(unicode(s.metadata)) - result['Metadata'].update(s.metadata) + result.update(s.metadata) except music21.midi.base.MidiException: pass
-        self._info['Metadata'] = [result]
-        return {'Metadata': [result]}
+        self._properties['Metadata'] = [result]
+        return
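The commented-out idea above (pulling creation dates out of the LilyPond "Generated ..." lines with dateutil's fuzzy parser) works roughly like this as a standalone sketch; whether such dates end up in the stored metadata is left open:

    import re
    import dateutil.parser

    def lilypond_dates(text):
        dates = []
        for line in re.findall('Generated .*?\n', text):
            try:
                dates.append(dateutil.parser.parse(line, fuzzy=True))
            except ValueError:
                pass                          # line carries no parsable date
        return dates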
# MIME: 'audio/midi; charset=binary' - def _detect_Properties_PIL(self): + def _detect_Properties(self): """Retrieve as much file property info possible, especially the same as commons does in order to compare if those libraries (ImageMagick, ...) are buggy (thus explicitely use other software for independence)""" - #self.image_size = (None, None) - result = {'Format': u'-', 'Pages': 0}
- result['Format'] = u'%s' % self.image_mime[1].upper() + result = { 'Format': u'%s' % self.image_mime[1].upper(), + 'Pages': 0, + 'Dimensions': self.image_size, + 'Filesize': os.path.getsize(self.filename), + 'MIME': u'%s/%s' % tuple(self.image_mime[:2]), }
- result['Dimensions'] = self.image_size - result['Filesize'] = os.path.getsize(self.image_path) - result['MIME'] = u'%s/%s' % tuple(self.image_mime[:2]) + #self._properties['Properties'] = [result] + self._properties['Properties'][0].update(result) + return
- #self._info['Properties'] = [result] - self._info['Properties'][0].update(result) - return {'Properties': [result]} - # midi audio feature extraction def _detect_AudioFeatures_MUSIC21(self): - # skip file formats not supported - if (self.image_mime[1] not in ['midi']): - return - import _music21 as music21
- #audiofile = '/home/ursin/Desktop/3_Ships.mid' - audiofile = self.image_path - #music21.features.jSymbolic.getCompletionStats() try: - #s = music21.midi.translate.midiFilePathToStream(audiofile) - s = music21.converter.parse(audiofile) + #audiofile = '/home/ursin/Desktop/3_Ships.mid' + #s = music21.midi.translate.midiFilePathToStream(self.filename) + s = music21.converter.parse(self.filename) except music21.midi.base.MidiException: pywikibot.warning(u'unknown file type [_detect_AudioFeatures_MUSIC21]') return @@ -2979,8 +2927,8 @@ #print s.seconds #print s.secondsMap
-        self._info['Audio'] += [data]
-        return {'Audio': [data]}
+        self._features['Audio'] = [data]
+        return
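The new MIDI path leans on music21; a hedged sketch of what the converter gives (plain 'import music21' here instead of the bundled '_music21' wrapper, and the descriptors are illustrative, not the exact features the bot stores):

    import music21

    def probe_midi(path):
        s = music21.converter.parse(path)     # raises MidiException for broken input
        data = {}
        if s.metadata:
            data['Metadata'] = unicode(s.metadata)
        data['Parts'] = len(s.parts)          # simple symbolic descriptors
        data['Notes'] = len(s.flat.notes)
        return data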
FILETYPES = { '*': UnknownFile, @@ -3218,21 +3166,14 @@
return (u'Graphics', bool(relevance))
-#    # Category:MIDI files created with GNU LilyPond
-#    def _cat_audio_MIDIfilescreatedwithGNULilyPond(self):
-#        # find metadata in extracted data
-#        print [item.strip() for item in re.findall('Generated .*?\n', u'\n'.join(result['Text']))]
-#        #u"Cr'eateur: GNU LilyPond 2.0.1"
-#        import dateutil.parser
-#        dates = []
-#        for line in result['Text']:
-#            # http://stackoverflow.com/questions/3276180/extracting-date-from-a-string-in-...
-#            try:
-#                dates.append(dateutil.parser.parse(line, fuzzy=True).isoformat(' ').decode('utf-8'))
-#            except ValueError:
-#                pass
-#        print dates
+    # Category:MIDI files created with GNU LilyPond
+    def _cat_meta_MIDIfilescreatedwithGNULilyPond(self):
+        result = self._info_filter['Metadata']
+        relevance = (u"Generated automatically by: GNU LilyPond" in
+                     result[0]['Text'])
+        return (u'MIDI files created with GNU LilyPond', bool(relevance))
+
     # Category:Categorized by DrTrigonBot
     def _addcat_BOT(self):
         # - ALWAYS -
@@ -3611,7 +3552,7 @@
def log_output(self): # ColorRegions always applies here since there is at least 1 (THE average) color... - ignore = ['Properties', 'ColorAverage', 'ColorRegions', 'Geometry'] + ignore = ['Properties', 'Metadata', 'ColorAverage', 'ColorRegions', 'Geometry'] #if not self._existInformation(self._info): # information available? # information available? AND/OR category available? if not (self._existInformation(self._info, ignore = ignore) or self._result_check): @@ -3664,14 +3605,13 @@ return u"\n".join( ret )
def clean_cache(self): -# if os.path.exists(self.image_path): -# os.remove( self.image_path ) -# if os.path.exists(self.image_path_JPEG): -# os.remove( self.image_path_JPEG ) -# #image_path_new = self.image_path_JPEG.replace(u"cache/", u"cache/0_DETECTED_") -# #if os.path.exists(image_path_new): -# # os.remove( image_path_new ) - pass + if os.path.exists(self.image_path): + os.remove( self.image_path ) + #if os.path.exists(self.image_path_JPEG): + # os.remove( self.image_path_JPEG ) + ##image_path_new = self.image_path_JPEG.replace(u"cache/", u"cache/0_DETECTED_") + ##if os.path.exists(image_path_new): + ## os.remove( image_path_new )
# LOOK ALSO AT: checkimages.CatImagesBot.report def report(self): @@ -3799,40 +3739,19 @@
# gather data from all information interfaces def gatherFeatures(self): - self._info['Properties'] = [{'Format': u'-', 'Pages': 0}] - self._info['Metadata'] = [] - self._info['ColorAverage'] = [] - self._info['ColorRegions'] = [] - self._info['Faces'] = [] - self._info['OpticalCodes'] = [] - self._info['People'] = [] - self._info['Chessboard'] = [] - self._info['Text'] = [] - self._info['Streams'] = [] - self._info['Audio'] = [] - self._info['Legs'] = [] - self._info['Hands'] = [] - self._info['Torsos'] = [] - self._info['Ears'] = [] - self._info['Eyes'] = [] - self._info['Automobiles'] = [] - # split detection and extraction according to file types; JpegFile, ... TypeFile = FILETYPES.get(tuple(self.image_mime[:2]), FILETYPES['*']) - with TypeFile(self.image_path, self.image_mime) as tf: - import copy - tf.__dict__.update(copy.deepcopy(self.__dict__)) + with TypeFile(self.image_path) as tf: + tf.image_mime = self.image_mime + tf.image = self.image for func in ['getProperties', 'getFeatures']: result = getattr(tf, func)() - if result: - self._info.update(result) + self._info.update(result) print self._info - print tf._info #print tf.__dict__ - self._info = tf._info self.image_size = tf.image_size
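The simplified dispatch in gatherFeatures now amounts to the following pattern (standalone sketch; 'filetypes' maps ('image', 'jpeg')-style mime tuples to handler classes, with the generic handler under the '*' key):

    def gather(filetypes, path, mime, wiki_image):
        handler_cls = filetypes.get(tuple(mime[:2]), filetypes['*'])
        info = {}
        with handler_cls(path) as handler:
            handler.image_mime = mime                  # the two attributes detectors need
            handler.image = wiki_image
            info.update(handler.getProperties())       # 'Properties' and 'Metadata'
            info.update(handler.getFeatures())         # 'Faces', 'Streams', 'Audio', ...
        return info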
- def _existInformation(self, info, ignore = ['Properties', 'ColorAverage']): + def _existInformation(self, info, ignore = ['Properties', 'Metadata', 'ColorAverage']): result = [] for item in info: if item in ignore: @@ -3846,6 +3765,11 @@ result = self._info['Properties'] return {'Properties': result}
+ def _filter_Metadata(self): + # >>> never drop <<< + result = self._info['Metadata'] + return {'Metadata': result} + def _filter_Faces(self): result = self._info['Faces'] if (len(result) < self._thrhld_group_size):