Source code for pysteg.sql.svmodel

#! /usr/bin/env python
## -*- coding: utf-8 -*-
## (C) 2012: Hans Georg Schaathun <georg@schaathun.net> 

"""
This module defines SQLObject classes for the image and feature datasets.
The SQL database tables are defined through the SQLObject definitions.
"""

# Import-time trace message.  The parenthesised single-argument form prints
# exactly the same text under Python 2 and Python 3.
print("[pysteg.sql.svmodel]")

from sqlobject import *
from . import *
from .queue import *
from .aux import isDuplicateError
import numpy as np
from svm.gridsearch import gridsearch
from svm.svmutil import svm_load_model, svm_save_model, \
      svm_train, svm_problem, svm_predict, svm_parameter
import sys

from .scaling import *

class gridOptions(object):
    """Container for the options passed to the grid search.

    The class attributes below are the defaults.  Keyword arguments given
    to the constructor override them on the instance; keywords that do not
    correspond to an existing option are silently ignored.
    """

    # Number of cross-validation folds; None means "not decided yet".
    fold = None
    # presumably (begin, end, step) for the exponents of C and gamma, as in
    # libsvm's grid.py -- TODO confirm against svm.gridsearch.
    crange = (-5, 15, 2)
    grange = (3, -15, -2)
    verbosity = 3
    gnuplot = None
    param = None
    xargs = "-q"
    nr_local_worker = 1

    def __init__(self, config=None, **kw):
        """Override the default options with the given keyword arguments.

        `config` is accepted for interface compatibility but is not used.
        """
        for name, value in kw.items():
            # Only known options may be set; anything else is dropped.
            if hasattr(self, name):
                setattr(self, name, value)

class SVModel(SQLObject):
    """An SVModel object defines an SVM classifier.

    It consists of the following elements:

    1. testset: a TestSet object used as training set,
    2. fvector: a FeatureVector object identifying the features used,
    3. scaling: the scaling model (a ScaleModel record) for the features
       required in the feature vector,
    4. model: the classification model,
    5. feature: an associated Feature object representing the classifier
       output (classification score).

    The classifier is defined by the training set (1), feature vector (2),
    and a scaling strategy which is used to calculate (3).  The model (4)
    is derived from the first three.  The feature (5) is a hook for the
    classifier output to use it as input feature in a fused classifier.

    Unfortunately, the libsvm classifier model relies on the ctypes
    library and cannot be pickled.  It has to be stored on file and
    not in the database.
    """

    testset = ForeignKey("TestSet")
    feature = ForeignKey("Feature", alternateID=True, cascade=True)
    fvector = ForeignKey("FeatureVector")
    # In-memory libsvm model; not a database column.
    model = None
    # File name of the stored model, relative to the configured model dir.
    modfile = StringCol(default=None)
    scaling = ForeignKey("ScaleModel", default=None)
    gamma = FloatCol(default=None)
    C = FloatCol(default=None)
    xrate = FloatCol(default=None)
    fold = IntCol(default=None)
    performance = SQLMultipleJoin("SVMPerformance", joinColumn="svmodel_id")
    # Grid search options set through gridOpt(); not a database column.
    gridopt = None

    def __str__(self):
        return "<SVModel %s [%s/%s]>" % (
            self.feature.key, self.fvector.key, self.testset.name)

    def destroy(self, values=False):
        """Delete the model from the data base.

        The associated scaling model (if any) and the output Feature are
        destroyed as well.  `values` is accepted but not used.
        """
        # Fetch the related rows before deleting this row; reading the
        # columns of an already destroyed record is not reliable.
        feature = self.feature
        scaling = self.scaling
        self.destroySelf()
        if scaling is not None:
            scaling.destroy()
        feature.destroy()

    @classmethod
    def destroyAll(cls):
        """Delete all SVM models."""
        for m in cls.select():
            m.destroy()

    @classmethod
    def byKey(cls, key):
        """Retrieve an SVM model by key (feature key)."""
        f = Feature.byKey(key)
        return cls.byFeatureID(f)

    def getTestSet(self):
        """Get the canonical test set for this SVM model.

        This is the TestSet named after the training set with the suffix
        "_test".
        """
        return TestSet.byName(self.testset.name + "_test")

    def getPerformance(self, training=False, prune=False):
        """Get the canonical SVMPerformance object for this model.

        If training is True, then the performance on the training set is
        returned; otherwise the performance on the canonical test set.
        Returns None if no performance record exists.  `prune` is accepted
        but not used.
        """
        if training:
            T = self.testset
        else:
            T = self.getTestSet()
        # NB: `==` builds an SQL expression here and must not become `is`.
        P = self.performance.filter(SVMPerformance.q.testset == T)
        N = P.count()
        print("%s performance objects" % (N,))
        if N == 0:
            return None
        return P[0]

    # Model management

    def getModel(self):
        """Return the SVM model, loading it from file on first use.

        The type is a libsvm ctypes object.
        """
        if self.model is None:
            self.loadModel()
        return self.model

    def loadModel(self):
        """Load the model from file.  The filename is stored in the database.

        Raises MissingDataException if no model file is recorded.
        """
        if self.modfile is None:
            raise MissingDataException("No model file is defined.")
        print("[loadModel] %s" % (self.modfile,))
        self.model = svm_load_model(self._getPathName())
        print(self.model)
        return self.model

    def saveModel(self, filename=None):
        """Save the model to file.

        The filename can be specified as an argument, overriding normal
        behaviour (and replacing the name stored in the record).  Otherwise
        it is sought in the database or constructed from the key using a
        standard formula.
        """
        if filename is not None:
            self.modfile = filename
        svm_save_model(self._getPathName(), self.model)

    def _getPathName(self):
        """Get the full path name of the SVM model file.

        If no file name has been specified in the record, one is generated
        from the feature key and stored in the record.
        """
        if self.modfile is None:
            self.modfile = self.feature.key + ".svmodel"
        return config.get("sql", "modeldir") + "/" + self.modfile

    # Grid search and training

    def _getproblem(self):
        """Get a libSVM problem object defining the problem for training."""
        (l, fv, _img) = self.testset.getFeatures(self.getScaleModel())
        print("Labels: %s" % (len(l),))
        print("Feature vectors: %s" % (len(fv),))
        print("Dimension: %s" % (len(fv[0]),))
        return svm_problem(l, fv)

    def gridOpt(self, **kw):
        """Set the grid search options from the given keyword arguments."""
        self.gridopt = gridOptions(config, **kw)

    def _getGridOpt(self):
        """Return the grid search options, with the fold count resolved."""
        # The grid options may have been set in different ways.
        # 1. grid options may have been set externally by gridOpt()
        if self.gridopt is not None:
            opt = self.gridopt
        # 2. Otherwise we need to create a grid options object now.
        else:
            opt = gridOptions(config)
        # 3. fold may have been set separately in the SQL record, in
        #    which case this overrides the grid options object.
        #    Note that this is the only option which may be stored in the DB.
        if self.fold is not None:
            opt.fold = self.fold
        # 4. If fold is still not set, we select it based on TestSet size.
        elif opt.fold is None:
            N = self.testset.count()
            if N < 1600:
                opt.fold = 5
            elif N < 2400:
                opt.fold = 4
            elif N < 4000:
                opt.fold = 3
            else:
                opt.fold = 2
        return opt

    def _gridSearch(self):
        """Perform the grid search with cross-validation.

        The selected parameters and the reported rate are stored in the
        record (C, gamma, xrate).  Returns the pair (C, gamma).
        """
        P = self._getproblem()
        opt = self._getGridOpt()
        print("[SVModel._gridSearch] got problem")
        # Only the third and fourth elements of the result are used.
        (_db, (_c1, _g1), (c, g), rate) = gridsearch(P, opt)
        self.C = c
        self.gamma = g
        self.xrate = rate
        print("[SVModel._gridSearch] returning %s" % ((c, g),))
        return (c, g)

    def train(self):
        """Train the classifier.  Do grid search if necessary.

        The grid search is run only when C is not yet set in the record.
        Returns the trained libsvm model.
        """
        verbosity = config.getVerbosity("sql")
        if verbosity > 0:
            print("[SVModel.train] Scaling ...")
            sys.stdout.flush()
        self._scale(verbosity=verbosity)
        if verbosity > 0:
            print("[SVModel.train] Scaling done")
            sys.stdout.flush()
        if self.C is None:
            self._gridSearch()
        return self._train(verbosity=verbosity)

    def _train(self, verbosity=1):
        """Train the libsvm model with the C and gamma stored in the record."""
        param = svm_parameter("")
        param.C = self.C
        param.gamma = self.gamma
        if verbosity > 0:
            print("[SVModel.train] Obtaining problem %s"
                  % ((param.C, param.gamma),))
            sys.stdout.flush()
        P = self._getproblem()
        if verbosity > 0:
            print("[SVModel.train] Starting training")
            sys.stdout.flush()
        self.model = svm_train(P, param)
        if verbosity > 0:
            print("[SVModel.train] trained")
            sys.stdout.flush()
        return self.model

    # Retrieving and scaling features

    def _scale(self, verbosity=1):
        """Calculate the scaling model for the feature vector based on
        the given training set, and store it in the record.

        This is exceedingly slow and 40-50% of the activity is on the
        SQL server.
        """
        if verbosity > 0:
            print("[_scale] Creating Scaling rows")
            sys.stdout.flush()
        self.scaling = ScaleModel.calc(
            self.fvector.features, self.testset, verbosity)
        if verbosity > 0:
            print("[_scale] Returning")
            sys.stdout.flush()

    def getScaleModel(self):
        """Get the scaling model from the database.

        Returns the ScaleModel record referenced by this model, or None
        if no scaling model has been calculated yet.
        """
        return self.scaling
def predict(model, img):
    """Run the SVM classifier given by model on the images img.

    The first parameter is an SVModel object.  The second argument may be
    the name of a TestSet (a string), an Image, a TestSet, or an iterable
    of Image objects.

    Every classification score is also recorded as a value of the model's
    output feature (model.feature.addValue).

    The return value is a list of tuples
    (image, score, predicted label, label); the score is the soft
    information.  The label is 0 for a single image and None for an
    iterable of images.  Note that if img is a TestSet, accuracy is
    calculated but not returned.

    Raises TypeError if img is of none of the supported kinds, and
    AssertionError if libsvm returns a score which is not of length 1.
    """
    verbosity = config.getVerbosity("sql")
    scalemodel = model.getScaleModel()
    if verbosity > 0:
        print("[predict] %s" % (scalemodel,))
        sys.stdout.flush()

    # This should be the key of a TestSet object
    if isinstance(img, str):
        img = TestSet.byName(img)

    # This should be a single Image object
    if hasattr(img, "features"):
        print("[predict] one image %s" % (img,))
        fv = [img.getFeatures(scalemodel)]
        l = [0]
        imlist = [img]
    # This should be a TestSet and includes labels
    elif hasattr(img, "getFeatures"):
        print("[predict] getFeatures %s labels included" % (img,))
        (l, fv, imlist) = img.getFeatures(scalemodel)
    # This is an iterator over images, and could be a list or an ImageSet
    elif hasattr(img, "__iter__"):
        print("[predict] iterator")
        pairs = [(im.getFeatures(scalemodel), im) for im in img]
        fv = [f for (f, _im) in pairs]
        imlist = [im for (_f, im) in pairs]
        # NOTE(review): None labels are handed to svm_predict unchanged;
        # confirm that the libsvm wrapper in use accepts them.
        l = [None for _f in fv]
    # If none of the above is right, we are stuffed
    else:
        raise TypeError("Do not know how to handle type %s" % (type(img),))

    if verbosity > 0:
        print("[predict] %s %s" % (len(fv), len(fv[0])))
        sys.stdout.flush()
    (plabel, _acc, score) = svm_predict(l, fv, model.getModel())
    if verbosity > 0:
        print("[predict] svm_predict has returned.")
        sys.stdout.flush()

    # Each score is expected to be a one-element sequence.
    for s in score:
        if len(s) != 1:
            print(s)
            raise AssertionError("The score should have length 1.")
    score = [s[0] for s in score]

    # NOTE(review): each entry of imlist is *called* to obtain the object
    # passed to addValue -- presumably the entries are callables such as
    # weak references; confirm against TestSet.getFeatures.
    for (im, val) in zip(imlist, score):
        model.feature.addValue(im(), val)
    # Materialise the result so that callers get a real list on Python 3 too.
    return list(zip(imlist, score, plabel, l))
class SVMPerformance(SQLObject):
    """Class to record performance statistics for a particular SVM model
    on a particular test set.

    There is at most one record per (svmodel, testset) pair, enforced by
    a unique database index.
    """

    svmodel = ForeignKey("SVModel", cascade=True)
    testset = ForeignKey("TestSet")
    idx = DatabaseIndex('svmodel', 'testset', unique=True)
    description = StringCol(default=None)
    # False positive rate, false negative rate and accuracy as computed
    # by run(); None until the test has been run.
    FP = FloatCol(default=None)
    FN = FloatCol(default=None)
    accuracy = FloatCol(default=None)

    def destroy(self):
        """Delete this record from the database."""
        return self.destroySelf()

    def __str__(self):
        key = self.svmodel.feature.key
        tset = self.testset.name
        return "<SVMPerformance %s model=%s testset=%s FP=%s FN=%s>" % (
            self.id, key, tset, self.FP, self.FN)

    def display(self):
        """Print a one-line summary of the recorded statistics."""
        key = self.svmodel.feature.key
        tset = self.testset.name
        print("FP=%s, FN=%s (%s) %s/%s" % (
            self.FP, self.FN, self.accuracy, key, tset))

    def run(self):
        """Run the classifier on the test set and record FP, FN and accuracy.

        A (predicted, actual) pair of (1, 0) counts as a false positive and
        (0, 1) as a false negative.  A rate whose denominator is zero is
        recorded as 0; the accuracy of an empty result is recorded as None.
        The raw prediction list is kept in self.list.
        """
        # Materialise the result: it is traversed here and kept afterwards.
        L = list(predict(self.svmodel, self.testset))
        R = [(p, l) for (_im, _score, p, l) in L]
        N = len(R)
        FP = R.count((1, 0))
        TP = R.count((1, 1))
        FN = R.count((0, 1))
        TN = R.count((0, 0))
        print("FP=%s - FN=%s - TP=%s - TN=%s" % (FP, FN, TP, TN))
        if FP + TN == 0:
            self.FP = 0
        else:
            self.FP = float(FP) / (FP + TN)
        if FN + TP == 0:
            self.FN = 0
        else:
            self.FN = float(FN) / (FN + TP)
        # Guard the division in the same way as the two rates above.
        if N == 0:
            self.accuracy = None
        else:
            self.accuracy = float(TP + TN) / N
        self.list = L

    @classmethod
    def getRows(cls):
        """Return a generator of getRow() dictionaries for all records,
        ordered by SVM model."""
        return (x.getRow() for x in cls.select(orderBy="svmodel_id"))

    def getRow(self):
        """Return the record as a dictionary keyed by column heading."""
        return {
            "Feature Vector": self.svmodel.fvector.key,
            "TrainingSet": self.svmodel.testset.name,
            "Test Set": self.testset.name,
            "FP": self.FP,
            "FN": self.FN,
            "Accuracy": self.accuracy,
        }
def newModel(fskey, key, fvkey, tset, fsdesc=None, desc=None, fold=None):
    """Create (or retrieve) an SVModel and its output Feature.

    fskey  -- key of the FeatureSet holding the output feature; the
              feature set is created (with description fsdesc) if the
              lookup fails.
    key    -- key of the output Feature, which also identifies the model.
    fvkey  -- key of the FeatureVector to use.
    tset   -- name of the training TestSet.
    desc   -- description used when the Feature is created.
    fold   -- fold count stored in a newly created model.

    If the Feature or the SVModel already exists, the existing record is
    reused provided that it is consistent with the arguments; otherwise
    dberrors.IntegrityError is raised.  Errors other than duplicate
    errors are re-raised unchanged.
    """
    T = TestSet.byName(tset)

    # Any ordinary lookup failure means that the feature set has to be
    # created.  KeyboardInterrupt and SystemExit are deliberately not caught.
    try:
        fset = FeatureSet.byKey(fskey)
    except Exception:
        fset = FeatureSet(
            key=fskey, matrix=False, symidx=False, description=fsdesc)

    try:
        feature = Feature(cat=fset, key=key, description=desc)
    except Exception as e:
        if not isDuplicateError(e):
            print("[newModel] Unknown error.")
            raise    # bare raise keeps the original traceback
        feature = Feature.byKey(key)
        if feature.cat != fset:
            raise dberrors.IntegrityError(
                "Clash with existing feature.  Feature set does not match.")

    fv = FeatureVector.byKey(fvkey)

    try:
        mod = SVModel(testset=T, feature=feature, fvector=fv, fold=fold)
    except Exception as e:
        if not isDuplicateError(e):
            print("[newModel] Unknown error.")
            raise
        mod = SVModel.byFeatureID(feature)
        if mod.testset != T:
            raise dberrors.IntegrityError(
                "Clash with existing SVModel.  Test set does not match.")
        if mod.fvector != fv:
            raise dberrors.IntegrityError(
                "Clash with existing SVModel.  "
                "Feature vector does not match.")
    return mod
def testSVM(training=False, key=None):
    """Queue a performance test for every trained SVM model.

    The test set is selected by name: `key` if given, else the model's
    own training set if `training` is true, else the training set name
    with the suffix "_test".  Models without a model file are reported
    and skipped.  The number of newly queued tasks is printed.
    """
    count = 0
    for m in SVModel.select():
        if m.modfile is None:
            print("Model has not been trained")
            print(m)
            continue
        if key is not None:
            k = key
        elif training:
            k = m.testset.name
        else:
            k = m.testset.name + "_test"
        print("[testSVM] %s" % (k,))
        T = TestSet.byName(k)
        # perfQueue returns None when the model has already been evaluated.
        if perfQueue(T, m) is not None:
            count += 1
    print("[testSVM] %s new tasks queued" % (count,))
def perfQueue(T, m):
    """Queue a performance test of the SVM model m on the test set T.

    If an SVMPerformance record already exists for the pair, the existing
    records are printed and None is returned.  Otherwise a new Queue entry
    is created, printed and returned.
    """
    existing = SVMPerformance.selectBy(svmodel=m, testset=T)
    N = existing.count()
    if N > 0:
        print("[perfQueue] Model has been evaluated. %s" % (N,))
        print(m)
        print(T)
        for ev in existing:
            print(ev)
        return None
    q = Queue(image=None, entered=datetime.now(), testset=T, svmodel=m)
    print("New test queued")
    print(m)
    print(q)
    return q
def newSVM(trainingset, fvlist, fold=None, fsdesc=None):
    """Create and queue SVM models for a training set.

    trainingset -- a TestSet object, the name of one, or None.  With None,
                   the function is applied to every TestSet whose name
                   does not end in "_test".
    fvlist      -- FeatureVector objects or keys; if empty, all feature
                   vectors in the database are used.
    fold        -- fold count stored in the new models.
    fsdesc      -- description used if the feature set "<name>SVM" has to
                   be created.

    For each feature vector without an existing model for the training
    set, a Feature keyed "<name>-<fvkey>", an SVModel and a Queue entry
    are created.  Returns the number of new models.

    Raises dberrors.IntegrityError if a feature with the model key exists
    in a different feature set.
    """
    newcount = 0
    verbosity = config.getVerbosity("sql")

    if trainingset is None:
        for t in TestSet.select():
            if not t.name.endswith("_test"):
                newcount += newSVM(t, fvlist, fold, fsdesc)
        if verbosity > 0:
            print("[newSVM] total of %s new SVM models" % (newcount,))
        return newcount

    if not fvlist:
        fvlist = list(FeatureVector.select())

    # Accept either the TestSet record or its name.
    if isinstance(trainingset, SQLObject):
        T = trainingset
        trainingset = T.name
    else:
        T = TestSet.byName(trainingset)

    fskey = trainingset + "SVM"
    # Any ordinary lookup failure means that the feature set has to be
    # created.  KeyboardInterrupt and SystemExit are deliberately not caught.
    try:
        fset = FeatureSet.byKey(fskey)
    except Exception:
        fset = FeatureSet(
            key=fskey, matrix=False, symidx=False, description=fsdesc)
    if verbosity > 0:
        print("[newSVM] %s" % (fset,))

    for fvkey in fvlist:
        # Accept either the FeatureVector record or its key.
        if isinstance(fvkey, SQLObject):
            fv = fvkey
            fvkey = fv.key
        else:
            fv = FeatureVector.byKey(fvkey)

        modlist = SVModel.selectBy(fvector=fv, testset=T)
        modcount = modlist.count()
        modkey = trainingset + "-" + fvkey
        desc = "SVM model (%s,%s)" % (trainingset, fvkey)

        if modcount > 0:
            if verbosity > 0:
                print("%s %s existing models" % (modkey, modcount))
            if verbosity > 1:
                for m in modlist:
                    print(m)
            continue

        try:
            feature = Feature(cat=fset, key=modkey, description=desc)
        except Exception as e:
            if not isDuplicateError(e):
                print("[newSVM] Unknown error.")
                raise    # bare raise keeps the original traceback
            feature = Feature.byKey(modkey)
            if feature.cat != fset:
                raise dberrors.IntegrityError(
                    "Clash with existing feature.  "
                    "Feature set does not match.")

        mod = SVModel(testset=T, feature=feature, fvector=fv, fold=fold)
        q = Queue(image=None, entered=datetime.now(), svmodel=mod)
        if verbosity > 1:
            print("new %s" % (mod,))
            print(q)
        newcount += 1

    if verbosity > 0:
        print("[newSVM] %s new SVM models" % (newcount,))
    return newcount