#! /usr/bin/env python
## -*- coding: utf-8 -*-
## (C) 2012: Hans Georg Schaathun <georg@schaathun.net>
"""
This module defines SQLObject classes for the image and feature datasets.
The SQL database tables are defined through the SQLObject definitions.
"""
# Trace on stdout that this module is being imported.  The parenthesised
# single-argument form prints exactly the same text under Python 2 and is
# also valid Python 3 syntax.
print("[pysteg.sql.svmodel]")
from sqlobject import *
from . import *
from .queue import *
from .aux import isDuplicateError
import numpy as np
from svm.gridsearch import gridsearch
from svm.svmutil import svm_load_model, svm_save_model, \
svm_train, svm_problem, svm_predict, svm_parameter
import sys
from .scaling import *
class gridOptions(object):
    """Option container handed to the grid search.

    The class attributes below are the defaults.  Keyword arguments given
    to the constructor override them on the individual instance.
    """

    # Number of cross-validation folds; None means "not chosen yet"
    # (SVModel._getGridOpt fills it in before the search is run).
    fold = None
    # Search ranges for C and gamma.
    # NOTE(review): presumably (start, stop, step) triplets as in libsvm's
    # grid.py -- confirm against svm.gridsearch.
    crange = (-5, 15, 2)
    grange = (3, -15, -2)
    verbosity = 3
    gnuplot = None
    param = None
    xargs = "-q"
    nr_local_worker = 1

    def __init__(self, config=None, **kw):
        """Create an option set, overriding defaults from keyword arguments.

        config -- accepted for interface compatibility; it is not read here.
        **kw   -- option values.  Only names that already exist as
                  attributes are applied; unknown names are silently
                  ignored.
        """
        for name, value in kw.items():
            if hasattr(self, name):
                setattr(self, name, value)
class SVModel(SQLObject):
    """An SVModel object defines an SVM classifier.

    It consists of the following elements:

    1. testset: a TestSet object used as training set,
    2. fvector: a FeatureVector object identifying the features used,
    3. scaling: the scaling model for the features required in the
       feature vector,
    4. model: the classification model,
    5. feature: an associated Feature object representing the classifier
       output (classification score).

    The classifier is defined by the training set (1), feature vector (2),
    and a scaling strategy which is used to calculate (3).  The model (4)
    is derived from the first three.  The feature (5) is a hook for the
    classifier output to use it as input feature in a fused classifier.

    Unfortunately, the libsvm classifier model relies on the ctypes library
    and cannot be pickled.  It has to be stored on file and not in the
    database.
    """

    testset = ForeignKey("TestSet")
    feature = ForeignKey("Feature", alternateID=True, cascade=True)
    fvector = ForeignKey("FeatureVector")
    # In-memory libsvm model; not a column, loaded lazily by getModel().
    model = None
    # File name of the stored model (joined with the configured model
    # directory in _getPathName()).
    modfile = StringCol(default=None)
    scaling = ForeignKey("ScaleModel", default=None)
    # SVM parameters and cross-validation rate, written by _gridSearch().
    gamma = FloatCol(default=None)
    C = FloatCol(default=None)
    xrate = FloatCol(default=None)
    # Number of cross-validation folds; overrides the grid options if set.
    fold = IntCol(default=None)
    performance = SQLMultipleJoin("SVMPerformance", joinColumn="svmodel_id")
    # Optional gridOptions instance set through gridOpt(); not a column.
    gridopt = None

    def __str__(self):
        return ("<SVModel %s [%s/%s]>"
                % (self.feature.key, self.fvector.key, self.testset.name))

    def destroy(self, values=False):
        """Delete the model from the data base.

        The associated scaling model (if any) and the associated feature
        are destroyed as well.  The `values` argument is accepted but not
        used by this method.
        """
        # Resolve both references *before* the row is deleted, so that no
        # attribute of a destroyed object has to be read afterwards.
        feature = self.feature
        scaling = self.scaling
        self.destroySelf()
        if scaling is not None:
            scaling.destroy()
        feature.destroy()

    @classmethod
    def destroyAll(cls):
        "Delete all SVM models."
        for m in cls.select():
            m.destroy()

    @classmethod
    def byKey(cls, key):
        "Retrieve an SVM model by key (feature key)."
        f = Feature.byKey(key)
        return cls.byFeatureID(f)

    def getTestSet(self):
        """Get the canonical test set for this SVM model.

        This is the TestSet whose name is the training set name with
        "_test" appended.
        """
        name = self.testset.name + "_test"
        return TestSet.byName(name)

    def getModel(self):
        """Return the SVM model, loading it from file on first use.

        The type is a libsvm ctypes object.
        """
        if self.model is None:
            self.loadModel()
        return self.model

    def loadModel(self):
        """Load the model from file and return it.

        The filename is stored in the database.  Raises
        MissingDataException if no file name has been recorded.
        """
        if self.modfile is None:
            raise MissingDataException("No model file is defined.")
        print("[loadModel] %s" % (self.modfile,))
        self.model = svm_load_model(self._getPathName())
        print(self.model)
        return self.model

    def saveModel(self, filename=None):
        """Save the model to file.

        The filename can be specified as an argument, overriding normal
        behaviour (it is then also stored in the record).  Otherwise it
        is sought in the database or constructed from the key using a
        standard formula.
        """
        if filename is not None:
            self.modfile = filename
        svm_save_model(self._getPathName(), self.model)

    def _getPathName(self):
        """Get the full path name of the SVM model file.

        If no file name has been specified in the record, one is
        generated from the feature key and stored in the record.
        """
        if self.modfile is None:
            self.modfile = self.feature.key + ".svmodel"
        return config.get("sql", "modeldir") + "/" + self.modfile

    # Grid search and training

    def _getproblem(self):
        "Get a libSVM problem object defining the problem for training."
        (labels, fv, img) = self.testset.getFeatures(self.getScaleModel())
        print("Labels: %s" % (len(labels),))
        print("Feature vectors: %s" % (len(fv),))
        print("Dimension: %s" % (len(fv[0]),))
        return svm_problem(labels, fv)

    def gridOpt(self, **kw):
        "Set the grid search options; see gridOptions for valid keywords."
        self.gridopt = gridOptions(config, **kw)

    def _getGridOpt(self):
        "Return the gridOptions object to use, with `fold` filled in."
        # The grid options may have been set in different ways.
        # 1. grid options may have been set externally by the gridOpt()
        #    method
        if self.gridopt is not None:
            opt = self.gridopt
        # 2. Otherwise we need to create a grid options object now.
        else:
            opt = gridOptions(config)
        # 3. fold may have been set separately in the SQL record, in
        #    which case this overrides the grid options object.
        #    Note that this is the only option which may be stored in
        #    the DB.
        if self.fold is not None:
            opt.fold = self.fold
        # 4. If fold is still not set, we select it based on TestSet size.
        elif opt.fold is None:
            N = self.testset.count()
            if N < 1600:
                opt.fold = 5
            elif N < 2400:
                opt.fold = 4
            elif N < 4000:
                opt.fold = 3
            else:
                opt.fold = 2
        return opt

    def _gridSearch(self):
        """Perform the grid search with cross-validation.

        The selected parameters and the rate reported by the search are
        stored in the C, gamma and xrate columns.  Returns (C, gamma).
        """
        P = self._getproblem()
        opt = self._getGridOpt()
        print("[SVModel._gridSearch] got problem")
        (db, (c1, g1), (c, g), rate) = gridsearch(P, opt)
        self.C = c
        self.gamma = g
        self.xrate = rate
        print("[SVModel._gridSearch] returning %s" % ((c, g),))
        return (c, g)

    def train(self):
        """Train the classifier.  Do grid search if necessary.

        The scaling model is (re)calculated first.  The grid search is
        only run when C has not been set.  Returns the trained model.
        """
        verbosity = config.getVerbosity("sql")
        if verbosity > 0:
            print("[SVModel.train] Scaling ...")
            sys.stdout.flush()
        self._scale(verbosity=verbosity)
        if verbosity > 0:
            print("[SVModel.train] Scaling done")
            sys.stdout.flush()
        if self.C is None:
            self._gridSearch()
        return self._train(verbosity=verbosity)

    def _train(self, verbosity=1):
        "Train with the stored C and gamma; return and keep the model."
        param = svm_parameter("")
        param.C = self.C
        param.gamma = self.gamma
        if verbosity > 0:
            print("[SVModel.train] Obtaining problem %s"
                  % ((param.C, param.gamma),))
            sys.stdout.flush()
        P = self._getproblem()
        if verbosity > 0:
            print("[SVModel.train] Starting training")
            sys.stdout.flush()
        self.model = svm_train(P, param)
        if verbosity > 0:
            print("[SVModel.train] trained")
            sys.stdout.flush()
        return self.model

    # Retrieving and scaling features

    def _scale(self, verbosity=1):
        """Calculate the scaling model for the feature vector based on
        the given training set, and store it in the `scaling` column.

        This is exceedingly slow and 40-50% of the activity is on the
        SQL server.
        """
        if verbosity > 0:
            print("[_scale] Creating Scaling rows")
            sys.stdout.flush()
        self.scaling = ScaleModel.calc(self.fvector.features,
                                       self.testset, verbosity)
        if verbosity > 0:
            print("[_scale] Returning")
            sys.stdout.flush()
        return

    def getScaleModel(self):
        """Get the scaling model from the database.

        The value of the `scaling` column is returned unchanged, i.e. the
        object stored by _scale(), or None if no scaling has been
        calculated.
        NOTE(review): an earlier version of this docstring described the
        return value as a pair (factor, addterm) of lists, where addterm
        is subtracted and factor multiplied; confirm how ScaleModel
        exposes these.
        """
        return self.scaling
def predict(model, img):
    """Run the SVM classifier given by model on the images img.

    The first parameter is an SVModel object.  The second argument
    may be an Image, a TestSet (or the name of one, as a str), or an
    iterable of Image objects.

    Every score is also recorded as a value of the model's feature.

    The return value is a list of tuples
    (image, score, predicted label, given label), where the score is the
    classification score (soft information).  The given label is 0 for a
    single image and None for an iterable of images.
    Note that if img is a TestSet, accuracy is calculated but not
    returned.

    Raises TypeError if img is of none of the supported kinds, and
    AssertionError if the classifier returns a score which does not have
    length 1.
    """
    verbosity = config.getVerbosity("sql")
    scale = model.getScaleModel()
    if verbosity > 0:
        print("[predict] %s" % (scale,))
        sys.stdout.flush()
    # This should be the key of a TestSet object
    if isinstance(img, str):
        img = TestSet.byName(img)
    # This should be a single Image object
    if hasattr(img, "features"):
        print("[predict] one image %s" % (img,))
        fv = [img.getFeatures(scale)]
        labels = [0]
        imlist = [img]
    # This should be a TestSet and includes labels
    elif hasattr(img, "getFeatures"):
        print("[predict] getFeatures %s labels included" % (img,))
        (labels, fv, imlist) = img.getFeatures(scale)
    # This is an iterator over images, and could be a list or an ImageSet
    elif hasattr(img, "__iter__"):
        print("[predict] iterator")
        pairs = [(im.getFeatures(scale), im) for im in img]
        fv = [f for (f, _) in pairs]
        imlist = [i for (_, i) in pairs]
        labels = [None] * len(fv)
    # If none of the above is right, we are stuffed
    else:
        raise TypeError("Do not know how to handle type %s" % (type(img),))
    if verbosity > 0:
        print("[predict] %s %s" % (len(fv), len(fv[0])))
        sys.stdout.flush()
    (plabel, acc, score) = svm_predict(labels, fv, model.getModel())
    if verbosity > 0:
        print("[predict] svm_predict has returned.")
        sys.stdout.flush()
    for s in score:
        if len(s) != 1:
            print(s)
            raise AssertionError("The score should have length 1.")
    score = [s[0] for s in score]
    # NOTE(review): every entry of imlist is *called* here; confirm that
    # the image objects produced by all three branches above are callable.
    for (im, val) in zip(imlist, score):
        model.feature.addValue(im(), val)
    return list(zip(imlist, score, plabel, labels))
def newModel(fskey, key, fvkey, tset, fsdesc=None, desc=None, fold=None):
    """Return the SVModel with the given key, creating it if necessary.

    fskey  -- key of the FeatureSet holding the classifier output feature;
              the feature set is created if the lookup fails.
    key    -- key of the Feature representing the classifier output.
    fvkey  -- key of the FeatureVector to classify on.
    tset   -- name of the TestSet used as training set.
    fsdesc -- description used if the feature set has to be created.
    desc   -- description used if the feature has to be created.
    fold   -- value for the fold column of a newly created model.

    If the feature or the model exists already (duplicate error on
    creation), the existing record is reused, provided that it is
    consistent with the arguments; otherwise dberrors.IntegrityError is
    raised.  Any other error on creation is re-raised.
    """
    T = TestSet.byName(tset)
    try:
        fset = FeatureSet.byKey(fskey)
    except Exception:
        # Lookup failed; take it to mean that the feature set does not
        # exist yet.  (Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed.)
        fset = FeatureSet(key=fskey, matrix=False, symidx=False,
                          description=fsdesc)
    try:
        feature = Feature(cat=fset, key=key, description=desc)
    except StandardError as e:
        if not isDuplicateError(e):
            print("[newModel] Unknown error.")
            # Bare raise keeps the original traceback.
            raise
        feature = Feature.byKey(key)
        if feature.cat != fset:
            raise dberrors.IntegrityError(
                "Clash with existing feature.  Feature set does not match.")
    fv = FeatureVector.byKey(fvkey)
    try:
        mod = SVModel(testset=T, feature=feature, fvector=fv, fold=fold)
    except StandardError as e:
        if not isDuplicateError(e):
            print("[newModel] Unknown error.")
            raise
        mod = SVModel.byFeatureID(feature)
        if mod.testset != T:
            raise dberrors.IntegrityError(
                "Clash with existing SVModel.  Test set does not match.")
        if mod.fvector != fv:
            raise dberrors.IntegrityError(
                "Clash with existing SVModel.  "
                "Feature vector does not match.")
    return mod
def testSVM(training=False, key=None):
    """Queue performance tests for every SVM model which has a model file.

    The test set is selected per model as follows:

    key      -- if given, the name of the TestSet to use for all models;
    training -- otherwise, if true, each model's own training set is used;
    otherwise the set named after the training set with "_test" appended.

    Models without a model file are reported and skipped.  Nothing is
    returned; the number of newly queued tasks is printed.
    """
    count = 0
    for m in SVModel.select():
        if m.modfile is None:
            print("Model has not been trained")
            print(m)
        else:
            if key is not None:
                k = key
            elif training:
                k = m.testset.name
            else:
                k = m.testset.name + "_test"
            print("[testSVM] %s" % (k,))
            T = TestSet.byName(k)
            # perfQueue returns None when the test has been run already.
            if perfQueue(T, m) is not None:
                count += 1
    print("[testSVM] %s new tasks queued" % (count,))
def perfQueue(T, m):
    """Queue a performance test of the model m on the test set T.

    If SVMPerformance records exist already for this combination, they
    are printed and None is returned.  Otherwise a new Queue entry is
    created, printed, and returned.
    """
    existing = SVMPerformance.selectBy(svmodel=m, testset=T)
    N = existing.count()
    if N > 0:
        print("[perfQueue] Model has been evaluated. %s" % (N,))
        print(m)
        print(T)
        for ev in existing:
            print(ev)
        return None
    q = Queue(image=None, entered=datetime.now(), testset=T, svmodel=m)
    print("New test queued")
    print(m)
    print(q)
    return q
def newSVM(trainingset, fvlist, fold=None, fsdesc=None):
    """Create and queue SVM models for a training set.

    trainingset -- a TestSet object or the name of one.  If None, the
                   function is applied to every TestSet whose name does
                   not end in "_test".
    fvlist      -- FeatureVector objects or keys; if empty or None, all
                   FeatureVector records are used.
    fold        -- value for the fold column of the new models.
    fsdesc      -- description used if the feature set has to be created.

    For each feature vector which has no model on the training set yet, a
    Feature (key "<trainingset>-<fvkey>"), an SVModel and a Queue entry
    are created.  The features are collected in the feature set with key
    "<trainingset>SVM".

    Returns the number of new models.  Raises dberrors.IntegrityError if
    a feature with the model key exists in a different feature set.
    """
    newcount = 0
    verbosity = config.getVerbosity("sql")
    if trainingset is None:
        for t in TestSet.select():
            if not t.name.endswith("_test"):
                newcount += newSVM(t, fvlist, fold, fsdesc)
        if verbosity > 0:
            print("[newSVM] total of %s new SVM models" % (newcount,))
        return newcount
    if not fvlist:
        fvlist = list(FeatureVector.select())
    if isinstance(trainingset, SQLObject):
        T = trainingset
        trainingset = T.name
    else:
        T = TestSet.byName(trainingset)
    fskey = trainingset + "SVM"
    try:
        fset = FeatureSet.byKey(fskey)
    except Exception:
        # Lookup failed; take it to mean that the feature set does not
        # exist yet.  (Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed.)
        fset = FeatureSet(key=fskey, matrix=False, symidx=False,
                          description=fsdesc)
    if verbosity > 0:
        print("[newSVM] %s" % (fset,))
    for fvkey in fvlist:
        # Accept both FeatureVector objects and their keys.
        if isinstance(fvkey, SQLObject):
            fv = fvkey
            fvkey = fv.key
        else:
            fv = FeatureVector.byKey(fvkey)
        modlist = SVModel.selectBy(fvector=fv, testset=T)
        modcount = modlist.count()
        modkey = trainingset + "-" + fvkey
        desc = "SVM model (%s,%s)" % (trainingset, fvkey)
        if modcount > 0:
            if verbosity > 0:
                print("%s %s existing models" % (modkey, modcount))
            if verbosity > 1:
                for m in modlist:
                    print(m)
        else:
            try:
                feature = Feature(cat=fset, key=modkey, description=desc)
            except StandardError as e:
                if not isDuplicateError(e):
                    print("[newSVM] Unknown error.")
                    # Bare raise keeps the original traceback.
                    raise
                feature = Feature.byKey(modkey)
                if feature.cat != fset:
                    raise dberrors.IntegrityError(
                        "Clash with existing feature.  "
                        "Feature set does not match.")
            mod = SVModel(testset=T, feature=feature, fvector=fv, fold=fold)
            q = Queue(image=None, entered=datetime.now(), svmodel=mod)
            if verbosity > 1:
                print("new %s" % (mod,))
                print(q)
            newcount += 1
    if verbosity > 0:
        print("[newSVM] %s new SVM models" % (newcount,))
    return newcount