# (C) 2012: Hans Georg Schaathun <georg@schaathun.net>
"""
The main feature of this module is the cStat() function which plots
bar charts of accuracy and/or FP/FN rates for subgroups of the
test set divided according to some given feature. The charts make
a basis for assessing the feature as a cover selection heuristic.
There is also an under-documented iStat() function which is used
to check cover selections created as an intersection of two or
more existing selections.
The other functions are auxiliaries, but may be useful for variations on
the theme.
"""
__all__ = [ "cStat", "iStat", "mcStat" ]
from sqlobject.sqlbuilder import *
import matplotlib.pyplot as plt
import scipy
from . import *
def sign(x):
    """
    Return the sign of x as a plain int: -1, 0 or 1.

    This uses ordinary comparisons instead of scipy.sign, which is
    only a re-export of the NumPy function in the SciPy root namespace
    and is deprecated there.  A NaN argument compares false both ways
    and therefore yields 0 (the scipy-based version raised ValueError
    when converting NaN to int).
    """
    return int(x > 0) - int(x < 0)
def csList(testset, feature, score, reverse=False):
    """
    Take a TestSet, a feature to use as cover selection heuristic,
    and a classification score (feature).  Return a list of triplets
    (feature value, classification score, true label) for all
    the images in the TestSet.  The list is sorted by the feature
    value, in descending order if reverse is true.

    testset may be given either as an SQLObject instance or as a
    name, which is then looked up with TestSet.byName().  If score
    is a list of score names, the second element of each triplet is
    the list of the corresponding score values, in the same order.
    """
    if not isinstance(testset, SQLObject):
        testset = TestSet.byName(testset)
    if isinstance(score, list):
        def getScore(im):
            return [im.getOneFeature(s) for s in score]
    else:
        def getScore(im):
            return im.getOneFeature(score)
    L = [(im.getCoverFeature(feature), getScore(im), im.label)
         for im in testset]
    # A key function gives the same stable ordering as the former
    # cmp-style comparison of the feature values, but also works on
    # Python 3, where list.sort() no longer accepts a cmp function.
    L.sort(key=lambda triplet: triplet[0], reverse=reverse)
    return L
def cskList(ls, reverse=False):
    """
    Take an iterable of pairs (k,l), sort it by l and return a list
    of all the k values.  The input is not modified.

    The sort is stable: pairs with equal l keep their relative order,
    also when reverse is true.
    """
    # Sorting on a key works on both Python 2 and 3; the former
    # cmp-function form is Python 2 only.
    ranked = sorted(ls, key=lambda pair: pair[1], reverse=reverse)
    return [k for (k, _) in ranked]
def csInt(testset, fl, score, N=1500):
    """
    Form the intersection of several cover selections.

    fl is a non-empty list of pairs (feature, reverse).  For each
    pair the images of the test set are sorted by the given cover
    feature (descending if reverse is true) and the first N are
    selected.  The images present in every selection are returned
    as a list of triplets (None, classification score, true label),
    i.e. the format expected by estat(), in no particular order.

    testset may be given either as an SQLObject instance or as a
    name, which is then looked up with TestSet.byName().
    """
    if not isinstance(testset, SQLObject):
        testset = TestSet.byName(testset)
    # Iterate the test set once and reuse the same image objects for
    # every feature, so that the set intersection compares like with like.
    images = list(testset)
    selections = [
        set(cskList([(im, im.getCoverFeature(f)) for im in images], r)[:N])
        for (f, r) in fl
    ]
    # set.intersection replaces reduce(), which is not a builtin on
    # Python 3; like reduce() it raises TypeError if fl is empty.
    S = set.intersection(*selections)
    return [(None, im.getOneFeature(score), im.label) for im in S]
def iStat(*a, **kw):
    """
    Test intersection of multiple cover selections.

    All arguments are passed on to csInt().  The return value is a
    4-tuple (size of the intersection, accuracy, FP rate, FN rate),
    where the last three elements are as computed by estat().
    """
    print("[iStat]")
    L = csInt(*a, **kw)
    (accuracy, fpRate, fnRate) = estat(L)
    return (len(L), accuracy, fpRate, fnRate)
def predict(score):
    """
    Turn a classification score into a hard decision: strictly
    positive scores give label 0, everything else gives label 1.
    """
    return 0 if score > 0 else 1
def pl(L):
    """
    Map a list of triplets (feature value, classification score,
    true label) to a list of pairs (predicted label, true label),
    using predict() for the hard decision.
    """
    pairs = []
    for (_feature, score, label) in L:
        pairs.append((predict(score), label))
    return pairs
def estat(L):
    """
    Compute error statistics for a list of triplets (feature value,
    classification score, true label).

    Return a triplet (accuracy, FP rate, FN rate) where
      accuracy = (TP+TN)/N, N being the length of the list,
      FP rate  = FP/(FP+TN),
      FN rate  = FN/(FN+TP).
    A positive is a prediction of 1 as made by predict(); only
    labels 0 and 1 are counted as true/false positives/negatives.

    A statistic whose denominator is zero (empty list, or no image
    of the relevant class) is undefined and returned as NaN instead
    of raising ZeroDivisionError, so that a single degenerate bin
    does not abort a whole chart.
    """
    R = pl(L)
    FP = float(R.count((1, 0)))
    TP = float(R.count((1, 1)))
    FN = float(R.count((0, 1)))
    TN = float(R.count((0, 0)))
    N = len(R)
    undefined = float("nan")
    negatives = FP + TN
    positives = FN + TP
    accuracy = (TP + TN) / N if N else undefined
    fpRate = FP / negatives if negatives else undefined
    fnRate = FN / positives if positives else undefined
    return (accuracy, fpRate, fnRate)
def split(L, bins=10):
    """
    Divide the list L into bins consecutive sublists.

    The first bins-1 sublists each have len(L)//bins elements; the
    last one takes whatever remains, so it may be longer.  If L has
    fewer than bins elements, all sublists but the last are empty.
    Raises ZeroDivisionError if bins is 0.
    """
    # Floor division is required: the result is used as a slice index,
    # and a plain "/" yields a float on Python 3.
    m = len(L) // bins
    head = [L[i * m:(i + 1) * m] for i in range(bins - 1)]
    return head + [L[(bins - 1) * m:]]
def getEstat(L, bins=10):
    """
    Divide the list L of triplets into bins groups using split() and
    return the list of estat() results, one triplet (accuracy,
    FP rate, FN rate) per group, in the same order as the groups.
    """
    # A real list is returned (rather than the result of map(), which
    # is a one-shot iterator on Python 3) because the callers traverse
    # the result more than once.
    return [estat(group) for group in split(L, bins)]
def mcStat(testset, feature, score, bins=10, reverse=False,
           aplot="/tmp/test.pdf"):
    """
    Make a bar chart of accuracies for different groups of covers.
    The covers are divided into bins bins according to the cover
    heuristics feature.  The accuracy is plotted for each of the
    classifier scores in the list score.  The plot is saved in the
    file aplot.

    score must be a non-empty list of score names; each name is also
    used as the legend label of its group of bars.

    The return value is the list of (accuracy, FP rate, FN rate)
    triplets, one per bin, for the LAST score in the list only.
    """
    print("[mcStat] %s" % (testset,))
    L = csList(testset, feature, score, reverse=reverse)
    N = len(score)
    # One list of triplets, and hence one list of statistics, per score.
    rl = [list(getEstat([(x, y[i], z) for (x, y, z) in L], bins))
          for i in range(N)]
    plt.figure()
    # No plt.hold(True) here: successive bar() calls have always added
    # to the same axes by default, and pyplot.hold no longer exists in
    # current Matplotlib.
    width = 0.8 / N
    col = ["k", "b", "r", "g", "y", "c", "m"]
    for (c, R) in enumerate(rl):
        X = [x for (x, y, z) in R]
        t = [i - 0.4 + width * c for i in range(len(X))]
        # align="edge" is given explicitly because the positions in t
        # are left edges, and the default alignment of bar() differs
        # between Matplotlib versions.  The colours are cycled so that
        # more than len(col) scores can be plotted.
        plt.bar(t, X, width=width, align="edge",
                color=col[c % len(col)], label=score[c])
    plt.legend(loc="best")
    plt.savefig(aplot)
    return rl[-1]
def cStat(testset, feature, score, bins=10, reverse=False,
          aplot=None, eplot=None):
    """
    Make bar charts of accuracy and error rates of the classification
    score score for different groups of covers.  The covers are divided
    into bins bins according to the cover heuristics feature.
    Error rates are plotted on the file eplot and accuracies on aplot.
    Either plot is skipped if the corresponding file name is None.

    The maximum and minimum of each plotted statistic are printed.
    The return value is the list of (accuracy, FP rate, FN rate)
    triplets, one per bin.
    """
    print("[cStat] %s" % (testset,))
    L = csList(testset, feature, score, reverse=reverse)
    # Force a real list, since R is traversed several times below.
    R = list(getEstat(L, bins))
    # The bar positions below are left edges, so align="edge" is given
    # explicitly; the default alignment of bar() differs between
    # Matplotlib versions.
    if aplot is not None:
        X = [x for (x, y, z) in R]
        t = [i - 0.4 for i in range(len(X))]
        print("%s %s" % (max(X), min(X)))
        plt.figure()
        plt.bar(t, X, align="edge")
        plt.savefig(aplot)
    if eplot is not None:
        Y = [y for (x, y, z) in R]
        Z = [z for (x, y, z) in R]
        print("%s %s" % (max(Y), min(Y)))
        print("%s %s" % (max(Z), min(Z)))
        plt.figure()
        # No plt.hold(True) here: both bar() calls draw on the same axes
        # by default, and pyplot.hold no longer exists in current
        # Matplotlib.
        b1 = plt.bar([i - 0.4 for i in range(len(Y))], Y,
                     color="b", width=0.4, align="edge")
        b2 = plt.bar(range(len(Y)), Z, color="r", width=0.4, align="edge")
        plt.legend((b1[0], b2[0]), ("FP-rate", "FN-rate"), loc="best")
        plt.savefig(eplot)
    return R