[Scipy-svn] r3150 - in trunk/Lib/sandbox/svm: . examples

scipy-svn@scip... scipy-svn@scip...
Mon Jul 9 06:15:26 CDT 2007


Author: cdavid
Date: 2007-07-09 06:15:15 -0500 (Mon, 09 Jul 2007)
New Revision: 3150

Added:
   trunk/Lib/sandbox/svm/examples/
   trunk/Lib/sandbox/svm/examples/classification.py
   trunk/Lib/sandbox/svm/examples/classification2.py
   trunk/Lib/sandbox/svm/examples/utils.py
Log:
Add 2 examples of classification using svm, with CV

Added: trunk/Lib/sandbox/svm/examples/classification.py
===================================================================
--- trunk/Lib/sandbox/svm/examples/classification.py	2007-07-08 07:32:07 UTC (rev 3149)
+++ trunk/Lib/sandbox/svm/examples/classification.py	2007-07-09 11:15:15 UTC (rev 3150)
@@ -0,0 +1,92 @@
+#! /usr/bin/env python
+# Last Change: Mon Jul 09 08:00 PM 2007 J
+
__doc__ = """Example of doing classification with a Support Vector Machine,
using a grid search with cross validation over the cost/gamma
hyper-parameters of an RBF kernel.

We use the famous iris database used by Sir R.A. Fisher. You can try to change
the attributes used for classification, the cost/gamma grid, etc..."""
+
+import numpy as N
+import pylab as P
+import matplotlib as MPL
+
+from scipy.sandbox import svm
+import utils
+
# NOTE(review): this module-level load appears unused -- get_data() below
# reloads the dataset itself; confirm before removing.
data = utils.iris.load()
+
def get_data(xattr, yattr, ln):
    """Given x and y attribute names, return (llabel, lsamples, tlabel,
    tsamples, cnames, lnames), where the first couple (label, samples) is
    for training ("learning"), the second one for testing.

    For each class, the first ln samples are used for training, the
    remaining ones for testing."""
    data = utils.iris.load()
    cnames = data.keys()
    # Map each class name to an integer label.
    lnames = dict((name, idx) for idx, name in enumerate(cnames))

    lxdata = {}
    lydata = {}
    txdata = {}
    tydata = {}
    for i in cnames:
        lxdata[i] = data[i][xattr][:ln]
        txdata[i] = data[i][xattr][ln:]
        lydata[i] = data[i][yattr][:ln]
        tydata[i] = data[i][yattr][ln:]

    # Number of testing samples per class: whatever is left after the first
    # ln samples. The original code wrongly used ln here as well, which only
    # worked because iris has exactly 2 * 25 samples per class.
    tn = dict((i, len(txdata[i])) for i in cnames)

    llabel = N.concatenate([lnames[i] * N.ones(ln, N.int) for i in cnames])
    tlabel = N.concatenate([lnames[i] * N.ones(tn[i], N.int) for i in cnames])

    lx = N.concatenate([lxdata[i] for i in cnames])
    ly = N.concatenate([lydata[i] for i in cnames])
    tx = N.concatenate([txdata[i] for i in cnames])
    ty = N.concatenate([tydata[i] for i in cnames])

    return llabel, N.vstack((lx, ly)).T, tlabel, \
           N.vstack((tx, ty)).T, cnames, lnames
+
#--------------------
# Data pre processing
#--------------------
# We use 25 samples of each class (eg half of the iris dataset) for
# learning, and the other half for testing. We use sepal width and petal width
# only as features.
ln = 25
llabel, ldata, tlabel, tdata, cnames, lnames = get_data('sepal width', 'petal width', ln)

# Wrap training samples/labels and testing samples into libsvm dataset objects.
training = svm.LibSvmClassificationDataSet(llabel, ldata)
testing = svm.LibSvmTestDataSet(tdata)
+
def train_svm(cost, gamma, fold = 5):
    """Return the fold-fold cross-validation score of an RBF-kernel SVM
    trained on the module-level training set, for the given cost and gamma."""
    rbf = svm.RBFKernel(gamma = gamma)
    classifier = svm.LibSvmCClassificationModel(rbf, cost = cost)
    return classifier.cross_validate(training, fold)
+
# Hyper-parameter grid: cost and gamma are both log2-distributed.
c_range = N.exp(N.log(2.) * N.arange(-5, 15))
g_range = N.exp(N.log(2.) * N.arange(-15, 3))

# Train one svm per (cost, gamma) couple of the grid, keeping the
# cross-validation score of each.
gr = N.meshgrid(c_range, g_range)
c = gr[0].flatten()
g = gr[1].flatten()
cv = N.empty(c.size)
for i in range(cv.size):
    cv[i] = train_svm(c[i], g[i])

# Contour plot of the cross-validation score against log2(cost) and
# log2(gamma). P.show() must come last: it enters the GUI main loop (the
# original called P.legend() after P.show(); it was never reached until the
# window closed, and there are no labelled artists to put in a legend anyway).
v = P.contour(N.log2(gr[0]), N.log2(gr[1]), cv.reshape(g_range.size, c_range.size), 12)
P.clabel(v, inline = 1, fontsize = 10)
P.show()

Added: trunk/Lib/sandbox/svm/examples/classification2.py
===================================================================
--- trunk/Lib/sandbox/svm/examples/classification2.py	2007-07-08 07:32:07 UTC (rev 3149)
+++ trunk/Lib/sandbox/svm/examples/classification2.py	2007-07-09 11:15:15 UTC (rev 3150)
@@ -0,0 +1,50 @@
+#! /usr/bin/env python
+# Last Change: Mon Jul 09 07:00 PM 2007 J
+
__doc__ = """Example of doing classification with a Support Vector Machine on
the German credit dataset, using a grid search with cross validation over the
cost/gamma hyper-parameters of an RBF kernel."""
+
+import numpy as N
+import pylab as P
+import matplotlib as MPL
+
+from scipy.sandbox import svm
+import utils
+
+from scikits.learn.datasets import german
data = german.load()

# Stack the 24 attributes of the german dataset into one (n, 24) float array;
# labels are taken as-is.
features = N.vstack([data['feat']['feat' + str(i)].astype(N.float) for i in range(1, 25)]).T
label = data['label']

# Scale the features in place (utils.scale modifies its argument); t and s
# would allow recovering the original values.
t, s = utils.scale(features)

training = svm.LibSvmClassificationDataSet(label, features)
+
def train_svm(cost, gamma, fold = 5):
    """Train an RBF-kernel SVM with the given cost and gamma on the
    module-level training set and return its fold-fold CV score."""
    model = svm.LibSvmCClassificationModel(svm.RBFKernel(gamma = gamma),
                                           cost = cost)
    score = model.cross_validate(training, fold)
    return score
+
+c_range = N.exp(N.log(2.) * N.arange(-5, 15))
+g_range = N.exp(N.log(2.) * N.arange(-15, 3))
+
+# Train the svm on a log distributed grid
+gr = N.meshgrid(c_range, g_range)
+c = gr[0].flatten()
+g = gr[1].flatten()
+cf = N.hstack((c, g))
+cv = N.empty(c.size)
+for i in range(cv.size):
+    print "=============== iteration %d / %d ============" % (i, cv.size)
+    cv[i] = train_svm(c[i], g[i])
+
+v = P.contour(gr[0], gr[1], cv.reshape(g_range.size, c_range.size), 8)
+P.show()

Added: trunk/Lib/sandbox/svm/examples/utils.py
===================================================================
--- trunk/Lib/sandbox/svm/examples/utils.py	2007-07-08 07:32:07 UTC (rev 3149)
+++ trunk/Lib/sandbox/svm/examples/utils.py	2007-07-09 11:15:15 UTC (rev 3150)
@@ -0,0 +1,64 @@
+#! /usr/bin/env python
+# Last Change: Mon Jul 09 05:00 PM 2007 J
+
+# Various utilities for examples 
+
+import numpy as N
+from numpy.testing import set_package_path, restore_path
+
+from scikits.learn.datasets import oldfaithful, pendigits, iris
+
def get_faithful():
    """Return the Old Faithful data as a nx2 float array, first column being
    the waiting time, second being the eruption duration.

    Rows whose first field is one of the symbolic 'L', 'M' or 'S' markers
    are dropped; the remaining waiting times are converted to numbers.
    (The original docstring claimed duration came first, which contradicts
    the code below.)"""
    data = oldfaithful.load()
    waiting = []
    duration = []
    for i in data:
        # Skip rows carrying a symbolic label instead of a number.
        if not i[0] in ('L', 'M', 'S'):
            waiting.append(i[0])
            duration.append(i[1])

    waiting = N.array([int(i) for i in waiting], dtype = N.float)
    duration = N.array(duration, dtype = N.float)

    return N.concatenate((waiting[:, N.newaxis], duration[:, N.newaxis]), 1)
+
def get_pendigits():
    """Return the pendigits training data as a couple (x, y) of the 'x' and
    'y' arrays of the training set."""
    data = pendigits.load()
    return data['training']['x'], data['training']['y']
+
def scale(data, mode = 'sym'):
    """Linearly scale data in place, column by column.

    In 'sym' mode (the default) each column ends up in the range [-1..1];
    in 'right' mode each column ends up in [0..1]. (The original docstring
    claimed [0..1] for both modes, which is wrong for 'sym'.)

    Returns the translation factor t and scaling factor s. You can retrieve
    the original values with data = s * scaled + t.

    Raises ValueError for an unknown mode. Note that a constant column
    yields a zero scale factor, hence a division by zero."""
    lo = N.min(data, 0)
    hi = N.max(data, 0)
    span = hi - lo
    if mode == 'sym':
        # Center on the middle of the range, half-range as scale: [-1, 1].
        s = 0.5 * span
        t = lo + s
    elif mode == 'right':
        # Translate to the minimum, full range as scale: [0, 1].
        s = span
        t = lo
    else:
        raise ValueError("Mode %s not recognized" % mode)

    data -= t
    data /= s
    return t, s
+
if __name__ == '__main__':
    # Quick visual check: scale a random array with both modes and print
    # the results ('sym' should land in [-1, 1], 'right' in [0, 1]).
    a = N.random.randn(10, 2)
    b = a.copy()
    scale(a)
    print a
    scale(b, 'right')
    print b



More information about the Scipy-svn mailing list