[Scipy-svn] r2291 - in trunk/Lib/sandbox/pyem: . tests

scipy-svn at scipy.org scipy-svn at scipy.org
Thu Oct 19 23:01:19 CDT 2006


Author: cdavid
Date: 2006-10-19 23:01:05 -0500 (Thu, 19 Oct 2006)
New Revision: 2291

Added:
   trunk/Lib/sandbox/pyem/tests/generate_test_data.py
   trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
   trunk/Lib/sandbox/pyem/tests/test_online_em.py
Modified:
   trunk/Lib/sandbox/pyem/Changelog
   trunk/Lib/sandbox/pyem/TODO
   trunk/Lib/sandbox/pyem/__init__.py
   trunk/Lib/sandbox/pyem/_c_densities.py
   trunk/Lib/sandbox/pyem/example.py
   trunk/Lib/sandbox/pyem/info.py
   trunk/Lib/sandbox/pyem/online_em.py
   trunk/Lib/sandbox/pyem/setup.py
   trunk/Lib/sandbox/pyem/tests/test_densities.py
Log:
 * Convert the online_em.py script to a class, OnGMM
 * Add tests for OnGMM
 * Small bugfixes in _c_densitites (convert general exception 
 to ImportException when wrong ctypes version found), setup 
 (remove the name attribute to avoid warning when installing)



Modified: trunk/Lib/sandbox/pyem/Changelog
===================================================================
--- trunk/Lib/sandbox/pyem/Changelog	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/Changelog	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,3 +1,12 @@
+pyem (0.5.4) Fri, 20 Oct 2006 12:52:01 +0900
+
+	* Bump to 0.5.4
+	* Online EM is basically implemented, with tests. The API
+	should be fixed to be choherent (lacks the Trainer Class, too).
+	The class is not imported by default, still (available as _OnGMM)
+
+-- David Cournapeau <david at ar.media.kyoto-u.ac.jp> 
+
 pyem (0.5.3) Thu, 12 Oct 2006 21:08:21 +0900
 
 	* Change the layout and setup.py for inclusion to scipy.

Modified: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/TODO	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,19 +1,12 @@
-# Last Change: Thu Oct 05 03:00 PM 2006 J
+# Last Change: Fri Oct 20 12:00 PM 2006 J
 
 Things which must be implemented for a 1.0 version (in importante order)
-    - test for various length and model size 
+    - A class for learning + a classifier
+    - test for various length and model size of gaussian densities/GM and GMM
     - a small help/tutorial
     - be complient with scipy module dev guidelines (DEVELOPERS.TXT)
-    - On-line EM
-    - use scipy.cluster.vq instead of custom kmeans algo (find
-    a way to get label from vq.kmeans: depends on scipy changes)
-    - C implementation of Gaussian densities at least (partially done,
-    but need integration into distutils + portable way of detecting/
-    loading shared library with ctypes)
-    - numpy computations make huge difference between C and F storage: 
-    should review the code to see if the
 
-Things which would be nice:
+Things which would be nice (after 1.0 version):
     - Integrate libem (libem should be modified so
     that it would be easy to package and distribute)
     - Other initialization schemes

Modified: trunk/Lib/sandbox/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/__init__.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/__init__.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,10 +1,14 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 12 11:00 PM 2006 J
+# Last Change: Fri Oct 20 11:00 AM 2006 J
 
 from info import __doc__
 
+from gauss_mix import GmParamError, GM
+from gmm_em import GmmParamError, GMM, EM
+from online_em import OnGMM as _OnGMM
+
+__all__ = filter(lambda s:not s.startswith('_'),dir())
+
 from numpy.testing import NumpyTest
 test = NumpyTest().test
 
-from gauss_mix import GmParamError, GM
-from gmm_em import GmmParamError, GMM, EM

Modified: trunk/Lib/sandbox/pyem/_c_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/_c_densities.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/_c_densities.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Tue Oct 03 05:00 PM 2006 J
+# Last Change: Thu Oct 19 06:00 PM 2006 J
 
 # This module uses a C implementation through ctypes, for diagonal cases
 # TODO:
@@ -22,7 +22,7 @@
 if ctypes_major < 1:
     msg =  "version of ctypes is %s, expected at least %s" \
             % (ctypes.__version__, '1.0.0')
-    raise Exception(msg)
+    raise ImportError(msg)
 
 # Requirements for diag gden
 _gden   = load_library('c_gden.so', __file__)

Modified: trunk/Lib/sandbox/pyem/example.py
===================================================================
--- trunk/Lib/sandbox/pyem/example.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/example.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -49,7 +49,7 @@
 # Keep a copy for drawing later
 gm0 = copy.copy(lgm)
 
-# The actual EM, with likelihood computation. The treshold
+# The actual EM, with likelihood computation. The threshold
 # is compared to the (linearly appromixated) derivative of the likelihood
 em      = EM()
 like    = em.train(data, gmm, maxiter = 30, thresh = 1e-8)
@@ -60,6 +60,8 @@
 import pylab as P
 P.subplot(2, 1, 1)
 
+# Level is the confidence level for confidence ellipsoids: 1.0 means that
+# all points will be (almost surely) inside the ellipsoid
 level   = 0.8
 if not d == 1:
     P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')

Modified: trunk/Lib/sandbox/pyem/info.py
===================================================================
--- trunk/Lib/sandbox/pyem/info.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/info.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -30,7 +30,7 @@
 Copyright: David Cournapeau 2006
 License: BSD-style (see LICENSE.txt in main source directory)
 """
-version = '0.5.3'
+version = '0.5.4'
 
 depends = ['linalg', 'stats']
 ignore  = False

Modified: trunk/Lib/sandbox/pyem/online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/online_em.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/online_em.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,5 +1,5 @@
 # /usr/bin/python
-# Last Change: Mon Oct 16 03:00 PM 2006 J
+# Last Change: Fri Oct 20 12:00 PM 2006 J
 
 #---------------------------------------------
 # This is not meant to be used yet !!!! I am 
@@ -10,6 +10,13 @@
 #   - we do not have all the data before putting them in online EM:
 #   eg current frame depends on previous frame in some way.
 
+# TODO:
+#   - Add biblio
+#   - Look back at articles for discussion for init, regularization and 
+#   convergence rates
+#   - the function sufficient_statistics does not really return SS. This is not a
+#   big problem, but it would be better to really return them as the name implied.
+
 import numpy as N
 from numpy import mean
 from numpy.testing import assert_array_almost_equal, assert_array_equal
@@ -18,12 +25,12 @@
 from gauss_mix import GM
 from kmean import kmean
 
-print "======================================================"
-import traceback
-f = traceback.extract_stack()
-print f[0][0] + " This is beta code, don't use it !        "
-print "======================================================"
+import copy
+from numpy.random import seed
 
+# Clamp
+clamp   = 1e-8
+
 # Error classes
 class OnGmmError(Exception):
     """Base class for exceptions in this module."""
@@ -43,11 +50,15 @@
         return self.message
 
 class OnGMM(ExpMixtureModel):
-    def init_kmean(self, init_data, niter = 5):
+    """A Class for 'online' (ie recursive) EM. Insteand
+    of running the E step on the whole data, the sufficient statistics
+    are updated for each new frame of data, and used in the (unchanged)
+    M step"""
+    def init_random(self, init_data):
         """ Init the model at random."""
         k   = self.gm.k
         d   = self.gm.d
-        if mode == 'diag':
+        if self.gm.mode == 'diag':
             w           = N.ones(k) / k
 
             # Init the internal state of EM
@@ -66,14 +77,56 @@
                     mode %s yet""", mode)
 
         self.gm.set_param(w, mu, va)
-        self.cw     = w
-        self.cmu    = mu
-        self.cva    = va
+        # c* are the parameters which are computed at every step (ie
+        # when a new frame is taken into account
+        self.cw     = self.gm.w
+        self.cmu    = self.gm.mu
+        self.cva    = self.gm.va
 
-        self.pw     = self.cw.copy()
-        self.pmu    = self.cmu.copy()
-        self.pva    = self.cva.copy()
+        # p* are the parameters used when computing gaussian densities
+        # they are always the same than c* in the online case
+        self.pw     = self.cw
+        self.pmu    = self.cmu
+        self.pva    = self.cva
 
+    def init_kmean(self, init_data, niter = 5):
+        """ Init the model using kmean."""
+        k   = self.gm.k
+        d   = self.gm.d
+        if self.gm.mode == 'diag':
+            w           = N.ones(k) / k
+
+            # Init the internal state of EM
+            self.cx     = N.outer(w, mean(init_data, 0))
+            self.cxx    = N.outer(w, mean(init_data ** 2, 0))
+
+            # w, mu and va init is the same that in the standard case
+            (code, label)   = kmean(init_data, init_data[0:k, :], niter)
+            mu          = code.copy()
+            va          = N.zeros((k, d))
+            for i in range(k):
+                for j in range(d):
+                    va [i,j] = N.cov(init_data[N.where(label==i), j], rowvar = 0)
+        else:
+            raise OnGmmParamError("""init_online not implemented for
+                    mode %s yet""", mode)
+
+        self.gm.set_param(w, mu, va)
+        # c* are the parameters which are computed at every step (ie
+        # when a new frame is taken into account
+        self.cw     = self.gm.w
+        self.cmu    = self.gm.mu
+        self.cva    = self.gm.va
+
+        # p* are the parameters used when computing gaussian densities
+        # they are the same than c* in the online case
+        # self.pw     = self.cw.copy()
+        # self.pmu    = self.cmu.copy()
+        # self.pva    = self.cva.copy()
+        self.pw     = self.cw
+        self.pmu    = self.cmu
+        self.pva    = self.cva
+
     def __init__(self, gm, init_data, init = 'kmean'):
         self.gm = gm
         
@@ -90,7 +143,9 @@
         gamma   *= self.pw
         gamma   /= N.sum(gamma)
         # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
-        self.cw	= (1 - nu) * self.cw + nu * gamma
+        #self.cw	= (1 - nu) * self.cw + nu * gamma
+        self.cw	*= (1 - nu)
+        self.cw += nu * gamma
 
         return gamma
 
@@ -102,264 +157,117 @@
             self.cmu[k]  = self.cx[k] / self.cw[k]
             self.cva[k]  = self.cxx[k] / self.cw[k] - self.cmu[k] ** 2
     
-if __name__ == "__main__":
-    import copy
-    #=============================
-    # Simple GMM with 2 components
-    #=============================
-
-    #+++++++++++++++++++++++++++++
-    # Meta parameters of the model
-    #   - k: Number of components
-    #   - d: dimension of each Gaussian
-    #   - mode: Mode of covariance matrix: full or diag
-    #   - nframes: number of frames (frame = one data point = one
-    #   row of d elements
-    k       = 2 
+if __name__ == '__main__':
     d       = 1
+    k       = 2
     mode    = 'diag'
     nframes = int(1e3)
-    emiter  = 10
+    emiter  = 4
+    seed(5)
 
-    #+++++++++++++++++++++++++++++++++++++++++++
-    # Create an artificial GMM model, samples it
-    #+++++++++++++++++++++++++++++++++++++++++++
-    print "Generating the mixture"
+    #+++++++++++++++++++++++++++++++++++++++++++++++++
     # Generate a model with k components, d dimensions
+    #+++++++++++++++++++++++++++++++++++++++++++++++++
     w, mu, va   = GM.gen_param(d, k, mode, spread = 1.5)
     gm          = GM.fromvalues(w, mu, va)
-
     # Sample nframes frames  from the model
-    data    = gm.sample(nframes, )
+    data        = gm.sample(nframes)
 
-    #++++++++++++++++++++++++
-    # Learn the model with EM
-    #++++++++++++++++++++++++
-
+    #++++++++++++++++++++++++++++++++++++++++++
+    # Approximate the models with classical EM
+    #++++++++++++++++++++++++++++++++++++++++++
     # Init the model
-    print "Init a model for learning, with kmean for initialization"
     lgm = GM(d, k, mode)
     gmm = GMM(lgm, 'kmean')
     gmm.init(data)
 
-    # Keep the initialized model for drawing
-    gm0 = copy.copy(lgm)
-
+    gm0    = copy.copy(gmm.gm)
     # The actual EM, with likelihood computation
     like    = N.zeros(emiter)
-    print "computing..."
     for i in range(emiter):
         g, tgd  = gmm.sufficient_statistics(data)
         like[i] = N.sum(N.log(N.sum(tgd, 1)), axis = 0)
         gmm.update_em(data, g)
 
-    #+++++++++++++++++++++++++++++++
-    # Learn the model with Online EM
-    #+++++++++++++++++++++++++++++++
+    #++++++++++++++++++++++++++++++++++++++++
+    # Approximate the models with online EM
+    #++++++++++++++++++++++++++++++++++++++++
     ogm     = GM(d, k, mode)
     ogmm    = OnGMM(ogm, 'kmean')
-    #init_data   = data[0:nframes / 20, :]
-    init_data   = data
+    init_data   = data[0:nframes / 20, :]
     ogmm.init(init_data)
 
-    ogm2    = GM(d, k, mode)
-    ogmm2   = OnGMM(ogm2, 'kmean')
-    #init_data   = data[0:nframes / 20, :]
-    init_data   = data
-    ogmm2.init(init_data)
-
-    ogm0    = copy.copy(ogm)
-    assert_array_equal(ogm0.w, gm0.w)
-    assert_array_equal(ogm0.mu, gm0.mu)
-    assert_array_equal(ogm0.va, gm0.va)
-
-    ogm02   = copy.copy(ogm2)
-    assert_array_equal(ogm02.w, gm0.w)
-    assert_array_equal(ogm02.mu, gm0.mu)
-    assert_array_equal(ogm02.va, gm0.va)
-
-    w0  = gm0.w.copy()
-    mu0 = gm0.mu.copy()
-    va0 = gm0.va.copy()
-
-    cx  = ogmm.cx
-    cxx = ogmm.cxx
-    
-    cw  = w0.copy()
-    cmu = mu0.copy()
-    cva = va0.copy()
-    
-    pw  = cw.copy()
-    pmu = cmu.copy()
-    pva = cva.copy()
-
     # Forgetting param
     ku		= 0.005
     t0		= 200
-    lamb	= N.ones((nframes, 1))
-    lamb[0] = 0
-    nu0		= 1.0
-    #lamb	= 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
-    #nu0		= 0.2
+    lamb	= 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
+    nu0		= 0.2
     nu		= N.zeros((len(lamb), 1))
     nu[0]	= nu0
     for i in range(1, len(lamb)):
         nu[i]	= 1./(1 + lamb[i] / nu[i-1])
 
-    gamma   = N.zeros((nframes, k))
-    agamma  = []
-    apw     = []
-    apmu    = []
-    apva    = []
-    print "========== Online Manual ==========="
-    for e in range(emiter):
-        print "online manual: estep %d, printing p* state " % e
-        apw.append(pw.copy())
-        apmu.append(pmu.copy())
-        apva.append(pva.copy())
-        for t in range(nframes):
-            gamma[t]    = multiple_gauss_den(data[t:t+1, :], pmu, pva)[0]
-            gamma[t]    *= pw
-            gamma[t]    /= N.sum(gamma[t])
-            # <1>(t) = cw(t), each column is one component cw = (cw1, ..., cwK);
-            # <x>(t) = cx(t)
-            cw	= (1 - nu[t]) * cw + nu[t] * gamma[t]
-            # loop through each component
-            for i in range(k):
-                cx[i]   = (1 - nu[t]) * cx[i] + nu[t] * data[t, :] * gamma[t, i]
-                cxx[i]  = (1 - nu[t]) * cxx[i] + nu[t] * data[t, :] ** 2 * gamma[t, i]
+    # object version of online EM
+    for t in range(nframes):
+        gamma   = ogmm.sufficient_statistics(data[t:t+1, :], nu[t])
+        ogmm.update_em(data[t, :], gamma, nu[t])
 
-                cmu[i]  = cx[i] / cw[i]
-                cva[i]  = cxx[i] / cw[i] - cmu[i] ** 2
+    ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva)
 
-            #pw  = cw.copy()
-            #pmu = cmu.copy()
-            #pva = cva.copy()
-        print "gamma[end]: " + str(gamma[-1])
-        pw  = cw.copy()
-        pmu = cmu.copy()
-        pva = cva.copy()
-        agamma.append(gamma.copy())
+    #+++++++++++++++
+    # Draw the model
+    #+++++++++++++++
+    print "drawing..."
+    import pylab as P
+    P.subplot(2, 1, 1)
 
-    gamma2  = N.zeros((nframes, k))
-    agamma2 = []
-    apw2    = []
-    apmu2   = []
-    apva2   = []
-    print "========== Online Automatic ==========="
-    for e in range(emiter):
-        print "online automatic: estep %d, printing p* state " % e
-        apw2.append(ogmm2.pw.copy())
-        apmu2.append(ogmm2.pmu.copy())
-        apva2.append(ogmm2.pva.copy())
-        for t in range(nframes):
-            gamma2[t]   = ogmm2.sufficient_statistics(data[t:t+1, :], nu[t])
-            #gamma2[t]   = multiple_gauss_den(data[t:t+1, :], ogmm2.pmu, ogmm2.pva)[0]
-            #gamma2[t]   *= ogmm2.pw
-            #gamma2[t]   /= N.sum(gamma2[t])
-            #try:
-            #    assert_array_equal(agamma, gamma2[t])
-            #except AssertionError:
-            #    print "error for estep %d, step %d" % (e, t)
-            #    print ogmm2.pw
-            #    print ogmm2.pmu
-            #    print ogmm2.pva
-            #    raise 
-            ogmm2.update_em(data[t, :], gamma2[t], nu[t])
-            #ogmm2.cw	= (1 - nu[t]) * ogmm2.cw + nu[t] * agamma
-            ## loop through each component
-            #for i in range(k):
-            #    ogmm2.cx[i]   = (1 - nu[t]) * ogmm2.cx[i] + nu[t] * data[t, :] * agamma[i]
-            #    ogmm2.cxx[i]  = (1 - nu[t]) * ogmm2.cxx[i] + nu[t] * data[t, :] ** 2 * agamma[i]
+    if not d == 1:
+        # Draw what is happening
+        P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
 
-            #    ogmm2.cmu[i]  = ogmm2.cx[i] / ogmm2.cw[i]
-            #    ogmm2.cva[i]  = ogmm2.cxx[i] / ogmm2.cw[i] - ogmm2.cmu[i] ** 2
+        h   = gm.plot()    
+        [i.set_color('g') for i in h]
+        h[0].set_label('true confidence ellipsoides')
 
-        print "gamma[end]: " + str(gamma2[-1])
-        agamma2.append(gamma2.copy())
-        ogmm2.pw  = ogmm2.cw.copy()
-        ogmm2.pmu = ogmm2.cmu.copy()
-        ogmm2.pva = ogmm2.cva.copy()
+        h   = gm0.plot()    
+        [i.set_color('k') for i in h]
+        h[0].set_label('initial confidence ellipsoides')
 
-    # #ogm.set_param(pw, pmu, pva)
-    # print "weights off vs on: \n" + str(lgm.w) + "\n " + str(cw)
-    # print "mean off vs on: \n" + str(lgm.mu) + "\n " + str(cmu)
-    # print "variances off vs on: \n" + str(lgm.va) + "\n " + str(cva)
-    # print "weights off vs on2: \n" + str(lgm.w) + "\n " + str(ogmm2.cw)
-    # print "mean off vs on2: \n" + str(lgm.mu) + "\n " + str(ogmm2.cmu)
-    # print "variances off vs on2: \n" + str(lgm.va) + "\n " + str(ogmm2.cva)
-    # assert_array_almost_equal(cw, lgm.w)
-    # assert_array_almost_equal(cmu, lgm.mu)
-    # assert_array_almost_equal(cva, lgm.va)
-    assert_array_equal(ogmm2.pw, pw)
-    assert_array_equal(ogmm2.pmu, pmu)
-    assert_array_equal(ogmm2.pva, pva)
-    assert_array_equal(agamma[0], agamma2[0])
-    #assert_array_almost_equal(ogmm2.cw, lgm.w)
-    #assert_array_almost_equal(ogmm2.cmu, lgm.mu)
-    #assert_array_almost_equal(ogmm2.cva, lgm.va)
-    # #+++++++++++++++
-    # # Draw the model
-    # #+++++++++++++++
-    # print "drawing..."
-    # import pylab as P
-    # P.subplot(2, 1, 1)
+        h   = lgm.plot()    
+        [i.set_color('r') for i in h]
+        h[0].set_label('confidence ellipsoides found by EM')
 
-    # if d == 1:
-    #     # Real confidence ellipses
-    #     h   = gm.plot1d()
-    #     [i.set_color('g') for i in h['pdf']]
-    #     h['pdf'][0].set_label('true pdf')
+        h   = ogmm.gm.plot()    
+        [i.set_color('m') for i in h]
+        h[0].set_label('confidence ellipsoides found by Online EM')
 
-    #     # Initial confidence ellipses as found by kmean
-    #     h0  = gm0.plot1d()
-    #     [i.set_color('k') for i in h0['pdf']]
-    #     h0['pdf'][0].set_label('initial pdf')
+        # P.legend(loc = 0)
+    else:
+        # Real confidence ellipses
+        h   = gm.plot1d()
+        [i.set_color('g') for i in h['pdf']]
+        h['pdf'][0].set_label('true pdf')
 
-    #     # Values found by EM
-    #     hl  = lgm.plot1d(fill = 1, level = 0.66)
-    #     [i.set_color('r') for i in hl['pdf']]
-    #     hl['pdf'][0].set_label('pdf found by EM')
+        # Initial confidence ellipses as found by kmean
+        h0  = gm0.plot1d()
+        [i.set_color('k') for i in h0['pdf']]
+        h0['pdf'][0].set_label('initial pdf')
 
-    #     # Values found by Online EM
-    #     ho  = ogm.plot1d(fill = 1, level = 0.66)
-    #     [i.set_color('b') for i in ho['pdf']]
-    #     ho['pdf'][0].set_label('pdf found by online EM')
+        # Values found by EM
+        hl  = lgm.plot1d(fill = 1, level = 0.66)
+        [i.set_color('r') for i in hl['pdf']]
+        hl['pdf'][0].set_label('pdf found by EM')
 
-    #     P.legend(loc = 0)
-    # else:
-    #     # Draw what is happening
-    #     P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
+        P.legend(loc = 0)
 
-    #     # Real confidence ellipses
-    #     Xre, Yre  = gm.conf_ellipses()
-    #     P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
-    #     for i in range(1,k):
-    #         P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+        # Values found by Online EM
+        hl  = ogmm.gm.plot1d(fill = 1, level = 0.66)
+        [i.set_color('m') for i in hl['pdf']]
+        hl['pdf'][0].set_label('pdf found by Online EM')
 
-    #     # Initial confidence ellipses as found by kmean
-    #     X0e, Y0e  = gm0.conf_ellipses()
-    #     P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
-    #     for i in range(1,k):
-    #         P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
+        P.legend(loc = 0)
 
-    #     # Values found by EM
-    #     Xe, Ye  = lgm.conf_ellipses()
-    #     P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
-    #     for i in range(1,k):
-    #         P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
-    #     P.legend(loc = 0)
-
-    #     # Values found by Online EM
-    #     Xe, Ye  = ogm.conf_ellipses()
-    #     P.plot(Xe[0], Ye[0], 'm', label = 'confidence ellipsoides found by Online EM')
-    #     for i in range(1,k):
-    #         P.plot(Xe[i], Ye[i], 'm', label = '_nolegend_')
-    #     P.legend(loc = 0)
-
-
-    # P.subplot(2, 1, 2)
-    # P.plot(like)
-    # P.title('log likelihood')
-
-    # P.show()
+    P.subplot(2, 1, 2)
+    P.plot(nu)
+    P.title('Learning rate')
+    P.show()

Modified: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/setup.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,5 +1,5 @@
 #! /usr/bin/env python
-# Last Change: Thu Oct 12 11:00 PM 2006 J
+# Last Change: Thu Oct 19 07:00 PM 2006 J
 # TODO:
 #   - check how to handle cmd line build options with distutils and use
 #   it in the building process
@@ -34,8 +34,7 @@
 if __name__ == "__main__":
     from numpy.distutils.core import setup
     #setup(**configuration(top_path='').todict())
-    setup(**configuration(top_path='',
-                          package_name='scipy.sandbox.pyem').todict())
+    setup(**configuration(top_path='',))
 # from distutils.core import setup, Extension
 # from pyem import version as pyem_version
 # 

Added: trunk/Lib/sandbox/pyem/tests/generate_test_data.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/generate_test_data.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/tests/generate_test_data.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -0,0 +1,53 @@
+# Last Change: Wed Oct 18 06:00 PM 2006 J
+
+import numpy as N
+import tables as T
+
+from numpy.random import seed
+
+from gmm_em import multiple_gauss_den
+from gauss_mix import GM
+from _c_densities import gauss_den
+
+filename    = 'test_mgden.h5';
+h5file      = T.openFile(filename, 'w')
+h5file.createGroup(h5file.root, 'hyperparams')
+h5file.createGroup(h5file.root, 'params')
+h5file.createGroup(h5file.root, 'data')
+
+d       = 1
+k       = 2
+type    = 'diag'
+nframes = int(1e3)
+
+h5file.createArray(h5file.root.hyperparams, 'dimension', d)
+h5file.createArray(h5file.root.hyperparams, 'type', type)
+h5file.createArray(h5file.root.hyperparams, 'nclusters', k)
+
+w, mu, va   = GM.gen_param(d, k, type)
+
+h5file.createArray(h5file.root.params, 'weights', w)
+h5file.createArray(h5file.root.params, 'means', mu)
+h5file.createArray(h5file.root.params, 'variances', va)
+
+gm      = GM.fromvalues(w, mu, va)
+# Sample nframes frames  from the model
+data    = gm.sample(nframes)
+
+h5file.createArray(h5file.root.data, 'data', data)
+
+w1, mu1, va1    = GM.gen_param(d, k, type)
+
+out     = multiple_gauss_den(data, mu1, va1)
+out1    = gauss_den(data, mu1[0, :], va1[0, :])
+
+h5file.createArray(h5file.root.params, 'w', w1)
+h5file.createArray(h5file.root.params, 'mu', mu1)
+h5file.createArray(h5file.root.params, 'va', va1)
+h5file.createArray(h5file.root.data, 'out', out)
+
+h5file.createArray(h5file.root.params, 'mu1', mu1[0,:])
+h5file.createArray(h5file.root.params, 'va1', va1[0,:])
+h5file.createArray(h5file.root.data, 'out1', out1)
+
+h5file.close()

Modified: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -1,6 +1,11 @@
 #! /usr/bin/env python
-# Last Change: Fri Sep 29 06:00 PM 2006 J
+# Last Change: Thu Oct 19 07:00 PM 2006 J
 
+# TODO:
+#   - having "fake tests" to check that all mode (scalar, diag and full) are
+#   executables
+#   - having a dataset to check against
+
 import sys
 from numpy.testing import *
 
@@ -8,8 +13,6 @@
 
 set_package_path()
 from pyem.densities import gauss_den
-from pyem._c_densities import gauss_den as c_gauss_den
-
 restore_path()
 
 #Optional:
@@ -17,7 +20,9 @@
 # import modules that are located in the same directory as this file.
 restore_path()
 
-class test_densities(NumpyTestCase):
+DEF_DEC = 12
+
+class TestDensities(NumpyTestCase):
     def _generate_test_data_1d(self):
         self.va     = 2.0
         self.mu     = 1.0
@@ -61,37 +66,44 @@
             0.00378789836599, 0.00015915297541, 
             0.00000253261067, 0.00000001526368])
 
-    def _check_py(self, level, decimal = 12):
+class test_py_implementation(TestDensities):
+    def _check(self, level, decimal = DEF_DEC):
         Y   = gauss_den(self.X, self.mu, self.va)
         assert_array_almost_equal(Y, self.Yt, decimal)
 
-    def _check_c(self, level, decimal = 12):
-        Y   = c_gauss_den(self.X, self.mu, self.va)
-        assert_array_almost_equal(Y, self.Yt, decimal)
+    def check_2d_diag(self, level = 0):
+        self._generate_test_data_2d_diag()
+        self._check(level)
 
-    def check_py_1d(self, level = 1):
+    def check_2d_full(self, level = 0):
+        self._generate_test_data_2d_full()
+        self._check(level)
+    
+    def check_py_1d(self, level = 0):
         self._generate_test_data_1d()
-        self._check_py(level)
+        self._check(level)
 
-    def check_py_2d_diag(self, level = 1):
-        self._generate_test_data_2d_diag()
-        self._check_py(level)
+class test_c_implementation(TestDensities):
+    def _check(self, level, decimal = DEF_DEC):
+        try:
+            from pyem._c_densities import gauss_den as c_gauss_den
+            Y   = c_gauss_den(self.X, self.mu, self.va)
+            assert_array_almost_equal(Y, self.Yt, decimal)
+        except ImportError, inst:
+            print "Error while importing C implementation, not tested"
+            print " -> (Import error was %s)" % inst 
 
-    def check_py_2d_full(self, level = 1):
-        self._generate_test_data_2d_full()
-        self._check_py(level)
-
-    def check_c_1d(self, level = 1):
+    def check_1d(self, level = 0):
         self._generate_test_data_1d()
-        self._check_c(level)
+        self._check(level)
 
-    def check_c_2d_diag(self, level = 1):
+    def check_2d_diag(self, level = 0):
         self._generate_test_data_2d_diag()
-        self._check_c(level)
+        self._check(level)
 
-    def check_c_2d_full(self, level = 1):
+    def check_2d_full(self, level = 0):
         self._generate_test_data_2d_full()
-        self._check_c(level)
+        self._check(level)
 
 if __name__ == "__main__":
     NumpyTest().run()

Added: trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -0,0 +1,11 @@
+#! /usr/bin/env python
+# Last Change: Fri Oct 20 11:00 AM 2006 J
+
+import sys
+from numpy.testing import *
+
+import numpy as N
+
+set_package_path()
+from pyem.densities import gauss_den
+restore_path()

Added: trunk/Lib/sandbox/pyem/tests/test_online_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_online_em.py	2006-10-18 19:27:50 UTC (rev 2290)
+++ trunk/Lib/sandbox/pyem/tests/test_online_em.py	2006-10-20 04:01:05 UTC (rev 2291)
@@ -0,0 +1,188 @@
+#! /usr/bin/env python
+# Last Change: Fri Oct 20 12:00 PM 2006 J
+
+import copy
+
+import sys
+from numpy.testing import *
+
+import numpy as N
+from numpy.random import seed
+
+set_package_path()
+from pyem import GM, GMM
+from pyem.online_em import OnGMM
+restore_path()
+
+# #Optional:
+# set_local_path()
+# # import modules that are located in the same directory as this file.
+# restore_path()
+
+# Error precision allowed (nb of decimals)
+AR_AS_PREC  = 12
+
+class OnlineEmTest(NumpyTestCase):
+    def _create_model(self, d, k, mode, nframes, emiter):
+        #+++++++++++++++++++++++++++++++++++++++++++++++++
+        # Generate a model with k components, d dimensions
+        #+++++++++++++++++++++++++++++++++++++++++++++++++
+        w, mu, va   = GM.gen_param(d, k, mode, spread = 1.5)
+        gm          = GM.fromvalues(w, mu, va)
+        # Sample nframes frames  from the model
+        data        = gm.sample(nframes)
+
+        #++++++++++++++++++++++++++++++++++++++++++
+        # Approximate the models with classical EM
+        #++++++++++++++++++++++++++++++++++++++++++
+        # Init the model
+        lgm = GM(d, k, mode)
+        gmm = GMM(lgm, 'kmean')
+        gmm.init(data)
+
+        self.gm0    = copy.copy(gmm.gm)
+        # The actual EM, with likelihood computation
+        for i in range(emiter):
+            g, tgd  = gmm.sufficient_statistics(data)
+            gmm.update_em(data, g)
+
+        self.data   = data
+        self.gm     = lgm
+    
+class test_on_off_eq(OnlineEmTest):
+    def check_1d(self, level = 1):
+        d       = 1
+        k       = 2
+        mode    = 'diag'
+        nframes = int(1e2)
+        emiter  = 3
+
+        seed(1)
+        self._create_model(d, k, mode, nframes, emiter)
+        self._check(d, k, mode, nframes, emiter)
+
+    def check_2d(self, level = 2):
+        d       = 2
+        k       = 2
+        mode    = 'diag'
+        nframes = int(1e2)
+        emiter  = 3
+
+        seed(1)
+        self._create_model(d, k, mode, nframes, emiter)
+        self._check(d, k, mode, nframes, emiter)
+
+    def check_5d(self, level = 2):
+        d       = 5
+        k       = 2
+        mode    = 'diag'
+        nframes = int(1e2)
+        emiter  = 3
+
+        seed(1)
+        self._create_model(d, k, mode, nframes, emiter)
+        self._check(d, k, mode, nframes, emiter)
+
+    def _check(self, d, k, mode, nframes, emiter):
+        #++++++++++++++++++++++++++++++++++++++++
+        # Approximate the models with online EM
+        #++++++++++++++++++++++++++++++++++++++++
+        # Learn the model with Online EM
+        ogm         = GM(d, k, mode)
+        ogmm        = OnGMM(ogm, 'kmean')
+        init_data   = self.data
+        ogmm.init(init_data)
+
+        # Check that online kmean init is the same than kmean offline init
+        ogm0    = copy.copy(ogm)
+        assert_array_equal(ogm0.w, self.gm0.w)
+        assert_array_equal(ogm0.mu, self.gm0.mu)
+        assert_array_equal(ogm0.va, self.gm0.va)
+
+        # Forgetting param
+        lamb	= N.ones((nframes, 1))
+        lamb[0] = 0
+        nu0		= 1.0
+        nu		= N.zeros((len(lamb), 1))
+        nu[0]	= nu0
+        for i in range(1, len(lamb)):
+            nu[i]	= 1./(1 + lamb[i] / nu[i-1])
+
+        # object version of online EM: the p* arguments are updated only at each 
+        # epoch, which is equivalent to on full EM iteration on the 
+        # classic EM algorithm
+        ogmm.pw    = ogmm.cw.copy()
+        ogmm.pmu   = ogmm.cmu.copy()
+        ogmm.pva   = ogmm.cva.copy()
+        for e in range(emiter):
+            for t in range(nframes):
+                gamma   = ogmm.sufficient_statistics(self.data[t:t+1, :], nu[t])
+                ogmm.update_em(self.data[t, :], gamma, nu[t])
+
+            # Change pw args only a each epoch 
+            ogmm.pw  = ogmm.cw.copy()
+            ogmm.pmu = ogmm.cmu.copy()
+            ogmm.pva = ogmm.cva.copy()
+
+        # For equivalence between off and on, we allow a margin of error,
+        # because of round-off errors.
+        print " Checking precision of equivalence with offline EM trainer "
+        maxtestprec = 18
+        try :
+            for i in range(maxtestprec):
+                    assert_array_almost_equal(self.gm.w, ogmm.pw, decimal = i)
+                    assert_array_almost_equal(self.gm.mu, ogmm.pmu, decimal = i)
+                    assert_array_almost_equal(self.gm.va, ogmm.pva, decimal = i)
+            print "\t !! Precision up to %d decimals !! " % i
+        except AssertionError:
+            if i < AR_AS_PREC:
+                print """\t !!NOT OK: Precision up to %d decimals only, 
+                    outside the allowed range (%d) !! """ % (i, AR_AS_PREC)
+                raise AssertionError
+            else:
+                print "\t !!OK: Precision up to %d decimals !! " % i
+
+class test_on(OnlineEmTest):
+    def check_consistency(self):
+        d       = 1
+        k       = 2
+        mode    = 'diag'
+        nframes = int(1e3)
+        emiter  = 4
+
+        self._create_model(d, k, mode, nframes, emiter)
+        self._run_pure_online(d, k, mode, nframes)
+    
+    def _run_pure_online(self, d, k, mode, nframes):
+        #++++++++++++++++++++++++++++++++++++++++
+        # Approximate the models with online EM
+        #++++++++++++++++++++++++++++++++++++++++
+        ogm     = GM(d, k, mode)
+        ogmm    = OnGMM(ogm, 'kmean')
+        init_data   = self.data[0:nframes / 20, :]
+        ogmm.init(init_data)
+
+        # Forgetting param
+        ku		= 0.005
+        t0		= 200
+        lamb	= 1 - 1/(N.arange(-1, nframes-1) * ku + t0)
+        nu0		= 0.2
+        nu		= N.zeros((len(lamb), 1))
+        nu[0]	= nu0
+        for i in range(1, len(lamb)):
+            nu[i]	= 1./(1 + lamb[i] / nu[i-1])
+
+        # object version of online EM
+        for t in range(nframes):
+            # the assert are here to check we do not create copies
+            # unvoluntary for parameters
+            assert ogmm.pw is ogmm.cw
+            assert ogmm.pmu is ogmm.cmu
+            assert ogmm.pva is ogmm.cva
+            gamma   = ogmm.sufficient_statistics(self.data[t:t+1, :], nu[t])
+            ogmm.update_em(self.data[t, :], gamma, nu[t])
+
+        ogmm.gm.set_param(ogmm.cw, ogmm.cmu, ogmm.cva)
+
+if __name__ == "__main__":
+    NumpyTest().run()



More information about the Scipy-svn mailing list