[Scipy-svn] r2271 - in trunk/Lib/sandbox/pyem: . pyem

scipy-svn at scipy.org scipy-svn at scipy.org
Thu Oct 12 08:46:05 CDT 2006


Author: cdavid
Date: 2006-10-12 08:45:58 -0500 (Thu, 12 Oct 2006)
New Revision: 2271

Added:
   trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
Modified:
   trunk/Lib/sandbox/pyem/Changelog
   trunk/Lib/sandbox/pyem/pyem/__init__.py
   trunk/Lib/sandbox/pyem/pyem/gmm_em.py
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20060714072645-a3b4e773c4c589cf]
Refactoring of EM into classes
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-07-14 16:26:45 +0900 (Fri, 14 Jul 2006)

Modified: trunk/Lib/sandbox/pyem/Changelog
===================================================================
--- trunk/Lib/sandbox/pyem/Changelog	2006-10-12 13:45:51 UTC (rev 2270)
+++ trunk/Lib/sandbox/pyem/Changelog	2006-10-12 13:45:58 UTC (rev 2271)
@@ -1,3 +1,13 @@
+pyem (0.4) Fri, 14 Jul 2006 16:24:05 +0900
+
+	* put version to 0.4
+	* Heavy refactoring of EM and GMM into classes (see below)
+	* add a module gauss_mix, which implements Gaussian Mixtures.
+	* GMM is now a class, which is initialized using a Gaussian Mixture.
+	a GMM can be trained.
+
+-- David Cournapeau <david at ar.media.kyoto-u.ac.jp> 
+
 pyem (0.3) Thu, 13 Jul 2006 19:48:54 +0900
 
 	* put version to 0.3
@@ -4,6 +14,6 @@
 	* Refactoring kmean code into new module. 
 	* Refactoring tests code into test module. 
 	* Replace matrixmultiply and outerproduct calls by dot to use fast BLAS if
-	available.
+	available. Not everything is done yet
 
 -- David Cournapeau <david at ar.media.kyoto-u.ac.jp> 

Modified: trunk/Lib/sandbox/pyem/pyem/__init__.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/__init__.py	2006-10-12 13:45:51 UTC (rev 2270)
+++ trunk/Lib/sandbox/pyem/pyem/__init__.py	2006-10-12 13:45:58 UTC (rev 2271)
@@ -1,4 +1,4 @@
 #! /usr/bin/env python
-# Last Change: Thu Jul 13 07:00 PM 2006 J
+# Last Change: Fri Jul 14 04:00 PM 2006 J
 
-version = '0.3'
+version = '0.4'

Added: trunk/Lib/sandbox/pyem/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gauss_mix.py	2006-10-12 13:45:51 UTC (rev 2270)
+++ trunk/Lib/sandbox/pyem/pyem/gauss_mix.py	2006-10-12 13:45:58 UTC (rev 2271)
@@ -0,0 +1,292 @@
+# /usr/bin/python
+# Last Change: Fri Jul 14 03:00 PM 2006 J
+
+# Module to implement GaussianMixture class.
+
+import numpy as N
+import numpy.linalg as lin
+import densities
+
+MAX_DEV = 1e-10
+
+# Right now, two main usages of a Gaussian Model are possible
+#   - init a Gaussian Model with meta-parameters, and trains it
+#   - set-up a Gaussian Model to sample it, draw ellipsoides 
+#   of confidences. In this case, we would like to init it with
+#   known values of parameters.
+#
+#   For now, we have to init with meta-parameters, and set 
+#   the parameters afterward. There should be a better way ?
+class GmParamError:
+    """Exception raised for errors in gmm params
+
+    Attributes:
+        expression -- input expression in which the error occurred
+        message -- explanation of the error
+    """
+    def __init__(self, message):
+        self.message    = message
+    
+    def __str__(self):
+        return self.message
+
+class GM:
+    """Gaussian Mixture class. This is a simple container class
+    to hold Gaussian Mixture parameters (weights, mean, etc...).
+    It can also draw itself (confidence ellipses) and samples itself.
+
+    Is initiated by giving dimension, number of components and 
+    covariance mode"""
+
+    # I am not sure it is useful to have a spherical mode...
+    _cov_mod    = ['diag', 'full']
+
+    def __init__(self, d, k, mode = 'diag'):
+        """Init a Gaussian model of k components, each component being a 
+        d multi-variate Gaussian, with covariance matrix of style mode"""
+        if mode not in self._cov_mod:
+            raise GmmParamError("mode %s not recognized" + str(mode))
+
+        self.d      = d
+        self.k      = k
+        self.mode   = mode
+
+        # Init to 0 all parameters, with the right dimensions.
+        # Not sure this is useful in python from an efficiency POV ?
+        self.w   = N.zeros(k, float)
+        self.mu  = N.zeros((k, d), float)
+        if mode == 'diag':
+            self.va  = N.zeros((k, d), float)
+        elif mode == 'full':
+            self.va  = N.zeros((k * d, d), float)
+
+        self.is_valid   = False
+
+    def set_param(self, weights, mu, sigma):
+        """Set parameters of the model"""
+        k, d, mode  = check_gmm_param(weights, mu, sigma)
+        if not k == self.k:
+            raise GmmParamError("Number of given components is %d, expected %d" 
+                    % (shape(k), shape(self.k)))
+        if not d == self.d:
+            raise GmmParamError("Dimension of the given model is %d, expected %d" 
+                    % (shape(d), shape(self.d)))
+        if not mode == self.mode:
+            raise GmmParamError("Given covariance mode is %s, expected %d"
+                    % (mode, self.mode))
+        self.w  = weights
+        self.mu = mu
+        self.va = sigma
+
+        self.is_valid   = True
+
+    def sample(self, nframes):
+        """ Sample nframes frames from the model """
+        if not self.is_valid:
+            raise GmmParamError("""Parameters of the model has not been 
+                set yet, please set them using self.set_param()""")
+
+        # State index (ie hidden var)
+        S   = gen_rand_index(self.w, nframes)
+        # standard gaussian
+        X   = N.randn(nframes, self.d)        
+
+        if self.mode == 'diag':
+            X   = self.mu[S, :]  + X * N.sqrt(self.va[S,:])
+        elif self.mode == 'full':
+            # Faster:
+            cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]), float)
+            for i in range(self.k):
+                # Using cholesky looks more stable than sqrtm; sqrtm is not
+                # available in numpy anyway, only in scipy...
+                cho[i]  = lin.cholesky(self.va[i*self.d:i*self.d+self.d,:])
+
+            for s in range(self.k):
+                tmpind      = N.where(S == s)[0]
+                X[tmpind]   = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
+        else:
+            raise GmmParamError('cov matrix mode not recognized, this is a bug !')
+
+        return X
+
+    def conf_ellipses(self, c = [0, 1], npoints = 100):
+        """Returns a list of confidence ellipsoids describing the Gmm
+        defined by mu and va. c is the dimension we are projecting
+        the variances on a 2d space. For now, the confidence level
+        is fixed to 0.39.
+        
+        Returns:
+            -Xe:    a list of x coordinates for the ellipses
+            -Ye:    a list of y coordinates for the ellipses
+
+        Example:
+            Suppose we have w, mu and va as parameters for a mixture, then:
+            
+            gm      = GM(d, k)
+            gm.set_param(w, mu, va)
+            X       = gm.sample(1000)
+            Xe, Ye  = gm.conf_ellipsoids()
+            pylab.plot(X[:,0], X[:, 1], '.')
+            for k in len(w):
+                pylab.plot(Xe[k], Ye[k], 'r')
+                
+            Will plot samples X draw from the mixture model, and
+            plot the ellipses of equi-probability from the mean with
+            fixed level of confidence 0.39.  """
+        # TODO: adjustable level (to do in gauss_ell). 
+        # For now, a level of 0.39 means that we draw
+        # ellipses for 1 standard deviation. 
+        Xe  = []
+        Ye  = []   
+        if self.mode == 'diag':
+            for i in range(self.k):
+                xe, ye  = densities.gauss_ell(self.mu[i,:], self.va[i,:], dim = c, 
+                        npoints = npoints)
+                Xe.append(xe)
+                Ye.append(ye)
+        elif self.mode == 'full':
+            for i in range(self.k):
+                xe, ye  = densities.gauss_ell(self.mu[i,:], 
+                        self.va[i*self.d:i*self.d+self.d,:], dim = c, 
+                        npoints = npoints)
+                Xe.append(xe)
+                Ye.append(ye)
+
+        return Xe, Ye
+    
+    def gen_param(self, d, nc, varmode = 'diag', spread = 1):
+        """Generate valid parameters for a gaussian mixture model.
+        d is the dimension, nc the number of components, and varmode
+        the mode for cov matrices.
+
+        This is a class method.
+
+        Returns: w, mu, va
+        """
+        w   = abs(N.randn(nc))
+        w   = w / sum(w)
+
+        mu  = spread * N.randn(nc, d)
+        if varmode == 'diag':
+            va  = abs(N.randn(nc, d))
+        elif varmode == 'full':
+            va  = N.randn(nc * d, d)
+            for k in range(nc):
+                va[k*d:k*d+d]   = N.dot( va[k*d:k*d+d], 
+                    va[k*d:k*d+d].transpose())
+        else:
+            raise GmmParamError('cov matrix mode not recognized')
+
+        return w, mu, va
+
+    gen_param = classmethod(gen_param)
+
+
+# Function to generate a random index: this is kept outside any class,
+# as the function can be useful for other
+def gen_rand_index(p, n):
+    """Generate a N samples vector containing random index between 1 
+    and length(p), each index i with probability p(i)"""
+    # TODO Check args here
+    
+    # TODO: check each value of inverse distribution is
+    # different
+    invcdf  = N.cumsum(p)
+    uni     = N.rand(n)
+    index   = N.zeros(n)
+
+    # This one should be a bit faster
+    for k in range(len(p)-1, 0, -1):
+        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
+                    uni < invcdf[k]))
+        index[blop] = k
+        
+    return index
+
+def check_gmm_param(w, mu, va):
+    """Check that w, mu and va are valid parameters for
+    a mixture of gaussian: w should sum to 1, there should
+    be the same number of component in each param, the variances
+    should be positive definite, etc... 
+    
+    Params:
+        w   = vector or list of weigths of the mixture (K elements)
+        mu  = matrix: K * d
+        va  = list of variances (vector K * d or square matrices Kd * d)
+
+    returns:
+        K   = number of components
+        d   = dimension
+        mode    = 'diag' if diagonal covariance, 'full' if full matrices
+    """
+        
+    # Check that w is valid
+    if N.fabs(N.sum(w)  - 1) > MAX_DEV:
+        raise GmmParamError('weight does not sum to 1')
+    
+    if not len(w.shape) == 1:
+        raise GmmParamError('weight is not a vector')
+
+    # Check that mean and va have the same number of components
+    K           = len(w)
+
+    if N.ndim(mu) < 2:
+        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
+        raise GmmParamError(msg)
+    if N.ndim(va) < 2:
+        msg = """va should be a K,d / K *d, d matrix, and a row vector if
+        only 1 diag comp"""
+        raise GmmParamError(msg)
+
+    (Km, d)     = mu.shape
+    (Ka, da)    = va.shape
+
+    if not K == Km:
+        msg = "not same number of component in mean and weights"
+        raise GmmParamError(msg)
+
+    if not d == da:
+        msg = "not same number of dimensions in mean and variances"
+        raise GmmParamError(msg)
+
+    if Km == Ka:
+        mode = 'diag'
+    else:
+        mode = 'full'
+        if not Ka == Km*d:
+            msg = "not same number of dimensions in mean and variances"
+            raise GmmParamError(msg)
+        
+    return K, d, mode
+        
+if __name__ == '__main__':
+    # Meta parameters:
+    #   - k = number of components
+    #   - d = dimension
+    #   - mode : mode of covariance matrices
+    d       = 5
+    k       = 5
+    mode    = 'full'
+    nframes = 1e3
+
+    # Build a model with random parameters
+    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
+    gm          = GM(d, k, mode)
+    gm.set_param(w, mu, va)
+
+    # Sample nframes frames  from the model
+    X   = gm.sample(nframes)
+
+    # Plot
+    import pylab as P
+
+    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+
+    # Real confidence ellipses with level 0.39
+    Xre, Yre  = gm.conf_ellipses()
+    P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
+    for i in range(1,k):
+        P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
+
+    P.legend(loc = 0)
+    P.show()

Modified: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:45:51 UTC (rev 2270)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:45:58 UTC (rev 2271)
@@ -1,19 +1,18 @@
 # /usr/bin/python
-# Last Change: Thu Jul 13 07:00 PM 2006 J
+# Last Change: Fri Jul 14 04:00 PM 2006 J
 
 import numpy as N
 import numpy.linalg as lin
 import densities
 from kmean import kmean
+from gauss_mix import GM
 
-MAX_DEV = 1e-10
-
 # Error classes
 class GmmError(Exception):
     """Base class for exceptions in this module."""
     pass
 
-class GmmParamError(GmmError):
+class GmmParamError:
     """Exception raised for errors in gmm params
 
     Attributes:
@@ -26,139 +25,202 @@
     def __str__(self):
         return self.message
 
-# Function to generate a GMM, or valid parameters for GMM
-def gen_rand_index(p, n):
-    """Generate a N samples vector containing random index between 1 
-    and length(p), each index i with probability p(i)"""
-    # TODO Check args here
+# Not sure yet about how to design different mixture models. Most of the code 
+# is different (pdf, update part of EM, etc...) and I am not sure it makes 
+# sense to use inheritance for interface specification in python, since it 
+# has a dynamic type system.
+
+# Anyway, a mixture class should encapsulates all details concerning a mixture model:
+#   - internal parameters for the pdfs
+#   - can compute sufficient statistics for EM
+#   - can sample a model
+#   - can generate random valid parameters for a new pdf (using class method)
+class MixtureModel:
+    pass
+
+class ExpMixtureModel(MixtureModel):
+    """Class to model mixture of exponential pdf (eg Gaussian, exponential, Laplace, 
+    etc..). This is a special case because some parts of EM are common for those
+    models..."""
+    pass
+
+class GMM(ExpMixtureModel):
+    """ A class to model a Gaussian Mixture Model (GMM). An instance of 
+    this class is created by giving weights, mean and variances in the ctor.
+    An instantiated object can be sampled, trained by EM. 
     
-    # TODO: check each value of inverse distribution is
-    # different
-    invcdf  = N.cumsum(p)
-    uni     = N.rand(n)
-    index   = N.zeros(n)
+    The class method gen_model can be used without instantiation."""
 
-    # This one should be a bit faster
-    for k in range(len(p)-1, 0, -1):
-        blop        = N.where(N.logical_and(invcdf[k-1] <= uni, 
-                    uni < invcdf[k]))
-        index[blop] = k
+    def init_kmean(self, data, niter = 5):
+        """ Init the model with kmean."""
+        k       = self.gm.k
+        d       = self.gm.d
+        init    = data[0:k, :]
+
+        (code, label)   = kmean(data, init, niter)
+
+        w   = N.ones(k, float) / k
+        mu  = code.copy()
+        if self.gm.mode == 'diag':
+            va = N.zeros((k, d), float)
+            for i in range(k):
+                for j in range(d):
+                    va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
+        elif self.gm.mode == 'full':
+            va  = N.zeros((k*d, d), float)
+            for i in range(k):
+                va[i*d:i*d+d,:] = \
+                    N.cov(data[N.where(label==i)], rowvar = 0)
+        else:
+            raise GmmParamError("mode " + str(mode) + " not recognized")
+
+        self.gm.w   = w
+        self.gm.mu  = mu
+        self.gm.va  = va
+
+    def init_random(self, data):
+        """ Init the model at random."""
+        k   = self.gm.k
+        d   = self.gm.d
+        if mode == 'diag':
+            w   = N.ones(k, float) / k
+            mu  = N.randn(k, d)
+            va  = N.fabs(N.randn(k, d))
+        else:
+            raise GmmParamError("""init_random not implemented for
+                    mode %s yet""", mode)
+
+        self.gm.w   = w
+        self.gm.mu  = mu
+        self.gm.va  = va
+
+    # Possible init methods
+    _init_methods = {'kmean': init_kmean, 'random' : init_random}
+    # TODO: 
+    #   - format of parameters ? For variances, list of variances matrix,
+    #   keep the current format, have 3d matrices ?
+    #   - To handle the different modes, we could do something "fancy" such as
+    #   replacing methods, to avoid checking cases everywhere and inconsistency.
+    def __init__(self, gm, init = 'kmean'):
+        """ Initialize a GMM with weight w, mean mu and variances va, and initialization
+        method for training init (kmean by default)"""
+        self.gm = gm
+
+        if init not in self._init_methods:
+            raise GmmParamError('init method %s not recognized' + str(init))
+
+        GMM.init   = self._init_methods[init]
+
+    def sufficient_statistics(self, data):
+        """ Return normalized and non-normalized sufficient statistics
+        from the model.
         
-    return index
+        Computes the latent variable distribution (a 
+        posteriori probability) knowing the explicit data 
+        for the Gaussian model (w, mu, var): gamma(t, i) = 
+            P[state = i | observation = data(t); w, mu, va]
 
-def gen_gmm(w, mu, va, n):
-    """Generate a gaussiam mixture model with weights w, 
-    mean mu and variances va. Each column of the parameters
-    are one component parameter.
-    """
-    # Check args
-    K, d, mode  = check_gmm_param(w, mu, va)
+        This is basically the E step of EM for GMM."""
+        n   = data.shape[0]
 
-    # Generate the mixture
-    S   = gen_rand_index(w, n)  # State index (ie hidden var)
-    X   = N.randn(n, d)         # standard gaussian
+        # compute the gaussian pdf
+        tgd	= multiple_gauss_den(data, self.gm.mu, self.gm.va)
+        # multiply by the weight
+        tgd	*= self.gm.w
+        # Normalize to get a pdf
+        gd	= tgd  / N.sum(tgd, axis=1)[:, N.NewAxis]
 
-    if mode == 'diag':
-        X   = mu[S, :]  + X * N.sqrt(va[S,:])
-    elif mode == 'full':
-        # Faster:
-        cho = N.zeros((K, va.shape[1], va.shape[1]), float)
-        for k in range(K):
-            # Using cholesky is more stable than sqrtm; sqrtm is not
-            # available in numpy anyway, only in scipy...
-            cho[k]  = lin.cholesky(va[k*d:k*d+d,:])
+        return gd, tgd
 
-        for s in range(K):
-            tmpind      = N.where(S == s)[0]
-            X[tmpind]   = N.matrixmultiply(X[tmpind], cho[s].transpose()) + mu[s]
-    else:
-        raise GmmParamError('cov matrix mode not recognized')
+    def update_em(self, data, gamma):
+        """Computes update of the Gaussian Mixture Model (M step)
+        from the a posteriori pdf, computed by gmm_posterior
+        (E step).
+        """
+        k       = self.gm.k
+        d       = self.gm.d
+        n       = data.shape[0]
+        invn    = 1.0/n
+        mGamma  = N.sum(gamma)
 
-    return X
+        if self.gm.mode == 'diag':
+            mu  = N.zeros((k, d), float)
+            va  = N.zeros((k, d), float)
+            gamma   = gamma.transpose()
+            for c in range(k):
+                # x       = N.sum(N.outerproduct(gamma[:, k], 
+                #             N.ones((1, d))) * data)
+                # xx      = N.sum(N.outerproduct(gamma[:, k], 
+                #             N.ones((1, d))) * (data ** 2))
+                x   = N.dot(gamma[c:c+1,:], data)[0,:]
+                xx  = N.dot(gamma[c:c+1,:], data ** 2)[0,:]
 
-def gen_gmm_param(d, K, varmode = 'diag', spread = 1):
-    """Generate valid parameters for a gaussian mixture model.
-    d is the dimension, K the number of components, and varmode
-    the mode for cov matrices.
+                mu[c,:] = x / mGamma[c]
+                va[c,:] = xx  / mGamma[c] - mu[c,:] ** 2
+            w   = invn * mGamma
 
-    Returns:    
-        - w
-        - mu
-        - va
-    """
-    w   = abs(N.randn(K))
-    w   = w / sum(w)
+        elif self.gm.mode == 'full':
+            mu  = N.zeros((k, d), float)
+            va  = N.zeros((k*d, d), float)
 
-    mu  = spread * N.randn(K, d)
-    if varmode == 'diag':
-        va  = abs(N.randn(K, d))
-    elif varmode == 'full':
-        va  = N.randn(K * d, d)
-        for k in range(K):
-            va[k*d:k*d+d]   = N.matrixmultiply( va[k*d:k*d+d], 
-                va[k*d:k*d+d].transpose())
-    else:
-        raise GmmParamError('cov matrix mode not recognized')
+            for c in range(k):
+                x   = N.sum(N.outerproduct(gamma[:, c], 
+                            N.ones((1, d), float)) * data)
+                xx  = N.zeros((d, d), float)
+                
+                # This should be much faster than recursing on n...
+                for i in range(d):
+                    for j in range(d):
+                        xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,c])
 
-    return w, mu, va
+                mu[c,:] = x / mGamma[c]
+                va[c*d:c*d+d,:] = xx  / mGamma[c] - \
+                                    N.outerproduct(mu[c,:], mu[c,:])
+            w   = invn * mGamma
+        else:
+            raise GmmParamError("varmode not recognized")
 
-def check_gmm_param(w, mu, va):
-    """Check that w, mu and va are valid parameters for
-    a mixture of gaussian: w should sum to 1, there should
-    be the same number of component in each param, the variances
-    should be positive definite, etc... 
-    
-    Params:
-        w   = vector or list of weigths of the mixture (K elements)
-        mu  = matrix: K * d
-        va  = list of variances (vector K * d or square matrices Kd * d)
+        self.gm.set_param(w, mu, va)
 
-    returns:
-        K   = number of components
-        d   = dimension
-        mode    = 'diag' if diagonal covariance, 'full' of full matrices
-    """
-        
-    # Check that w is valid
-    if N.fabs(N.sum(w)  - 1) > MAX_DEV:
-        raise GmmParamError('weight does not sum to 1')
+class EM:
+    """An EM trainer. An EM trainer
+    trains from data, with a model
     
-    if not len(w.shape) == 1:
-        raise GmmParamError('weight is not a vector')
+    Not really useful yet"""
+    def __init__(self):
+        pass
 
-    # Check that mean and va have the same number of components
-    K           = len(w)
+    def train(self, data, model, niter = 10):
+        """
+        Train a model using data, with niter iterations.
 
-    if N.ndim(mu) < 2:
-        msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
-        raise GmmParamError(msg)
-    if N.ndim(va) < 2:
-        msg = """va should be a K,d / K *d, d matrix, and a row vector if
-        only 1 diag comp"""
-        raise GmmParamError(msg)
+        Args:
+            - data:     contains the observed features, one row is one frame, ie one 
+            observation of dimension d
+            - model:    object of class Mixture
+            - niter:    number of iterations
 
-    (Km, d)     = mu.shape
-    (Ka, da)    = va.shape
+        The model is trained, and its parameters updated accordingly.
 
-    if not K == Km:
-        msg = "not same number of component in mean and weights"
-        raise GmmParamError(msg)
+        Returns:
+            likelihood (one value per iteration).
+        """
 
-    if not d == da:
-        msg = "not same number of dimensions in mean and variances"
-        raise GmmParamError(msg)
+        # Initialize the data (may do nothing depending on the model)
+        model.init(data)
 
-    if Km == Ka:
-        mode = 'diag'
-    else:
-        mode = 'full'
-        if not Ka == Km*d:
-            msg = "not same number of dimensions in mean and variances"
-            raise GmmParamError(msg)
-        
-    return K, d, mode
-        
-# For EM on GMM
+        # Likelihood is kept
+        like    = N.zeros(niter, float)
+
+        # Em computation, with computation of the likelihood
+        for i in range(niter):
+            g, tgd      = model.sufficient_statistics(data)
+            like[i]     = N.sum(N.log(N.sum(tgd, 1)))
+            model.update_em(data, g)
+
+        return like
+    
+# Misc functions
 def multiple_gauss_den(data, mu, va):
     """Helper function to generate several Gaussian
     pdf (different parameters) from the same data"""
@@ -180,279 +242,99 @@
 
     return y
 
-def gmm_init_kmean(data, k, mode, init = [], niter = 10):
-    """gmm_init_kmean(data, k, mode, init = [], niter = 10)
-    
-    Init EM for GMM with kmean from data, for k components. 
-    
-    Args:
-        - data:     Each row of data is one frame of dimension d. 
-        - k:        Number of components to look for
-        - mode:     Diagonal or Full covariance matrices
-        - init:     The initial centroids. The value given for k is
-        ignored, and the number of row in initc is used instead. 
-        If initc is not given, then the centroids are initialized 
-        with the k first values of data.
-        - niter:    Number of iterations in kmean.
-    
-    Returns:
-        (w, mu, va), initial parameters for a GMM.
-
-    Method:
-        Each weight is equiprobable, each mean is one centroid returned by kmean, and
-    covariances for component i is initialized with covariance of 
-    data corresponding with label i. Other strategies are possible, this one
-    is an easy one"""
-    if len(init) == 0:
-        init   = data[0:k, :]
-    else:
-        k       = initc.shape[0]
-
-    if data.ndim == 1:
-        d   = 1
-    else:
-        d   = N.shape(data)[1]
-
-    (code, label)   = kmean(data, init, niter)
-
-    w   = N.ones(k, float) / k
-    mu  = code.copy()
-    if mode == 'diag':
-        va  = N.zeros((k, d), float)
-        for i in range(k):
-            for j in range(d):
-                va[i,j] = N.cov(data[N.where(label==i), j], rowvar = 0)
-    elif mode == 'full':
-        va  = N.zeros((k*d, d), float)
-        for i in range(k):
-            va[i*d:i*d+d,:] = N.cov(data[N.where(label==i)], rowvar = 0)
-    else:
-        raise GmmParamError("mode " + str(mode) + " not recognized")
-
-    return w, mu, va
-
-# This function is just calling gmm_update and gmm_posterior, with
-# initialization. This is ugly, and we should have a class to model a GMM
-# instead of all this code to try to guess correct values and parameters...
-def gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
-    """
-    gmm_em(data, niter = 10, k = 2, mode = 'diag', w = [], mu = [], va = []):
-
-    Compute the parameters of a Gaussian Mixture Model using EM algorithm, 
-    with initial values w, mu and va (overwritten by the function).
-
-    Args:
-        - data:     contains the observed features, one row is one frame, ie one 
-        observation of dimension d
-        - niter:    number of iterations
-        - mode:     'diag' or 'full', depending on the wanted model for cov
-        matrices.
-        - K:        number of components
-        - w, mu, va initial parameters for the GMM. All or none must be given.
-        If no initial values are given, initialized by gmm_init_kmean; if they
-        are given, mode and k are ignored, and guessed from the given parameters
-        instead.
-
-    Returns:
-        w, mu, va, like as found by EM, where like is the likelihood for each 
-        iteration.
-    """
-    if len(w) == 0:
-        w, mu, va   = gmm_init_kmean(data, k, mode, niter = 5)
-    k, d, mode  = check_gmm_param(w, mu, va)
-    
-    like    = N.zeros(niter, float)
-    for i in range(niter):
-        g, tgd      = gmm_posterior(data, w, mu, va)
-        like[i]     = N.sum(N.log(N.sum(tgd, 1)))
-        w, mu, va   = gmm_update(data, g, d, k, mode)
-
-    return w, mu, va, like
-    
-def gmm_posterior(data, w, mu, va):
-    """ Computes the latent variable distribution (a 
-    posteriori probability) knowing the explicit data 
-    for the Gaussian model (w, mu, var): gamma(t, i) = 
-        P[state = i | observation = data(t); w, mu, va]
-
-    This is basically the E step of EM for GMM.
-   
-    the second returned value is the non normalized version 
-    of gamma, and may be needed for some computation, 
-    like eg likelihood"""
-    n   = data.shape[0]
-    K   = len(w)
-
-    # compute the gaussian pdf
-    tgd	= multiple_gauss_den(data, mu, va)
-    # multiply by the weight
-    tgd	*= w
-    # Normalize to get a pdf
-    gd	= tgd  / N.sum(tgd, axis=1)[:, N.NewAxis]
-
-    return gd, tgd
-
-def gmm_update(data, gamma, d, K, varmode):
-    """Computes update of the Gaussian Mixture Model (M step)
-    from the a posteriori pdf, computed by gmm_posterior
-    (E step).
-    """
-    n       = data.shape[0]
-    invn    = 1.0/n
-    mGamma  = N.sum(gamma)
-
-    if varmode == 'diag':
-        mu  = N.zeros((K, d), float)
-        va  = N.zeros((K, d), float)
-        for k in range(K):
-            x       = N.sum(N.outerproduct(gamma[:, k], 
-                        N.ones((1, d))) * data)
-            xx      = N.sum(N.outerproduct(gamma[:, k], 
-                        N.ones((1, d))) * (data ** 2))
-
-            mu[k,:] = x / mGamma[k]
-            va[k,:] = xx  / mGamma[k] - mu[k,:] ** 2
-        w   = invn * mGamma
-
-    elif varmode == 'full':
-        mu  = N.zeros((K, d), float)
-        va  = N.zeros((K*d, d), float)
-
-        for k in range(K):
-            x   = N.sum(N.outerproduct(gamma[:, k], 
-                        N.ones((1, d), float)) * data)
-            xx  = N.zeros((d, d), float)
-            
-            # This should be much faster than reecursing on n...
-            for i in range(d):
-                for j in range(d):
-                    xx[i,j] = N.sum(data[:,i] * data[:,j] * gamma[:,k])
-
-            mu[k,:] = x / mGamma[k]
-            va[k*d:k*d+d,:] = xx  / mGamma[k] - \
-                                N.outerproduct(mu[k,:], mu[k,:])
-        w   = invn * mGamma
-    else:
-        raise GmmParamError("varmode not recognized")
-
-    return w, mu, va
-
-# Misc functions
-def gmm_ellipses(mu, va, c = [0, 1], npoints = 100):
-    """Returns a list of ellipses describing the Gmm
-    defined by mu and va. c is the dimension we are projecting
-    the variances on a 2d space.
-    
-    Returns:
-        -Xe:    a list of x coordinates for the ellipses
-        -Ye:    a list of y coordinates for the ellipses
-
-    Example:
-        Suppose we have w, mu and va as parameters for a mixture, then:
-        
-        X       = gen_gmm(w, mu, va, 1000)
-        Xe, Ye  = gmm_ellipses(mu, va)
-        pylab.plot(X[:,0], X[:, 1], '.')
-        for k in len(w):
-            pylab.plot(Xe[k], Ye[k], 'r')
-            
-        Will plot samples X draw from the mixture model, and
-        plot the ellipses of equi-probability from the mean with
-        fixed level of confidence 0.39. 
-        
-    TODO: be able to modify the confidence interval to arbitrary
-    value (to do in gauss_ell)"""
-    K   = mu.shape[0]
-    w   = N.ones(K, float) / K
-    
-    K, d, mode  = check_gmm_param(w, mu, va)
-
-    # TODO: adjustable level (to do in gauss_ell). 
-    # For now, a level of 0.39 means that we draw
-    # ellipses for 1 standard deviation. 
-    Xe  = []
-    Ye  = []   
-    if mode == 'diag':
-        for i in range(K):
-            xe, ye  = densities.gauss_ell(mu[i,:], va[i,:], dim = c, 
-                    npoints = npoints)
-            Xe.append(xe)
-            Ye.append(ye)
-    elif mode == 'full':
-        for i in range(K):
-            xe, ye  = densities.gauss_ell(mu[i,:], va[i*d:i*d+d,:], dim = c, 
-                    npoints = npoints)
-            Xe.append(xe)
-            Ye.append(ye)
-
-    return Xe, Ye
-
 if __name__ == "__main__":
+    import copy
     #=============================
     # Simple GMM with 5 components
     #=============================
-    import pylab as P
-    k       = 5
-    d       = 5
-    mode    = 'diag'
 
+    #+++++++++++++++++++++++++++++
+    # Meta parameters of the model
+    #   - k: Number of components
+    #   - d: dimension of each Gaussian
+    #   - mode: Mode of covariance matrix: full or diag
+    #   - nframes: number of frames (frame = one data point = one
+    #   row of d elements)
+    k       = 3 
+    d       = 2         
+    mode    = 'diag'        
+    nframes = 1e3
+
+    #+++++++++++++++++++++++++++++++++++++++++++
+    # Create an artificial GMM model, samples it
+    #+++++++++++++++++++++++++++++++++++++++++++
     print "Generating the mixture"
     # Generate a model with k components, d dimensions
-    wr, mur, var    = gen_gmm_param(d, k, mode, 3)
-    X               = gen_gmm(wr, mur, var, 1e3)
+    w, mu, va   = GM.gen_param(d, k, mode, spread = 3)
+    gm          = GM(d, k, mode)
+    gm.set_param(w, mu, va)
 
-    print "Init the mixture"
-    # Init the mixture with kmean
-    w0, mu0, va0    = gmm_init_kmean(X, k, mode, niter = 5)
-    
-    # # Use random values instead of kmean
-    # w0  = N.ones(k, float) / k
-    # mu0 = N.randn(k, d)
-    # va0 = N.fabs(N.randn(k, d))
+    # Sample nframes frames  from the model
+    data    = gm.sample(nframes)
 
-    # Copy the initial values because we want to draw them later...
-    w   = w0.copy()
-    mu  = mu0.copy()
-    va  = va0.copy()
+    #++++++++++++++++++++++++
+    # Learn the model with EM
+    #++++++++++++++++++++++++
 
+    # Init the model
+    print "Init a model for learning, with kmean for initialization"
+    lgm = GM(d, k, mode)
+    gmm = GMM(lgm, 'kmean')
+    
+    gmm.init(data)
+    # Keep the initialized model for drawing
+    gm0 = copy.copy(lgm)
+
     # The actual EM, with likelihood computation
     niter   = 10
     like    = N.zeros(niter, float)
 
     print "computing..."
     for i in range(niter):
-        g, tgd  = gmm_posterior(X, w, mu, va)
+        g, tgd  = gmm.sufficient_statistics(data)
         like[i] = N.sum(N.log(N.sum(tgd, 1)))
-        w, mu, va   = gmm_update(X, g, d, k, mode)
+        gmm.update_em(data, g)
+    # # Alternative form, by using EM class: as the EM class
+    # # is quite rudimentary now, it is not very useful, just save
+    # # a few lines
+    # em      = EM()
+    # like    = em.train(data, gmm, niter)
 
+    #+++++++++++++++
+    # Draw the model
+    #+++++++++++++++
     print "drawing..."
-    # Draw what is happening
+    import pylab as P
     P.subplot(2, 1, 1)
-    P.plot(X[:, 0], X[:, 1], '.', label = '_nolegend_')
+    # Draw what is happening
+    P.plot(data[:, 0], data[:, 1], '.', label = '_nolegend_')
 
     # Real confidence ellipses
-    Xre, Yre  = gmm_ellipses(mur, var)
+    Xre, Yre  = gm.conf_ellipses()
     P.plot(Xre[0], Yre[0], 'g', label = 'true confidence ellipsoides')
     for i in range(1,k):
         P.plot(Xre[i], Yre[i], 'g', label = '_nolegend_')
 
     # Initial confidence ellipses as found by kmean
-    X0e, Y0e  = gmm_ellipses(mu0, va0)
+    X0e, Y0e  = gm0.conf_ellipses()
     P.plot(X0e[0], Y0e[0], 'k', label = 'initial confidence ellipsoides')
     for i in range(1,k):
         P.plot(X0e[i], Y0e[i], 'k', label = '_nolegend_')
 
     # Values found by EM
-    Xe, Ye  = gmm_ellipses(mu, va)
+    Xe, Ye  = lgm.conf_ellipses()
     P.plot(Xe[0], Ye[0], 'r', label = 'confidence ellipsoides found by EM')
     for i in range(1,k):
         P.plot(Xe[i], Ye[i], 'r', label = '_nolegend_')
     P.legend(loc = 0)
+
     P.subplot(2, 1, 2)
     P.plot(like)
     P.title('log likelihood')
 
+    # #++++++++++++++++++
     # # Export the figure
+    # #++++++++++++++++++
     # F   = P.gcf()
     # DPI = F.get_dpi()
     # DefaultSize = F.get_size_inches()



More information about the Scipy-svn mailing list