[Scipy-svn] r3099 - in trunk/Lib/sandbox/pyem: . tests

scipy-svn@scip... scipy-svn@scip...
Tue Jun 12 07:21:14 CDT 2007

```Author: cdavid
Date: 2007-06-12 07:21:04 -0500 (Tue, 12 Jun 2007)
New Revision: 3099

Modified:
trunk/Lib/sandbox/pyem/densities.py
trunk/Lib/sandbox/pyem/gauss_mix.py
trunk/Lib/sandbox/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/tests/test_densities.py
trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py
trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
Log:
Add function to compute log responsabilities with logsumexp.

Modified: trunk/Lib/sandbox/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/densities.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/densities.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,7 +1,7 @@
#! /usr/bin/python
#
-# Last Change: Tue Jun 12 12:00 PM 2007 J
+# Last Change: Tue Jun 12 03:00 PM 2007 J
"""This module implements various basic functions related to multivariate
gaussian, such as pdf estimation, confidence interval/ellipsoids, etc..."""

@@ -167,14 +167,6 @@
inva    = lin.inv(va)
fac     = 1 / N.sqrt( (2*N.pi) ** d * N.fabs(lin.det(va)))

-    # # Slow version
-    # n       = N.size(x, 0)
-    # y       = N.zeros(n)
-    # for i in range(n):
-    #     y[i] = N.dot(x[i,:],
-    #              N.dot(inva, N.transpose(x[i,:])))
-    # y *= -0.5
-
# we are using a trick with sum to "emulate"
# the matrix multiplication inva * x without any explicit loop
y   = N.dot((x-mu), inva)

Modified: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Mon Jun 11 06:00 PM 2007 J
+# Last Change: Tue Jun 12 03:00 PM 2007 J

"""Module implementing GM, a class which represents Gaussian mixtures.

@@ -12,7 +12,7 @@
import numpy as N
from numpy.random import randn, rand
import numpy.linalg as lin
-import densities
+import densities as D
import misc

# Right now, two main usages of a Gaussian Model are possible
@@ -276,13 +276,13 @@
Ye  = []
if self.mode == 'diag':
for i in range(self.k):
-                xe, ye  = densities.gauss_ell(self.mu[i, :], self.va[i, :],
+                xe, ye  = D.gauss_ell(self.mu[i, :], self.va[i, :],
dim, npoints, level)
Xe.append(xe)
Ye.append(ye)
elif self.mode == 'full':
for i in range(self.k):
-                xe, ye  = densities.gauss_ell(self.mu[i, :],
+                xe, ye  = D.gauss_ell(self.mu[i, :],
self.va[i*self.d:i*self.d+self.d, :],
dim, npoints, level)
Xe.append(xe)
@@ -317,6 +317,7 @@
raise GmParamError("Unknown mode")

return True
+
@classmethod
def gen_param(cls, d, nc, varmode = 'diag', spread = 1):
"""Generate random, valid parameters for a gaussian mixture model.
@@ -366,6 +367,27 @@
# def _regularize(self):
#     raise NotImplemented("No regularization")

+    def pdf(self, x, log = False):
+        """Computes the pdf of the model at given points.
+
+        :Parameters:
+            x : ndarray
+                points where to estimate the pdf. One row for one
+                multi-dimensional sample (eg to estimate the pdf at 100
+                different points in 10 dimension, data's shape should be (100,
+                20)).
+            log : bool
+                If true, returns the log pdf instead of the pdf.
+
+        :Returns:
+            y : ndarray
+                the pdf at points x."""
+        if log:
+            return D.logsumexp(N.sum(
+                    D.multiple_gauss_den(x, self.mu, self.va, log = True) * self.w, 1))
+        else:
+            return N.sum(D.multiple_gauss_den(x, self.mu, self.va) * self.w, 1)
+
#=================
# Plotting methods
#=================
@@ -572,7 +594,7 @@
# XXX refactor computing pdf
dmu = self.mu[:, dim]
dva = self._get_va(dim)
-        den = densities.multiple_gauss_den(xdata, dmu, dva) * self.w
+        den = D.multiple_gauss_den(xdata, dmu, dva) * self.w
den = N.sum(den, 1)
den = den.reshape(len(rangey), len(rangex))

Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/gmm_em.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Tue Jun 12 11:00 AM 2007 J
+# Last Change: Tue Jun 12 08:00 PM 2007 J

"""Module implementing GMM, a class to estimate Gaussian mixture models using
EM, and EM, a class which use GMM instances to estimate models parameters using
@@ -159,7 +159,7 @@
knowing the explicit data for the Gaussian model (w, mu, var): gamma(t,
i) = P[state = i | observation = data(t); w, mu, va]

-        This is basically the E step of EM for GMM."""
+        This is basically the E step of EM for finite mixtures."""
# compute the gaussian pdf
tgd	= densities.multiple_gauss_den(data, self.gm.mu, self.gm.va)
# multiply by the weight
@@ -169,6 +169,28 @@

return gd, tgd

+    def compute_log_responsabilities(self, data):
+        """Compute log responsabilities.
+
+        Return normalized and non-normalized responsabilities for the model (in
+        the log domain)
+
+        Note
+        ----
+        Computes the latent variable distribution (a posteriori probability)
+        knowing the explicit data for the Gaussian model (w, mu, var): gamma(t,
+        i) = P[state = i | observation = data(t); w, mu, va]
+
+        This is basically the E step of EM for finite mixtures."""
+        # compute the gaussian pdf
+        tgd	= densities.multiple_gauss_den(data, self.gm.mu, self.gm.va, log = True)
+        # multiply by the weight
+        tgd	+= N.log(self.gm.w)
+        # Normalize to get a pdf
+        gd	= tgd  - densities.logsumexp(tgd)[:, N.newaxis]
+
+        return gd, tgd
+
def update_em(self, data, gamma):
"""Computes update of the Gaussian Mixture Model (M step)
from the a posteriori pdf, computed by gmm_posterior

Modified: trunk/Lib/sandbox/pyem/tests/test_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_densities.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/tests/test_densities.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
#! /usr/bin/env python
-# Last Change: Tue Jun 12 12:00 PM 2007 J
+# Last Change: Tue Jun 12 08:00 PM 2007 J

# TODO:
#   - having "fake tests" to check that all mode (scalar, diag and full) are
@@ -21,7 +21,7 @@
# import modules that are located in the same directory as this file.
restore_path()

-DEF_DEC = 12
+from testcommon import DEF_DEC

class TestDensities(NumpyTestCase):
def _generate_test_data_1d(self):

Modified: trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/tests/test_gauss_mix.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
#! /usr/bin/env python
-# Last Change: Mon Jun 11 03:00 PM 2007 J
+# Last Change: Tue Jun 12 03:00 PM 2007 J

# For now, just test that all mode/dim execute correctly

@@ -10,6 +10,7 @@

set_package_path()
from pyem import GM
+from pyem.densities import multiple_gauss_den
restore_path()

class test_BasicFunc(NumpyTestCase):
@@ -58,6 +59,16 @@
sva = gm._get_va(dim)
assert N.all(sva == tva)

+    def test_2d_diag_pdf(self):
+        d = 2
+        w = N.array([0.4, 0.6])
+        mu = N.array([[0., 2], [-1, -2]])
+        va = N.array([[1, 0.5], [0.5, 1]])
+        x = N.random.randn(100, 2)
+        gm = GM.fromvalues(w, mu, va)
+        y1 = N.sum(multiple_gauss_den(x, mu, va) * w, 1)
+        y2 = gm.pdf(x)
+        assert_array_almost_equal(y1, y2)

if __name__ == "__main__":
NumpyTest().run()

Modified: trunk/Lib/sandbox/pyem/tests/test_gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2007-06-12 04:04:14 UTC (rev 3098)
+++ trunk/Lib/sandbox/pyem/tests/test_gmm_em.py	2007-06-12 12:21:04 UTC (rev 3099)
@@ -1,5 +1,5 @@
#! /usr/bin/env python
-# Last Change: Tue Jun 12 11:00 AM 2007 J
+# Last Change: Tue Jun 12 09:00 PM 2007 J

# For now, just test that all mode/dim execute correctly

@@ -12,6 +12,8 @@
from pyem import GMM, GM, EM
restore_path()

+from testcommon import DEF_DEC
+
dic = loadmat(filename, squeeze_me = False)
@@ -162,5 +164,46 @@
assert_array_equal(gmm.gm.mu, dic['mu'])
assert_array_equal(gmm.gm.va, dic['va'])

+class test_log_domain(EmTest):
+    """This class tests whether the GMM works in log domain."""
+    def _test_common(self, d, k, mode):
+        dic = load_dataset('%s_%dd_%dk.mat' % (mode, d, k))
+
+        gm = GM.fromvalues(dic['w0'], dic['mu0'], dic['va0'])
+        gmm = GMM(gm, 'test')
+
+        a, na = gmm.compute_responsabilities(dic['data'])
+        la, nla = gmm.compute_log_responsabilities(dic['data'])
+
+        ta = N.log(a)
+        tna = N.log(na)
+        if not N.all(N.isfinite(ta)):
+            print "precision problem for %s, %dd, %dk, need fixing" % (mode, d, k)
+        else:
+            assert_array_almost_equal(ta, la, DEF_DEC)
+
+        if not N.all(N.isfinite(tna)):
+            print "precision problem for %s, %dd, %dk, need fixing" % (mode, d, k)
+        else:
+            assert_array_almost_equal(tna, nla, DEF_DEC)
+
+    def test_2d_diag(self, level = 1):
+        d = 2
+        k = 3
+        mode = 'diag'
+        self._test_common(d, k, mode)
+
+    def test_1d_full(self, level = 1):
+        d = 1
+        k = 4
+        mode = 'diag'
+        self._test_common(d, k, mode)
+
+    def test_2d_full(self, level = 1):
+        d = 2
+        k = 3
+        mode = 'full'
+        self._test_common(d, k, mode)
+
if __name__ == "__main__":
NumpyTest().run()

```