[Scipy-svn] r2268 - in trunk/Lib/sandbox/pyem: . pyem pyem/src

scipy-svn at scipy.org
Thu Oct 12 08:45:39 CDT 2006


Author: cdavid
Date: 2006-10-12 08:45:25 -0500 (Thu, 12 Oct 2006)
New Revision: 2268

Added:
   trunk/Lib/sandbox/pyem/.bzrignore
   trunk/Lib/sandbox/pyem/README
   trunk/Lib/sandbox/pyem/pyem/profile_densities.py
   trunk/Lib/sandbox/pyem/setup.py
Modified:
   trunk/Lib/sandbox/pyem/TODO
   trunk/Lib/sandbox/pyem/pyem/densities.py
   trunk/Lib/sandbox/pyem/pyem/gmm_em.py
   trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
Log:
[pyem @ david at ar.media.kyoto-u.ac.jp-20060713062008-b708a11f99096bc8]
Package uses distutils
David Cournapeau <david at ar.media.kyoto-u.ac.jp> | 2006-07-13 15:20:08 +0900 (Thu, 13 Jul 2006)

Added: trunk/Lib/sandbox/pyem/.bzrignore
===================================================================
--- trunk/Lib/sandbox/pyem/.bzrignore	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/.bzrignore	2006-10-12 13:45:25 UTC (rev 2268)
@@ -0,0 +1,7 @@
+dist
+pyem/src/c_gmm.c
+MANIFEST
+build
+pyem/bench1prof
+pyem/diag.dat
+pyem/gdenprof

Added: trunk/Lib/sandbox/pyem/README
===================================================================
--- trunk/Lib/sandbox/pyem/README	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/README	2006-10-12 13:45:25 UTC (rev 2268)
@@ -0,0 +1,12 @@
+Last Change: Wed Jul 12 04:00 PM 2006 J
+
+pyem is a python module built upon numpy and scipy
+(see http://www.scipy.org/) for learning mixture models
+using Expectation Maximization. For now, only Gaussian
+Mixture Models are implemented. Included features:
+    - computation of the Gaussian pdf for multivariate Gaussian
+    random vectors (spherical, diagonal and full covariance matrices)
+    - Sampling of Gaussian Mixture Models
+    - Confidence ellipsoids with probability 0.5
+    - Classic EM for Gaussian Mixture Models
+    - K-means based initialization

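For reference, the first item above (the Gaussian pdf with a diagonal covariance matrix) amounts to the computation sketched below. This is a minimal numpy sketch with an illustrative name (diag_gauss_pdf), not pyem's own gauss_den:

    import numpy as np

    def diag_gauss_pdf(x, mu, va):
        # Density of a Gaussian with mean mu (d,) and diagonal covariance
        # va (d,), evaluated row-wise on x of shape (n, d).
        d = x.shape[1]
        fac = 1.0 / np.sqrt((2 * np.pi) ** d * np.prod(va))
        return fac * np.exp(-0.5 * np.sum((x - mu) ** 2 / va, axis=1))

    # Example: 1000 two-dimensional frames under a standard normal model
    x = np.random.randn(1000, 2)
    y = diag_gauss_pdf(x, np.zeros(2), np.ones(2))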
Modified: trunk/Lib/sandbox/pyem/TODO
===================================================================
--- trunk/Lib/sandbox/pyem/TODO	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/TODO	2006-10-12 13:45:25 UTC (rev 2268)
@@ -1,7 +1,16 @@
-# Last Change: Fri Jun 30 07:00 PM 2006 J
+# Last Change: Wed Jul 12 05:00 PM 2006 J
 
-# In more urgent order:
+Things which must be implemented for a 1.0 version:
+    - tests for various data lengths and model sizes.
+    - refactoring with a class modelling mixture models.
+    - for Gaussian densities: compute level <-> confidence ellipses 
+    relationship with Chi2 model.
+    - C implementation of Gaussian densities at least.
+    - a small help/tutorial
 
-1: test and benchmarks for various length and model size
-2: refactoring for Mixture models, having online mode, etc...
-3: for gaussian densities: compute level <-> confidence ellipses relationship with Chi2 model
+Things which would be nice:
+    - Integrate libem (libem should be modified so
+    that it would be easy to package and distribute)
+    - On-line EM
+    - Other initialization schemes
+    - Other mixture models

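The "level <-> confidence ellipses" item above refers to a standard fact: for a d-dimensional Gaussian, the squared Mahalanobis distance follows a chi-square distribution with d degrees of freedom, so the ellipsoid containing probability mass p is the level set at the corresponding chi-square quantile. A hedged sketch using scipy.stats (not code from this commit):

    from scipy.stats import chi2

    def mahalanobis_radius2(p, d):
        # Squared Mahalanobis radius of the ellipsoid that contains
        # probability mass p of a d-dimensional Gaussian.
        return chi2.ppf(p, d)

    r2 = mahalanobis_radius2(0.5, 2)   # ~1.386 for the p = 0.5 ellipse in 2-d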
Modified: trunk/Lib/sandbox/pyem/pyem/densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/densities.py	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/pyem/densities.py	2006-10-12 13:45:25 UTC (rev 2268)
@@ -1,7 +1,7 @@
 #! /usr/bin/python
 #
 # Copyrighted David Cournapeau
-# Last Change: Fri Jun 30 06:00 PM 2006 J
+# Last Change: Thu Jul 13 03:00 PM 2006 J
 
 import numpy as N
 import numpy.linalg as lin
@@ -211,6 +211,30 @@
 
     return elps[0, :], elps[1, :]
 
+def generate_test_data(n, d, mode = 'diag', file='test.dat'):
+    """Generate a set of data of dimension d, with n frames,
+    that is input data, mean, var and output of gden, so that
+    other implementations can be tested against"""
+    mu  = N.randn(1, d)
+    if mode == 'diag':
+        va  = abs(N.randn(1, d))
+    elif mode == 'full':
+        va  = N.randn(d, d)
+        va  = N.matrixmultiply(va, va.transpose())
+
+    input   = N.randn(n, d)
+    output  = gauss_den(input, mu, va)
+
+    import tables
+    h5file  = tables.openFile(file, "w")
+
+    h5file.createArray(h5file.root, 'input', input)
+    h5file.createArray(h5file.root, 'mu', mu)
+    h5file.createArray(h5file.root, 'va', va)
+    h5file.createArray(h5file.root, 'output', output)
+
+    h5file.close()
+
 def test_gauss_den():
     """"""
     # import tables

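The new generate_test_data helper can be used as below; a hedged usage example, assuming PyTables is installed (the function imports tables):

    import densities

    # Dump 1000 frames of 20-dimensional data, together with the mean,
    # variance and gauss_den output, to an HDF5 file for cross-checking
    # other implementations.
    densities.generate_test_data(1000, 20, mode='diag', file='diag.dat')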
Modified: trunk/Lib/sandbox/pyem/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/pyem/gmm_em.py	2006-10-12 13:45:25 UTC (rev 2268)
@@ -1,15 +1,6 @@
 # /usr/bin/python
-# Last Change: Fri Jun 30 07:00 PM 2006 J
+# Last Change: Thu Jul 13 02:00 PM 2006 J
 
-# TODO:
-#   - interface with libem
-#   - implements E and M in batch mode for mixtures, with special case for gmm 
-#   ( general EM should be outside this module. This one should be GMM specific, maybe)
-#   - implements E and M in online mode
-#   - python has list. Does it make sense to keep the h2m format for mean, va, when
-#   we can use list instead ?
-#   - a Gaussian Mixture Model should be a class, really !
-
 import numpy as N
 import numpy.linalg as lin
 import densities
@@ -440,7 +431,6 @@
 
     return Xe, Ye
 
-import c_gmm
 def kmean(data, init, iter = 10):
     """Simple kmean implementation for EM
     
@@ -462,14 +452,14 @@
     for i in range(iter):
         # Compute the nearest neighbour for each obs
         # using the current code book
-        label   = c_gmm._vq(data, code)
+        label   = _vq(data, code)
         # Update the code by computing centroids using the new code book
         for j in range(k):
             code[j,:] = N.mean(data[N.where(label==j)]) 
 
     return code, label
 
-def _vq(data, code):
+def _py_vq(data, code):
     """ Please do not use directly. Use kmean instead"""
     # No attempt to be efficient has been made...
     (n, d)  = data.shape
@@ -482,6 +472,15 @@
 
     return label
     
+# Try to import the pyrex function for vector quantization. If it is not
+# available, fall back on the pure python implementation.
+try:
+    from c_gmm import _vq
+except:
+    print """c_gmm._vq not found, using pure python implementation instead.
+        Kmean will be REALLY slow"""
+    _vq = _py_vq
+
 # Test functions usable for now
 def test_kmean():
     X   = N.array([[3.0, 3], [4, 3], [4, 2],
@@ -671,18 +670,13 @@
         print "update"
         w, mu, va   = gmm_update(X, g, d, k, mode)
 
-
-def benchmark():
-    import profile
-    profile.run('_bench1()', 'bench1prof')
-
 if __name__ == "__main__":
     #=============================
     # Simple GMM with 3 components
     #=============================
     import pylab as P
-    k       = 4
-    d       = 2
+    k       = 5
+    d       = 3
     mode    = 'diag'
 
     print "Generating the mixture"

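The _vq step labels each observation with the index of its nearest code word. For comparison, here is a hedged, vectorized numpy sketch of the same assignment (vq_labels is an illustrative name, not the pyrex _vq from this commit):

    import numpy as np

    def vq_labels(data, code):
        # data has shape (n, d), code has shape (k, d); return for each row
        # of data the index of the nearest row of code (Euclidean distance).
        dist2 = ((data[:, np.newaxis, :] - code[np.newaxis, :, :]) ** 2).sum(axis=2)
        return dist2.argmin(axis=1)

    # Example with three code words in two dimensions
    data = np.random.randn(100, 2)
    code = np.array([[0.0, 0.0], [3.0, 3.0], [-3.0, 3.0]])
    labels = vq_labels(data, code)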
Added: trunk/Lib/sandbox/pyem/pyem/profile_densities.py
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/profile_densities.py	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/pyem/profile_densities.py	2006-10-12 13:45:25 UTC (rev 2268)
@@ -0,0 +1,45 @@
+import numpy as N
+import densities as D
+import tables
+
+def bench1(mode = 'diag'):
+    #===========================================
+    # Diag Gaussian of dimension 20
+    #===========================================
+    d       = 20
+    n       = 1e5
+    niter   = 1
+    mode    = 'diag'
+
+    # Generate a model with k components, d dimensions
+    mu  = N.randn(1, d)
+    if mode == 'diag':
+        va  = abs(N.randn(1, d))
+    elif mode == 'full':
+        va  = N.randn(d, d)
+        va  = N.matrixmultiply(va, va.transpose())
+
+    X   = N.randn(n, d)
+    print "Compute %d times densities, %d dimension, %d frames" % (niter, d, n)
+    for i in range(niter):
+        Y   = D.gauss_den(X, mu, va)
+    
+    # Check values
+    h5file  = tables.openFile('diag.dat', "r")
+    X   = h5file.root.input.read()
+    mu  = h5file.root.mu.read()
+    va  = h5file.root.va.read()
+    Yt  = h5file.root.output.read()
+    Y   = D.gauss_den(X, mu, va)
+    try:
+        N.testing.assert_equal(Y, Yt)
+    except AssertionError:
+        raise AssertionError("gauss_den does not match the reference output")
+
+if __name__ == "__main__":
+    import profile
+    profile.run('bench1()', 'gdenprof')
+    import pstats
+    p = pstats.Stats('gdenprof')
+    print p.sort_stats('cumulative').print_stats(20)
+

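Running this script directly writes the profile data to 'gdenprof'; the saved statistics can also be inspected afterwards with standard pstats calls, for example:

    import pstats

    p = pstats.Stats('gdenprof')
    # 20 most expensive calls, sorted by cumulative time
    p.strip_dirs().sort_stats('cumulative').print_stats(20)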
Modified: trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx
===================================================================
--- trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/pyem/src/c_gmm.pyx	2006-10-12 13:45:25 UTC (rev 2268)
@@ -1,4 +1,4 @@
-# Last Change: Tue May 16 11:00 AM 2006 J
+# Last Change: Thu Jul 13 02:00 PM 2006 J
 
 cimport c_numpy
 cimport c_python
@@ -7,7 +7,7 @@
 c_numpy.import_array()
 
 # pyrex version of _vq. Much faster in high dimension/high number of K 
-# (ie more than 3-4)
+# (ie K > 3-4)
 def _vq(data, init):
     (n, d)  = data.shape
     label   = numpy.zeros(n, int)
@@ -64,3 +64,35 @@
         labeld[i]   = lab
 
     return lab
+
+# # Scalar gaussian a posteriori pdf
+# def _scalar_gauss_den(c_numpy.ndarray x, double mu, 
+#         double va, int log):
+#     """ This function is the actual implementation
+#     of gaussian pdf in scalar case. It assumes all args
+#     are conformant, so it should not be used directly
+#     
+#     ** Expect centered data (ie with mean removed) **
+# 
+#     Call gauss_den instead"""
+#     cdef int d, n
+#     cdef double inva, fac
+#     cdef double* data
+# 
+#     if not x.dtype == numpy.dtype(numpy.float64):
+#         print '_scalar_gauss_den not (yet) implemented for dtype %s'%dtype.name
+#         return
+#     data  = (<double*>x.data)
+# 
+#     d       = x.dimensions[1]
+#     n       = x.dimensions[0]
+#     inva    = 1/va
+#     fac     = (2*N.pi) ** (-d/2.0) * N.sqrt(inva)
+#     y       = (x ** 2) * -0.5 * inva
+#     if not log:
+#         y   = fac * N.exp(y)
+#     else:
+#         y   = y + log(fac)
+# 
+#     return y
+#     

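The commented-out _scalar_gauss_den corresponds to the one-dimensional Gaussian density evaluated on already-centered data. A hedged numpy sketch of the same formula (illustrative only, not the pyrex implementation):

    import numpy as np

    def scalar_gauss_den(x, va, log=False):
        # Density of a zero-mean 1-d Gaussian with variance va, evaluated
        # on centered data x (mean already removed).
        fac = 1.0 / np.sqrt(2 * np.pi * va)
        y = -0.5 * x ** 2 / va
        if log:
            return y + np.log(fac)
        return fac * np.exp(y)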
Added: trunk/Lib/sandbox/pyem/setup.py
===================================================================
--- trunk/Lib/sandbox/pyem/setup.py	2006-10-12 13:45:14 UTC (rev 2267)
+++ trunk/Lib/sandbox/pyem/setup.py	2006-10-12 13:45:25 UTC (rev 2268)
@@ -0,0 +1,59 @@
+#! /usr/bin/env python
+# Last Change: Thu Jul 13 01:00 PM 2006 J
+""" pyem is a small python package to estimate Gaussian Mixtures Models
+from data, using Expectation Maximization"""
+from distutils.core import setup, Extension
+from pyem import version as pyem_version
+
+# distutils does not update MANIFEST correctly, so remove it
+import os
+if os.path.exists('MANIFEST'): os.remove('MANIFEST')
+
+from numpy.distutils.misc_util import get_numpy_include_dirs
+NUMPYINC    = get_numpy_include_dirs()[0]
+print NUMPYINC
+
+# General variables:
+#   - DISTNAME: name of the distributed package
+#   - VERSION: the version reference is in pyem/__init__.py file
+#   - other upper cased variables are the same as the corresponding
+#   keywords in setup call
+DISTNAME    = 'pyem' 
+VERSION     = pyem_version
+DESCRIPTION ='A python module for Expectation Maximization learning of mixtures pdf'
+AUTHOR      ='David Cournapeau'
+AUTHOR_EMAIL='david at ar.media.kyoto-u.ac.jp'
+URL         ='http://ar.media.kyoto-u.ac.jp/members/david'
+
+def setup_pyrex():
+    setup(name=DISTNAME,
+        version=VERSION,
+        description=DESCRIPTION,
+        author=AUTHOR,
+        author_email=AUTHOR_EMAIL,
+        url=URL,
+        packages=['pyem'], 
+        ext_modules=[Extension('pyem/c_gmm', ['pyem/src/c_gmm.pyx'], 
+            include_dirs=[NUMPYINC])],
+        data_files=['c_numpy.pxd', 'c_python.pxd'],
+        cmdclass = {'build_ext': build_ext},
+    )
+
+def setup_nopyrex():
+    setup(name=DISTNAME,
+        version=VERSION,
+        description=DESCRIPTION,
+        author=AUTHOR,
+        author_email=AUTHOR_EMAIL,
+        url=URL,
+        packages=['pyem'], 
+        #ext_modules=[Extension('_hello', ['hellomodule.c'])],
+    )
+
+try:
+    from Pyrex.Distutils import build_ext
+    setup_pyrex()
+except:
+    print "Pyrex not found, C extension won't be available"
+    setup_nopyrex()
+

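With this setup.py, the package builds in the usual distutils way; when Pyrex is found the c_gmm extension is compiled, otherwise only the pure python modules are installed. The standard commands (not part of this commit) are:

    python setup.py build
    python setup.py install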

