[Scipy-svn] r4758 - in trunk/scipy/cluster: . src tests

scipy-svn@scip... scipy-svn@scip...
Tue Sep 30 14:15:04 CDT 2008

Author: damian.eads
Date: 2008-09-30 14:14:59 -0500 (Tue, 30 Sep 2008)
New Revision: 4758

Modified:
trunk/scipy/cluster/distance.py
trunk/scipy/cluster/hierarchy.py
trunk/scipy/cluster/src/distance.c
trunk/scipy/cluster/src/distance.h
trunk/scipy/cluster/src/distance_wrap.c
trunk/scipy/cluster/tests/test_distance.py
Log:
Fixed minor bug in cophenet.

Modified: trunk/scipy/cluster/distance.py
===================================================================
--- trunk/scipy/cluster/distance.py	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/distance.py	2008-09-30 19:14:59 UTC (rev 4758)
@@ -199,6 +199,35 @@
raise ValueError("p must be at least 1")
return (abs(u-v)**p).sum() ** (1.0 / p)

+def wminkowski(u, v, p, w):
+    """
+    Computes the weighted Minkowski distance between two vectors u
+    and v, defined as
+
+    .. math::
+
+       \sum {(w_i*|u_i - v_i|)^p})^(1/p).
+
+    :Parameters:
+       u : ndarray
+           An :math:n-dimensional vector.
+       v : ndarray
+           An :math:n-dimensional vector.
+       p : ndarray
+           The norm of the difference :math:${||u-v||}_p$.
+       w : ndarray
+           The weight vector.
+
+    :Returns:
+       d : double
+           The Minkowski distance between vectors u and v.
+    """
+    u = np.asarray(u)
+    v = np.asarray(v)
+    if p < 1:
+        raise ValueError("p must be at least 1")
+    return ((w * abs(u-v))**p).sum() ** (1.0 / p)
+
def euclidean(u, v):
"""
Computes the Euclidean distance between two n-vectors u and v,
@@ -817,6 +846,14 @@
'jaccard', 'kulsinski', 'mahalanobis', 'matching',
'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'.
+       w : ndarray
+           The weight vector (for weighted Minkowski).
+       p : double
+           The p-norm to apply (for Minkowski, weighted and unweighted)
+       V : ndarray
+           The variance vector (for standardized Euclidean).
+       VI : ndarray
+           The inverse of the covariance matrix (for Mahalanobis).

:Returns:
Y : ndarray
@@ -978,6 +1015,11 @@
Computes the Sokal-Sneath distance between each pair of
boolean vectors. (see sokalsneath function documentation)

+    22. Y = pdist(X, 'wminkowski')
+
+       Computes the weighted Minkowski distance between each pair of
+       vectors. (see wminkowski function documentation)
+
22. Y = pdist(X, f)

Computes the distance between all pairs of vectors in X
@@ -1036,6 +1078,11 @@
for j in xrange(i+1, m):
dm[k] = minkowski(X[i, :], X[j, :], p)
k = k + 1
+        elif metric == wminkowski:
+            for i in xrange(0, m - 1):
+                for j in xrange(i+1, m):
+                    dm[k] = wminkowski(X[i, :], X[j, :], p, w)
+                    k = k + 1
elif metric == seuclidean:
for i in xrange(0, m - 1):
for j in xrange(i+1, m):
@@ -1079,6 +1126,9 @@
_distance_wrap.pdist_chebyshev_wrap(_convert_to_double(X), dm)
elif mstr in set(['minkowski', 'mi', 'm']):
_distance_wrap.pdist_minkowski_wrap(_convert_to_double(X), dm, p)
+        elif mstr in set(['wminkowski', 'wmi', 'wm', 'wpnorm']):
+            _distance_wrap.cdist_weighted_minkowski_wrap(_convert_to_double(X),
+                                                         dm, p, w)
elif mstr in set(['seuclidean', 'se', 's']):
if V is not None:
V = np.asarray(V)
@@ -1174,7 +1224,9 @@
elif metric == 'test_cityblock':
dm = pdist(X, cityblock)
elif metric == 'test_minkowski':
-            dm = pdist(X, minkowski, p)
+            dm = pdist(X, minkowski, p=p)
+        elif metric == 'test_wminkowski':
+            dm = pdist(X, wminkowski, p=p, w=w)
elif metric == 'test_cosine':
dm = pdist(X, cosine)
elif metric == 'test_correlation':
@@ -1502,7 +1554,7 @@
return d

-def cdist(XA, XB, metric='euclidean', p=2, V=None, VI=None):
+def cdist(XA, XB, metric='euclidean', p=2, V=None, VI=None, w=None):
"""
Computes distance between each pair of observations between two
collections of vectors. XA is a :math:$m_A$ by :math:$n$
@@ -1529,8 +1581,18 @@
'correlation', 'cosine', 'dice', 'euclidean', 'hamming',
'jaccard', 'kulsinski', 'mahalanobis', 'matching',
'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
-           'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'.
+           'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski',
+           'yule'.
+       w : ndarray
+           The weight vector (for weighted Minkowski).
+       p : double
+           The p-norm to apply (for Minkowski, weighted and unweighted)
+       V : ndarray
+           The variance vector (for standardized Euclidean).
+       VI : ndarray
+           The inverse of the covariance matrix (for Mahalanobis).

+
:Returns:
Y : ndarray
A :math:$m_A$ by :math:$m_B$ distance matrix.
@@ -1688,11 +1750,17 @@

21. Y = cdist(X, 'sokalsneath')

-       Computes the Sokal-Sneath distance between each pair of
-       boolean vectors. (see sokalsneath function documentation)
+       Computes the Sokal-Sneath distance between the vectors. (see
+       sokalsneath function documentation)

-    22. Y = cdist(X, f)

+    22. Y = cdist(X, 'wminkowski')
+
+       Computes the weighted Minkowski distance between the
+       vectors. (see sokalsneath function documentation)
+
+    23. Y = cdist(X, f)
+
Computes the distance between all pairs of vectors in X
using the user supplied 2-arity function f. For example,
Euclidean distance between the vectors could be computed
@@ -1755,6 +1823,10 @@
for i in xrange(0, mA):
for j in xrange(0, mB):
dm[i, j] = minkowski(XA[i, :], XB[j, :], p)
+        elif metric == wminkowski:
+            for i in xrange(0, mA):
+                for j in xrange(0, mB):
+                    dm[i, j] = wminkowski(XA[i, :], XB[j, :], p, w)
elif metric == seuclidean:
for i in xrange(0, mA):
for j in xrange(0, mB):
@@ -1800,9 +1872,12 @@
elif mstr in set(['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch']):
_distance_wrap.cdist_chebyshev_wrap(_convert_to_double(XA),
_convert_to_double(XB), dm)
-        elif mstr in set(['minkowski', 'mi', 'm']):
+        elif mstr in set(['minkowski', 'mi', 'm', 'pnorm']):
_distance_wrap.cdist_minkowski_wrap(_convert_to_double(XA),
_convert_to_double(XB), dm, p)
+        elif mstr in set(['wminkowski', 'wmi', 'wm', 'wpnorm']):
+            _distance_wrap.cdist_weighted_minkowski_wrap(_convert_to_double(XA),
+                                                         _convert_to_double(XB), dm, p, _convert_to_double(w))
elif mstr in set(['seuclidean', 'se', 's']):
if V is not None:
V = np.asarray(V)
@@ -1921,7 +1996,9 @@
elif metric == 'test_cityblock':
dm = cdist(XA, XB, cityblock)
elif metric == 'test_minkowski':
-            dm = cdist(XA, XB, minkowski, p)
+            dm = cdist(XA, XB, minkowski, p=p)
+        elif metric == 'test_wminkowski':
+            dm = cdist(XA, XB, wminkowski, p=p, w=w)
elif metric == 'test_cosine':
dm = cdist(XA, XB, cosine)
elif metric == 'test_correlation':

Modified: trunk/scipy/cluster/hierarchy.py
===================================================================
--- trunk/scipy/cluster/hierarchy.py	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/hierarchy.py	2008-09-30 19:14:59 UTC (rev 4758)
@@ -910,14 +910,13 @@
correlation coefficient and d is the condensed cophentic
distance matrix (upper triangular form).
"""
-    Z = np.asarray(Z)
-
nargs = len(args)

if nargs < 1:
raise ValueError('At least one argument must be passed to cophenet.')

Z = args[0]
+    Z = np.asarray(Z)
Zs = Z.shape
n = Zs[0] + 1
@@ -932,6 +931,7 @@
return zz

Y = args[1]
+    Y = np.asarray(Y)
Ys = Y.shape
distance.is_valid_y(Y, throw=True, name='Y')

Modified: trunk/scipy/cluster/src/distance.c
===================================================================
--- trunk/scipy/cluster/src/distance.c	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/src/distance.c	2008-09-30 19:14:59 UTC (rev 4758)
@@ -294,6 +294,16 @@
return pow(s, 1.0 / p);
}

+double weighted_minkowski_distance(const double *u, const double *v, int n, double p, const double *w) {
+  int i = 0;
+  double s = 0.0, d;
+  for (i = 0; i < n; i++) {
+    d = fabs(u[i] - v[i]) * w[i];
+    s = s + pow(d, p);
+  }
+  return pow(s, 1.0 / p);
+}
+
void compute_mean_vector(double *res, const double *X, int m, int n) {
int i, j;
const double *v;
@@ -489,6 +499,19 @@
}
}

+void pdist_weighted_minkowski(const double *X, double *dm, int m, int n, double p, const double *w) {
+  int i, j;
+  const double *u, *v;
+  double *it = dm;
+  for (i = 0; i < m; i++) {
+    for (j = i + 1; j < m; j++, it++) {
+      u = X + (n * i);
+      v = X + (n * j);
+      *it = weighted_minkowski_distance(u, v, n, p, w);
+    }
+  }
+}
+
void pdist_yule_bool(const char *X, double *dm, int m, int n) {
int i, j;
const char *u, *v;
@@ -813,6 +836,19 @@
}
}

+void cdist_weighted_minkowski(const double *XA, const double *XB, double *dm, int mA, int mB, int n, double p, const double *w) {
+  int i, j;
+  const double *u, *v;
+  double *it = dm;
+  for (i = 0; i < mA; i++) {
+    for (j = 0; j < mB; j++, it++) {
+      u = XA + (n * i);
+      v = XB + (n * j);
+      *it = weighted_minkowski_distance(u, v, n, p, w);
+    }
+  }
+}
+
void cdist_yule_bool(const char *XA, const char *XB, double *dm, int mA, int mB, int n) {
int i, j;
const char *u, *v;

Modified: trunk/scipy/cluster/src/distance.h
===================================================================
--- trunk/scipy/cluster/src/distance.h	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/src/distance.h	2008-09-30 19:14:59 UTC (rev 4758)
@@ -55,6 +55,7 @@
void pdist_jaccard_bool(const char *X, double *dm, int m, int n);
void pdist_kulsinski_bool(const char *X, double *dm, int m, int n);
void pdist_minkowski(const double *X, double *dm, int m, int n, double p);
+void pdist_weighted_minkowski(const double *X, double *dm, int m, int n, double p, const double *w);
void pdist_yule_bool(const char *X, double *dm, int m, int n);
void pdist_matching_bool(const char *X, double *dm, int m, int n);
void pdist_dice_bool(const char *X, double *dm, int m, int n);
@@ -93,6 +94,8 @@
int mA, int mB, int n);
void cdist_minkowski(const double *XA, const double *XB, double *dm,
int mA, int mB, int n, double p);
+void cdist_weighted_minkowski(const double *XA, const double *XB, double *dm,
+			      int mA, int mB, int n, double p, const double *w);
void cdist_yule_bool(const char *XA, const char *XB, double *dm,
int mA, int mB, int n);
void cdist_matching_bool(const char *XA, const char *XB, double *dm,

Modified: trunk/scipy/cluster/src/distance_wrap.c
===================================================================
--- trunk/scipy/cluster/src/distance_wrap.c	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/src/distance_wrap.c	2008-09-30 19:14:59 UTC (rev 4758)
@@ -347,12 +347,36 @@
mA = XA_->dimensions[0];
mB = XB_->dimensions[0];
n = XA_->dimensions[1];
-
cdist_minkowski(XA, XB, dm, mA, mB, n, p);
}
return Py_BuildValue("d", 0.0);
}

+extern PyObject *cdist_weighted_minkowski_wrap(PyObject *self, PyObject *args) {
+  PyArrayObject *XA_, *XB_, *dm_, *w_;
+  int mA, mB, n;
+  double *dm;
+  const double *XA, *XB, *w;
+  double p;
+  if (!PyArg_ParseTuple(args, "O!O!O!dO!",
+			&PyArray_Type, &XA_, &PyArray_Type, &XB_,
+			&PyArray_Type, &dm_,
+			&p,
+			&PyArray_Type, &w_)) {
+    return 0;
+  }
+  else {
+    XA = (const double*)XA_->data;
+    XB = (const double*)XB_->data;
+    w = (const double*)w_->data;
+    dm = (double*)dm_->data;
+    mA = XA_->dimensions[0];
+    mB = XB_->dimensions[0];
+    n = XA_->dimensions[1];
+    cdist_weighted_minkowski(XA, XB, dm, mA, mB, n, p, w);
+  }
+  return Py_BuildValue("d", 0.0);
+}

extern PyObject *cdist_yule_bool_wrap(PyObject *self, PyObject *args) {
PyArrayObject *XA_, *XB_, *dm_;
@@ -824,7 +848,31 @@
return Py_BuildValue("d", 0.0);
}

+extern PyObject *pdist_weighted_minkowski_wrap(PyObject *self, PyObject *args) {
+  PyArrayObject *X_, *dm_, *w_;
+  int m, n;
+  double *dm, *X, *w;
+  double p;
+  if (!PyArg_ParseTuple(args, "O!O!dO!",
+			&PyArray_Type, &X_,
+			&PyArray_Type, &dm_,
+			&p,
+			&PyArray_Type, &w_)) {
+    return 0;
+  }
+  else {
+    X = (double*)X_->data;
+    dm = (double*)dm_->data;
+    w = (const double*)w_->data;
+    m = X_->dimensions[0];
+    n = X_->dimensions[1];

+    pdist_weighted_minkowski(X, dm, m, n, p, w);
+  }
+  return Py_BuildValue("d", 0.0);
+}
+
+
extern PyObject *pdist_yule_bool_wrap(PyObject *self, PyObject *args) {
PyArrayObject *X_, *dm_;
int m, n;
@@ -1048,6 +1096,7 @@
{"cdist_mahalanobis_wrap", cdist_mahalanobis_wrap, METH_VARARGS},
{"cdist_matching_bool_wrap", cdist_matching_bool_wrap, METH_VARARGS},
{"cdist_minkowski_wrap", cdist_minkowski_wrap, METH_VARARGS},
+  {"cdist_weighted_minkowski_wrap", cdist_weighted_minkowski_wrap, METH_VARARGS},
{"cdist_rogerstanimoto_bool_wrap", cdist_rogerstanimoto_bool_wrap, METH_VARARGS},
{"cdist_russellrao_bool_wrap", cdist_russellrao_bool_wrap, METH_VARARGS},
{"cdist_seuclidean_wrap", cdist_seuclidean_wrap, METH_VARARGS},
@@ -1069,6 +1118,7 @@
{"pdist_mahalanobis_wrap", pdist_mahalanobis_wrap, METH_VARARGS},
{"pdist_matching_bool_wrap", pdist_matching_bool_wrap, METH_VARARGS},
{"pdist_minkowski_wrap", pdist_minkowski_wrap, METH_VARARGS},
+  {"pdist_weighted_minkowski_wrap", pdist_weighted_minkowski_wrap, METH_VARARGS},
{"pdist_rogerstanimoto_bool_wrap", pdist_rogerstanimoto_bool_wrap, METH_VARARGS},
{"pdist_russellrao_bool_wrap", pdist_russellrao_bool_wrap, METH_VARARGS},
{"pdist_seuclidean_wrap", pdist_seuclidean_wrap, METH_VARARGS},

Modified: trunk/scipy/cluster/tests/test_distance.py
===================================================================
--- trunk/scipy/cluster/tests/test_distance.py	2008-09-30 07:00:07 UTC (rev 4757)
+++ trunk/scipy/cluster/tests/test_distance.py	2008-09-30 19:14:59 UTC (rev 4758)
@@ -233,6 +233,44 @@
print (Y1-Y2).max()
self.failUnless(within_tol(Y1, Y2, eps))

+
+    def test_cdist_wminkowski_random_p3d8(self):
+        "Tests cdist(X, 'wminkowski') on random data. (p=3.8)"
+        eps = 1e-07
+        # Get the data: the input matrix and the right output.
+        X1 = eo['cdist-X1']
+        X2 = eo['cdist-X2']
+        w = 1.0 / X1.std(axis=0)
+        Y1 = cdist(X1, X2, 'wminkowski', p=3.8, w=w)
+        Y2 = cdist(X1, X2, 'test_wminkowski', p=3.8, w=w)
+        print (Y1-Y2).max()
+        self.failUnless(within_tol(Y1, Y2, eps))
+
+    def test_cdist_wminkowski_random_p4d6(self):
+        "Tests cdist(X, 'wminkowski') on random data. (p=4.6)"
+        eps = 1e-07
+        # Get the data: the input matrix and the right output.
+        X1 = eo['cdist-X1']
+        X2 = eo['cdist-X2']
+        w = 1.0 / X1.std(axis=0)
+        Y1 = cdist(X1, X2, 'wminkowski', p=4.6, w=w)
+        Y2 = cdist(X1, X2, 'test_wminkowski', p=4.6, w=w)
+        print (Y1-Y2).max()
+        self.failUnless(within_tol(Y1, Y2, eps))
+
+    def test_cdist_wminkowski_random_p1d23(self):
+        "Tests cdist(X, 'wminkowski') on random data. (p=1.23)"
+        eps = 1e-07
+        # Get the data: the input matrix and the right output.
+        X1 = eo['cdist-X1']
+        X2 = eo['cdist-X2']
+        w = 1.0 / X1.std(axis=0)
+        Y1 = cdist(X1, X2, 'wminkowski', p=1.23, w=w)
+        Y2 = cdist(X1, X2, 'test_wminkowski', p=1.23, w=w)
+        print (Y1-Y2).max()
+        self.failUnless(within_tol(Y1, Y2, eps))
+
+
def test_cdist_seuclidean_random(self):
"Tests cdist(X, 'seuclidean') on random data."
eps = 1e-07