# [Scipy-svn] r6947 - in trunk/scipy/spatial: . tests

scipy-svn@scip... scipy-svn@scip...
Sat Nov 27 01:14:48 CST 2010

```Author: warren.weckesser
Date: 2010-11-27 01:14:47 -0600 (Sat, 27 Nov 2010)
New Revision: 6947

Modified:
trunk/scipy/spatial/distance.py
trunk/scipy/spatial/tests/test_distance.py
Log:
BUG: spatial: (ticket #1146) pdist didn't handle the weighted minkowski function correctly (the 'w' argument was not defined); also fixed and simplified other code.  Thanks to Teemu Ikonen for some of the patches.

Modified: trunk/scipy/spatial/distance.py
===================================================================
--- trunk/scipy/spatial/distance.py	2010-11-27 03:36:50 UTC (rev 6946)
+++ trunk/scipy/spatial/distance.py	2010-11-27 07:14:47 UTC (rev 6947)
@@ -110,10 +110,12 @@

"""

+import warnings
import numpy as np
+
import _distance_wrap
-import types

+
def _copy_array_if_base_present(a):
"""
Copies the array if its base points to a parent array.
@@ -121,7 +123,7 @@
if a.base is not None:
return a.copy()
elif np.issubsctype(a, np.float32):
-        return array(a, dtype=np.double)
+        return np.array(a, dtype=np.double)
else:
return a

@@ -201,6 +203,7 @@
"""
u = np.asarray(u, order='c')
v = np.asarray(v, order='c')
+    w = np.asarray(w)
if p < 1:
raise ValueError("p must be at least 1")
return ((w * abs(u-v))**p).sum() ** (1.0 / p)
@@ -570,10 +573,8 @@
if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double:
not_u = 1.0 - u
not_v = 1.0 - v
-        nff = (not_u * not_v).sum()
nft = (not_u * v).sum()
ntf = (u * not_v).sum()
-        ntt = (u * v).sum()
else:
not_u = ~u
not_v = ~v
@@ -806,7 +807,7 @@
return float(2.0 * (ntf + nft)) / denom

-def pdist(X, metric='euclidean', p=2, V=None, VI=None):
+def pdist(X, metric='euclidean', p=2, w=None, V=None, VI=None):
r"""
Computes the pairwise distances between m original observations in
n-dimensional space. Returns a condensed distance matrix Y.  For
@@ -1038,50 +1039,39 @@

X = np.asarray(X, order='c')

-    #if np.issubsctype(X, np.floating) and not np.issubsctype(X, np.double):
-    #    raise TypeError('Floating point arrays must be 64-bit (got %r).' %
-    #    (X.dtype.type,))
-
# The C code doesn't do striding.
[X] = _copy_arrays_if_base_present([_convert_to_double(X)])

s = X.shape
-
if len(s) != 2:
raise ValueError('A 2-dimensional array must be passed.');

-    m = s[0]
-    n = s[1]
+    m, n = s
dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)

+    wmink_names = ['wminkowski', 'wmi', 'wm', 'wpnorm']
+    if w is None and (metric == wminkowski or metric in wmink_names):
+        raise ValueError('weighted minkowski requires a weight '
+                            'vector `w` to be given.')
+
if callable(metric):
-        k = 0
if metric == minkowski:
-            for i in xrange(0, m - 1):
-                for j in xrange(i+1, m):
-                    dm[k] = minkowski(X[i, :], X[j, :], p)
-                    k = k + 1
+            def dfun(u,v): return minkowski(u, v, p)
elif metric == wminkowski:
-            for i in xrange(0, m - 1):
-                for j in xrange(i+1, m):
-                    dm[k] = wminkowski(X[i, :], X[j, :], p, w)
-                    k = k + 1
+            def dfun(u,v): return wminkowski(u, v, p, w)
elif metric == seuclidean:
-            for i in xrange(0, m - 1):
-                for j in xrange(i+1, m):
-                    dm[k] = seuclidean(X[i, :], X[j, :], V)
-                    k = k + 1
+            def dfun(u,v): return seuclidean(u, v, V)
elif metric == mahalanobis:
-            for i in xrange(0, m - 1):
-                for j in xrange(i+1, m):
-                    dm[k] = mahalanobis(X[i, :], X[j, :], V)
-                    k = k + 1
+            def dfun(u,v): return mahalanobis(u, v, V)
else:
-            for i in xrange(0, m - 1):
-                for j in xrange(i+1, m):
-                    dm[k] = metric(X[i, :], X[j, :])
-                    k = k + 1
+            dfun = metric

+        k = 0
+        for i in xrange(0, m - 1):
+            for j in xrange(i+1, m):
+                dm[k] = dfun(X[i], X[j])
+                k = k + 1
+
elif isinstance(metric,basestring):
mstr = metric.lower()

@@ -1109,9 +1099,9 @@
_distance_wrap.pdist_chebyshev_wrap(_convert_to_double(X), dm)
elif mstr in set(['minkowski', 'mi', 'm']):
_distance_wrap.pdist_minkowski_wrap(_convert_to_double(X), dm, p)
-        elif mstr in set(['wminkowski', 'wmi', 'wm', 'wpnorm']):
-            _distance_wrap.cdist_weighted_minkowski_wrap(_convert_to_double(X),
-                                                         dm, p, w)
+        elif mstr in wmink_names:
+            _distance_wrap.pdist_weighted_minkowski_wrap(_convert_to_double(X),
+                                                         dm, p, np.asarray(w))
elif mstr in set(['seuclidean', 'se', 's']):
if V is not None:
V = np.asarray(V, order='c')
@@ -1122,7 +1112,9 @@
if len(V.shape) != 1:
raise ValueError('Variance vector V must be one-dimensional.')
if V.shape[0] != n:
-                    raise ValueError('Variance vector V must be of the same dimension as the vectors on which the distances are computed.')
+                    raise ValueError('Variance vector V must be of the same '
+                            'dimension as the vectors on which the distances '
+                            'are computed.')
# The C code doesn't do striding.
[VV] = _copy_arrays_if_base_present([_convert_to_double(V)])
else:
@@ -1432,7 +1424,7 @@
if throw:
raise
if warning:
-            _warning(str(e))
+            warnings.warn(str(e))
valid = False
return valid

@@ -1492,7 +1484,7 @@
if throw:
raise
if warning:
-            _warning(str(e))
+            warnings.warn(str(e))
valid = False
return valid

Modified: trunk/scipy/spatial/tests/test_distance.py
===================================================================
--- trunk/scipy/spatial/tests/test_distance.py	2010-11-27 03:36:50 UTC (rev 6946)
+++ trunk/scipy/spatial/tests/test_distance.py	2010-11-27 07:14:47 UTC (rev 6947)
@@ -38,11 +38,11 @@

import numpy as np
from numpy.testing import verbose, TestCase, run_module_suite, \
-        assert_raises
+        assert_raises, assert_array_equal
from scipy.spatial.distance import squareform, pdist, cdist, matching, \
jaccard, dice, sokalsneath, rogerstanimoto, \
russellrao, yule, num_obs_y, num_obs_dm, \
-                                   is_valid_dm, is_valid_y
+                                   is_valid_dm, is_valid_y, wminkowski

_filenames = ["iris.txt",
"cdist-X1.txt",
@@ -877,6 +877,32 @@
Y_test2 = pdist(X, 'test_minkowski', 5.8)
self.assertTrue(within_tol(Y_test2, Y_right, eps))

+    ################# wminkowski
+
+    def test_pdist_wminkowski(self):
+        x = np.array([[0.0, 0.0, 0.0],
+                      [1.0, 0.0, 0.0],
+                      [0.0, 1.0, 0.0],
+                      [1.0, 1.0, 1.0]])
+
+        p2_expected = [1.0, 1.0, np.sqrt(3),
+                       np.sqrt(2), np.sqrt(2),
+                       np.sqrt(2)]
+        p1_expected = [0.5, 1.0, 3.5,
+                       1.5, 3.0,
+                       2.5]
+        dist = pdist(x, metric=wminkowski, w=[1.0, 1.0, 1.0])
+        assert_array_equal(dist, p2_expected)
+
+        dist = pdist(x, metric=wminkowski, w=[0.5, 1.0, 2.0], p=1)
+        assert_array_equal(dist, p1_expected)
+
+        dist = pdist(x, metric='wminkowski', w=[1.0, 1.0, 1.0])
+        assert_array_equal(dist, p2_expected)
+
+        dist = pdist(x, metric='wminkowski', w=[0.5, 1.0, 2.0], p=1)
+        assert_array_equal(dist, p1_expected)
+
################### pdist: hamming
def test_pdist_hamming_random(self):
"Tests pdist(X, 'hamming') on random data."

```