[Scipy-svn] r6957 - in trunk/scipy/stats: . tests

scipy-svn@scip... scipy-svn@scip...
Sun Nov 28 07:33:06 CST 2010


Author: rgommers
Date: 2010-11-28 07:33:06 -0600 (Sun, 28 Nov 2010)
New Revision: 6957

Modified:
   trunk/scipy/stats/stats.py
   trunk/scipy/stats/tests/test_stats.py
Log:
ENH: new faster implementation of stats.kendalltau.

The new tests pass with both the old and new implementations. They do not agree
with the version in mstats though - the tie handling there seems different. For
(long) discussion on kendalltau, see scipy-dev around Mar 2009.

Thanks to Enzo Michelangeli for original implementation, David Simcha and
Sturla Molden for alternate implementations, and Josef for extensive testing.

This closes tickets #999 and #893.

Modified: trunk/scipy/stats/stats.py
===================================================================
--- trunk/scipy/stats/stats.py	2010-11-28 13:31:43 UTC (rev 6956)
+++ trunk/scipy/stats/stats.py	2010-11-28 13:33:06 UTC (rev 6957)
@@ -2521,9 +2521,9 @@
     return rpb, prob
 
 
-def kendalltau(x, y):
+def kendalltau(x, y, initial_lexsort=True):
     """
-    Calculates Kendall's tau, a correlation measure for ordinal data
+    Calculates Kendall's tau, a correlation measure for ordinal data.
 
     Kendall's tau is a measure of the correspondence between two rankings.
     Values close to 1 indicate strong agreement, values close to -1 indicate
@@ -2532,44 +2532,147 @@
 
     Parameters
     ----------
-    x : array_like
-        array of rankings
-    y : array_like
-        second array of rankings, must be the same length as x
+    x, y : array_like
+        Arrays of rankings, of the same shape. If arrays are not 1-D, they will
+        be flattened to 1-D.
+    initial_lexsort : bool, optional
+        Whether to use lexsort or quicksort as the sorting method for the
+        initial sort of the inputs. Default is lexsort (True), for which
+        `kendalltau` is of complexity O(n log(n)). If False, the complexity is
+        O(n^2), but with a smaller pre-factor (so quicksort may be faster for
+        small arrays).
 
     Returns
     -------
     Kendall's tau : float
-       The tau statistic
+       The tau statistic.
     p-value : float
        The two-sided p-value for a hypothesis test whose null hypothesis is
        an absence of association, tau = 0.
 
+    References
+    ----------
+    W.R. Knight, "A Computer Method for Calculating Kendall's Tau with
+    Ungrouped Data", Journal of the American Statistical Association, Vol. 61,
+    No. 314, Part 1, pp. 436-439, 1966.
+
+    Notes
+    -----
+    The definition of Kendall's tau that is used is::
+
+      tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
+
+    where P is the number of concordant pairs, Q the number of discordant
+    pairs, T the number of ties only in `x`, and U the number of ties only in
+    `y`.  If a tie occurs for the same pair in both `x` and `y`, it is not added
+    to either T or U.
+
+    Examples
+    --------
+    >>> x1 = [12, 2, 1, 12, 2]
+    >>> x2 = [1, 4, 7, 1, 0]
+    >>> tau, p_value = sp.stats.kendalltau(x1, x2)
+    >>> tau
+    -0.47140452079103173
+    >>> p_value
+    0.24821309157521476
+
     """
-    n1 = 0
-    n2 = 0
-    iss = 0
-    for j in range(len(x)-1):
-        for k in range(j+1,len(y)):
-            a1 = x[j] - x[k]
-            a2 = y[j] - y[k]
-            aa = a1 * a2
-            if (aa):             # neither array has a tie
-                n1 = n1 + 1
-                n2 = n2 + 1
-                if aa > 0:
-                    iss = iss + 1
-                else:
-                    iss = iss -1
+
+    x = np.asarray(x).ravel()
+    y = np.asarray(y).ravel()
+    n = len(x)
+    temp = range(n) # support structure used by mergesort
+    # this closure recursively sorts sections of perm[] by comparing
+    # elements of y[perm[]] using temp[] as support
+    # returns the number of swaps required by an equivalent bubble sort
+    def mergesort(offs, length):
+        exchcnt = 0
+        if length == 1:
+            return 0
+        if length == 2:
+            if y[perm[offs]] <= y[perm[offs+1]]:
+                return 0
+            t = perm[offs]
+            perm[offs] = perm[offs+1]
+            perm[offs+1] = t
+            return 1
+        length0 = length / 2
+        length1 = length - length0
+        middle = offs + length0
+        exchcnt += mergesort(offs, length0)
+        exchcnt += mergesort(middle, length1)
+        if y[perm[middle - 1]] < y[perm[middle]]:
+            return exchcnt
+        # merging
+        i = j = k = 0
+        while j < length0 or k < length1:
+            if k >= length1 or (j < length0 and y[perm[offs + j]] <=
+                                                y[perm[middle + k]]):
+                temp[i] = perm[offs + j]
+                d = i - j
+                j += 1
             else:
-                if a1:
-                    n1 = n1 + 1
-                if a2:
-                    n2 = n2 + 1
-    tau = iss / np.sqrt(float(n1*n2))
-    svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
+                temp[i] = perm[middle + k]
+                d = (offs + i) - (middle + k)
+                k += 1
+            if d > 0:
+                exchcnt += d;
+            i += 1
+        perm[offs:offs+length] = temp[0:length]
+        return exchcnt
+
+    # initial sort on values of x and, if tied, on values of y
+    if initial_lexsort:
+        # sort implemented as mergesort, worst case: O(n log(n))
+        perm = np.lexsort((y, x))
+    else:
+        # sort implemented as quicksort, 30% faster but with worst case: O(n^2)
+        perm = range(n)
+        perm.sort(lambda a,b: cmp(x[a],x[b]) or cmp(y[a],y[b]))
+
+    # compute joint ties
+    first = 0
+    t = 0
+    for i in xrange(1, n):
+        if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]:
+            t += ((i - first) * (i - first - 1)) / 2
+            first = i
+    t += ((n - first) * (n - first - 1)) / 2
+
+    # compute ties in x
+    first = 0
+    u = 0
+    for i in xrange(1,n):
+        if x[perm[first]] != x[perm[i]]:
+            u += ((i - first) * (i - first - 1)) / 2
+            first = i
+    u += ((n - first) * (n - first - 1)) / 2
+
+    # count exchanges
+    exchanges = mergesort(0, n)
+    # compute ties in y after mergesort with counting
+    first = 0
+    v = 0
+    for i in xrange(1,n):
+        if y[perm[first]] != y[perm[i]]:
+            v += ((i - first) * (i - first - 1)) / 2
+            first = i
+    v += ((n - first) * (n - first - 1)) / 2
+
+    tot = (n * (n - 1)) / 2
+    if tot == u and tot == v:
+        return 1    # Special case for all ties in both ranks
+
+    tau = ((tot - (v + u - t)) - 2.0 * exchanges) / \
+                    np.sqrt((tot - u) * (tot - v))
+
+    # what follows reproduces the ending of Gary Strangman's original
+    # stats.kendalltau() in SciPy
+    svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1))
     z = tau / np.sqrt(svar)
-    prob = special.erfc(abs(z)/1.4142136)
+    prob = special.erfc(np.abs(z) / 1.4142136)
+
     return tau, prob
 
 

Modified: trunk/scipy/stats/tests/test_stats.py
===================================================================
--- trunk/scipy/stats/tests/test_stats.py	2010-11-28 13:31:43 UTC (rev 6956)
+++ trunk/scipy/stats/tests/test_stats.py	2010-11-28 13:33:06 UTC (rev 6957)
@@ -529,12 +529,43 @@
 ##    should appear on the diagonal and the total should be 899999955.
 ##    If the table cannot hold these values, forget about working with
 ##    census data.  You can also tabulate HUGE against TINY.  There is no
-##    reason a tabulation program should not be able to digtinguish
+##    reason a tabulation program should not be able to distinguish
 ##    different values regardless of their magnitude.
 
 ### I need to figure out how to do this one.
 
 
+def test_kendalltau():
+    """Some tests for kendalltau."""
+
+    # with some ties
+    x1 = [12, 2, 1, 12, 2]
+    x2 = [1, 4, 7, 1, 0]
+    res = (-0.47140452079103173, 0.24821309157521476)
+    expected = stats.kendalltau(x1, x2)
+    assert_approx_equal(res[0], expected[0])
+    assert_approx_equal(res[1], expected[1])
+
+    # check two different sort methods
+    assert_approx_equal(stats.kendalltau(x1, x2, initial_lexsort=False)[1],
+                        stats.kendalltau(x1, x2, initial_lexsort=True)[1])
+
+    # and with larger arrays
+    np.random.seed(7546)
+    x = np.array([np.random.normal(loc=1, scale=1, size=500),
+                np.random.normal(loc=1, scale=1, size=500)])
+    corr = [[1.0, 0.3],
+            [0.3, 1.0]]
+    x = np.dot(np.linalg.cholesky(corr), x)
+    expected = (0.19291382765531062, 1.1337108207276285e-10)
+    res = stats.kendalltau(x[0], x[1])
+    assert_approx_equal(res[0], expected[0])
+    assert_approx_equal(res[1], expected[1])
+
+    # and do we get a tau of 1 for identical inputs?
+    assert_approx_equal(stats.kendalltau([1,1,2], [1,1,2])[0], 1.0)
+
+
 class TestRegression(TestCase):
     def test_linregressBIGX(self):
         """ W.II.F.  Regress BIG on X.



More information about the Scipy-svn mailing list