[SciPy-user] help with scipy.stats.mannwhitneyu

Sturla Molden sturla@molden...
Thu Feb 5 13:51:02 CST 2009


On 2/5/2009 7:03 PM, Sturla Molden wrote:

> By the way, there is a fucntion scipy.stats.ranksums that does a 
> Wilcoxon rank-sum test. It seems to be using a large-sample 
> approximation, and has no correction for tied ranks.


Here is a modification of SciPy's ranksums to allow small samples and 
correct for tied ranks.

Sturla Molden



import numpy as np

import scipy
import scipy.special
zprob = scipy.special.ndtr


def ranksums(x, y):
     """
     Wilcoxon rank sum test

     Returns:
	W-statistic
	Z-statistic
	one-tailed p-value, asymptotic approximation
	one-tailed p-value, Monte Carlo approximation

     Corrected for ties.
     """

     x,y = map(np.asarray, (x, y))
     n1 = len(x)
     n2 = len(y)
     alldata = np.concatenate((x,y))
     ranked = rankdata(alldata)
     x = ranked[:n1]
     y = ranked[n1:]
     w = np.sum(x,axis=0)

     def montecarlo():
         shuffle = np.random.shuffle
         a = np.zeros(1000)
         shuffle(ranked) # bug in numpy: the first shuffle doesn't work
         for i in xrange(1000):
             shuffle(ranked)
             a[i] = np.sum(ranked[:n1],axis=0)
         return np.sum(a >= w) / 1000.0

     def aymptotic_p():
         expected = n1*(n1+n2+1) / 2.0
         z = (w - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
         return 1.0 - zprob(z), z

     def aymptotic_p_ties():
         t = []
         _t = 0
         for r in ranked:
             if r % 1:
                 _t += 1
             else:
                 if _t:
                     t.append(_t)
                     _t = 0
         if _t: t.append(_t)
         t = np.asarray(t)
         expected = n1*(n1+n2+1) / 2.0
         tcorr = np.sum((t-1)*t*(t+1))/float((n1+n2)*(n1+n2-1))
         z = (w - expected) / np.sqrt(n1*n2*(n1+n2+1-tcorr)/12.0)
         return 1.0 - zprob(z), z

     p_mc = montecarlo()
     if np.any(ranked % 1):
         p, z = aymptotic_p_ties()
     else:
         p, z = aymptotic_p()
     return w, z, p, p_mc



def rankdata(a):
     a = np.ravel(a)
     n = len(a)
     svec, ivec = fastsort(a)
     sumranks = 0
     dupcount = 0
     newarray = np.zeros(n, float)
     for i in xrange(n):
         sumranks += i
         dupcount += 1
         if i==n-1 or svec[i] != svec[i+1]:
             averank = sumranks / float(dupcount) + 1
             for j in xrange(i-dupcount+1,i+1):
                 newarray[ivec[j]] = averank
             sumranks = 0
             dupcount = 0
     return newarray



def fastsort(a):
     it = np.argsort(a)
     as_ = a[it]
     return as_, it



More information about the SciPy-user mailing list