Mon Mar 9 19:18:16 CDT 2009

```Here is what I have based on pearsonr in scipy.stats:

def sparse_vector_dot(x, y):
    """Calculate the dot product of two sparse vectors.

    Parameters
    ----------
    x : sparse matrix
    y : sparse matrix with the same shape as x

    Returns
    -------
    The single entry of ``x.T * y``.

    Notes
    -----
    ``x.T * y`` only collapses to one value when both inputs are column
    vectors (shape ``(n, 1)``) -- presumably the intended layout; confirm
    against callers.
    """
    product = x.T * y
    # Index the 1x1 result instead of peeking at ``product.data[0]``: when
    # the vectors share no nonzero position the product stores no entries
    # at all, so ``.data`` is empty and ``.data[0]`` raises IndexError even
    # though the dot product is simply zero.
    return product[0, 0]

def sparse_pearsonr(x, y):
    """Calculate a Pearson correlation coefficient and the p-value for
    testing non-correlation, using two sparse vectors as inputs.

    Only the stored (nonzero) entries of each vector take part in the mean
    and in the centring step; the sample size is the number of positions
    that are stored in ``x + y``.

    Parameters
    ----------
    x : 1D sparse array
    y : 1D sparse array the same length as x

    Neither input is modified.

    Returns
    -------
    (Pearson's correlation coefficient,
     2-tailed p-value)

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    # The stored entries of z are the union of the stored entries of x and
    # y, so its count is the number of observations used for the test.
    z = x + y
    n = z.getnnz()

    mx = x.data.mean()
    my = y.data.mean()

    # We only want to subtract the mean from the stored values, so work on
    # the ``.data`` arrays directly.  Centre *copies*: a plain
    # ``xm, ym = x, y`` merely aliases the inputs, and assigning ``.data``
    # would then overwrite the caller's vectors.
    xm = x.copy()
    ym = y.copy()
    xm.data = x.data - mx
    ym.data = y.data - my

    # The factor ``n`` that used to multiply both numerator and denominator
    # cancels exactly, so it is left out.
    r_num = sparse_vector_dot(xm, ym)
    r_den = sqrt(sparse_vector_dot(xm, xm) * sparse_vector_dot(ym, ym))
    r = r_num / r_den

    # Presumably, if |r| > 1, then it is only some small artifact of
    # floating point arithmetic.  Clamp both ends: a value just below -1
    # would otherwise make the sqrt argument below negative.
    r = max(min(r, 1.0), -1.0)
    df = n - 2

    # Use a small floating point value to prevent divide-by-zero nonsense
    # fixme: TINY is probably not the right value and this is probably not
    # the way to be robust. The scheme used in spearmanr is probably better.
    TINY = 1.0e-20
    t = r * sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)))
    prob = betai(0.5 * df, 0.5, df / (df + t * t))
    return r, prob

