# [Scipy-svn] r2251 - in trunk/Lib: sandbox/stats stats

scipy-svn at scipy.org scipy-svn at scipy.org
Tue Oct 10 00:53:47 CDT 2006

```Author: oliphant
Date: 2006-10-10 00:53:33 -0500 (Tue, 10 Oct 2006)
New Revision: 2251

Modified:
trunk/Lib/sandbox/stats/anova.py
trunk/Lib/stats/distributions.py
trunk/Lib/stats/morestats.py
Log:
Fix sandbox/anova a bit.  Fix distributions gengamma and invgamma to use gammaln and exp to avoid precision loss.  Clean up bayes_mvs which returns mean, variance, and standard deviation with confidence intervals using Jeffrey's priors

Modified: trunk/Lib/sandbox/stats/anova.py
===================================================================
--- trunk/Lib/sandbox/stats/anova.py	2006-10-10 00:30:16 UTC (rev 2250)
+++ trunk/Lib/sandbox/stats/anova.py	2006-10-10 05:53:33 UTC (rev 2251)
@@ -2,6 +2,8 @@
# this module. No care has been taken to ensure that it works in its current
# state. At minimum, you will have to figure out what it needs to import to work.

+from numpy import sum
+
def anova(data, effects=['A','B','C','D','E','F','G','H','I','J','K']):
"""
Prints the results of single-variable between- and within-subject ANOVA
@@ -355,8 +357,8 @@
sourceNarray = apply_over_axes(hmean, Narray,btwnonsourcedims)

## Calc grand average (ga,axis=0), used for ALL effects
-            ga = sum((sourceMarray*sourceNarray)/
-                            sum(sourceNarray,axis=0),axis=0)
+            ga = sum((sourceMarray*sourceNarray)/ \
+                     sum(sourceNarray,axis=0),axis=0)
ga = reshape(ga,ones(len(Marray.shape)))

## If GRAND interaction, use harmonic mean of ALL cell Ns

Modified: trunk/Lib/stats/distributions.py
===================================================================
--- trunk/Lib/stats/distributions.py	2006-10-10 00:30:16 UTC (rev 2250)
+++ trunk/Lib/stats/distributions.py	2006-10-10 05:53:33 UTC (rev 2251)
@@ -18,6 +18,7 @@
import numpy
import numpy.random as mtrand
from numpy import flatnonzero as nonzero
+from scipy.special import gammaln as gamln

__all__ = [
'rv_continuous',
@@ -1707,7 +1708,7 @@
def _argcheck(self, a, c):
return (a > 0) & (c != 0)
def _pdf(self, x, a, c):
-        return abs(c)* x**(c*a-1) / special.gamma(a) * exp(-x**c)
+        return abs(c)* exp((c*a-1)*log(x)-x**c- special.gammaln(a))
def _cdf(self, x, a, c):
val = special.gammainc(a,x**c)
cond = c + 0*val
@@ -1973,13 +1974,13 @@

class invgamma_gen(rv_continuous):
def _pdf(self, x, a):
-        return x**(-a-1) / special.gamma(a) * exp(-1.0/x)
+        return exp(-(a+1)*log(x)-special.gammaln(a) - 1.0/x)
def _cdf(self, x, a):
return 1.0-special.gammainc(a, 1.0/x)
def _ppf(self, q, a):
return 1.0/special.gammaincinv(a,1-q)
def _munp(self, n, a):
-        return special.gamma(a-n) / special.gamma(a)
+        return exp(special.gammaln(a-n) - special.gammaln(a))
def _entropy(self, a):
return a - (a+1.0)*special.psi(a) + special.gammaln(a)
invgamma = invgamma_gen(a=0.0, name='invgamma',longname="An inverted gamma",
@@ -2232,7 +2233,7 @@
#
class loggamma_gen(rv_continuous):
def _pdf(self, x, c):
-        return exp(c*x-exp(x))/ special.gamma(c)
+        return exp(c*x-exp(x)-special.gammaln(c))
def _cdf(self, x, c):
return special.gammainc(c, exp(x))/ special.gamma(c)
def _ppf(self, q, c):
@@ -2446,20 +2447,21 @@
return mtrand.noncentral_f(dfn,dfd,nc,self._size)
def _pdf(self, x, dfn, dfd, nc):
n1,n2 = dfn, dfd
-        Px = exp(-nc/2+nc*n1*x/(2*(n2+n1*x)))
+        term = -nc/2+nc*n1*x/(2*(n2+n1*x)) + gamln(n1/2.)+gamln(1+n2/2.)
+        term -= gamln((n1+n2)/2.0)
+        Px = exp(term)
Px *= n1**(n1/2) * n2**(n2/2) * x**(n1/2-1)
Px *= (n2+n1*x)**(-(n1+n2)/2)
-        Px *= special.gamma(n1/2)*special.gamma(1+n2/2)
Px *= special.assoc_laguerre(-nc*n1*x/(2.0*(n2+n1*x)),n2/2,n1/2-1)
-        Px /= special.beta(n1/2,n2/2)*special.gamma((n1+n2)/2.0)
+        Px /= special.beta(n1/2,n2/2)
def _cdf(self, x, dfn, dfd, nc):
return special.ncfdtr(dfn,dfd,nc,x)
def _ppf(self, q, dfn, dfd, nc):
return special.ncfdtri(dfn, dfd, nc, q)
def _munp(self, n, dfn, dfd, nc):
val = (dfn *1.0/dfd)**n
-        val *= gam(n+0.5*dfn)*gam(0.5*dfd-n) / gam(dfd*0.5)
-        val *= exp(-nc / 2.0)
+        term = gamln(n+0.5*dfn) + gamln(0.5*dfd-n) - gamln(dfd*0.5)
+        val *= exp(-nc / 2.0+term)
val *= special.hyp1f1(n+0.5*dfn, 0.5*dfn, 0.5*nc)
return val
def _stats(self, dfn, dfd, nc):
@@ -2528,8 +2530,9 @@
x2 = x*x
ncx2 = nc*nc*x2
fac1 = n + x2
-        Px = n**(n/2) * special.gamma(n+1)
-        Px /= arr(2.0**n*exp(nc*nc/2)*fac1**(n/2)*special.gamma(n/2))
+        trm1 = n/2.*log(n) + gamln(n+1)
+        trm1 -= n*log(2)+nc*nc/2.+(n/2.)*log(fac1)+gamln(n/2.)
+        Px = exp(trm1)
valF = ncx2 / (2*fac1)
trm1 = sqrt(2)*nc*x*special.hyp1f1(n/2+1,1.5,valF)
trm1 /= arr(fac1*special.gamma((n+1)/2))

Modified: trunk/Lib/stats/morestats.py
===================================================================
--- trunk/Lib/stats/morestats.py	2006-10-10 00:30:16 UTC (rev 2250)
+++ trunk/Lib/stats/morestats.py	2006-10-10 05:53:33 UTC (rev 2251)
@@ -40,6 +40,32 @@
###  Bayesian confidence intervals for mean, variance, std
##########################################################

+# assume distributions are gaussian with given means and variances.
+def _gauss_mvs(x, n, alpha):
+    xbar = x.mean()
+    C = x.var()
+    val = distributions.norm.ppf((1+alpha)/2.0)
+    # mean is a Gaussian with mean xbar and variance C/n
+    mp = xbar
+    fac0 = sqrt(C/n)
+    term = fac0*val
+    ma = mp - term
+    mb = mp + term
+    # var is a Gaussian with mean C and variance 2*C*C/n
+    vp = C
+    fac1 = sqrt(2.0/n)*C
+    term = fac1*val
+    va = vp - term
+    vb = vp + term
+    # std is a Gaussian with mean sqrt(C) and variance C/(2*n)
+    st = sqrt(C)
+    fac2 = sqrt(0.5)*fac0
+    term = fac2*val
+    sta = st - term
+    stb = st + term
+    return mp, (ma, mb), vp, (va, vb), st, (sta, stb)
+
+
##  Assumes all is known is that mean, and std (variance,axis=0) exist
##   and are the same for all the data.  Uses Jeffrey's prior
##
@@ -51,12 +77,14 @@

Assumes 1-d data all has same mean and variance and uses Jeffrey's prior
for variance and std.
-    alpha gives the probability that the returned interval contains
+
+    alpha gives the probability that the returned confidence interval contains
the true parameter.

-    Uses peak of conditional pdf as starting center.
+    Uses mean of conditional pdf as center estimate
+    (but centers confidence interval on the median)

-    Returns (peak, (a, b)) for each of mean, variance and standard deviation.
+    Returns (center, (a, b)) for each of mean, variance and standard deviation.
Requires 2 or more data-points.
"""
x = ravel(data)
@@ -64,48 +92,43 @@
assert(n > 1)
assert(alpha < 1 and alpha > 0)
n = float(n)
-    C = sb.add.reduce(x*x)/n - xbar*xbar
-    #
+    if (n > 1000): # just a guess.  The curves look similar at this point.
+        return _gauss_mvs(x, n, alpha)
+    xbar = x.mean()
+    C = x.var()
+    # mean
fac = sqrt(C/(n-1))
tval = distributions.t.ppf((1+alpha)/2.0,n-1)
delta = fac*tval
ma = xbar - delta
mb = xbar + delta
mp = xbar
-    #
+    # var
fac = n*C/2.0
-    a = (n-1)/2.0
-    if (n > 3): # use mean of distribution as center
-        peak = 2/(n-3.0)
-        F_peak = distributions.invgamma.cdf(peak,a)
-    else: # use median
-        F_peak = -1.0
-    if (F_peak < alpha/2.0):
+    a = (n-1)/2
+    if (n < 4):
peak = distributions.invgamma.ppf(0.5,a)
-        F_peak = 0.5
-    q1 = F_peak - alpha/2.0
-    q2 = F_peak + alpha/2.0
-    if (q2 > 1): q2 = 1.0
+    else:
+        peak = 2.0/(n-3.0)
+    q1 = (1-alpha)/2.0
+    q2 = (1+alpha)/2.0
va = fac*distributions.invgamma.ppf(q1,a)
vb = fac*distributions.invgamma.ppf(q2,a)
vp = peak*fac
-    #
+    # std
fac = sqrt(fac)
-    if (n > 2):
-        peak = special.gamma(a-0.5) / special.gamma(a)
-        F_peak = distributions.gengamma.cdf(peak,a,-2)
-    else: # use median
-        F_peak = -1.0
-    if (F_peak < alpha/2.0):
+    if (n < 3):
peak = distributions.gengamma.ppf(0.5,a,-2)
-        F_peak = 0.5
-    q1 = F_peak - alpha/2.0
-    q2 = F_peak + alpha/2.0
-    if (q2 > 1): q2 = 1.0
+        stp = fac*peak
+    else:
+        ndiv2 = (n-1)/2.0
+        term = special.gammaln(ndiv2-0.5)-special.gammaln(ndiv2)
+        term += (log(n)+log(C)-log(2.0))*0.5
+        stp = exp(term)
+    q1 = (1-alpha)/2.0
+    q2 = (1+alpha)/2.0
sta = fac*distributions.gengamma.ppf(q1,a,-2)
stb = fac*distributions.gengamma.ppf(q2,a,-2)
-    stp = peak*fac

return (mp,(ma,mb)),(vp,(va,vb)),(stp,(sta,stb))

```