[SciPy-user] improving ML estimation of distributions - example

josef.pktd@gmai...
Mon Feb 16 22:49:14 CST 2009


I was trying out some improvements to the maximum likelihood
estimation of distribution parameters with fit:

* allow some parameters to be fixed, especially the location for
distributions with a finite upper or lower bound
* get automatic "meaningful" starting values from the data instead of
the defaults (1, 1, 1, ...)
* define the log-likelihood directly when a shortcut is available,
instead of computing np.log(pdf)

I tried the changes for the gamma distribution, which has a lower
bound of zero when the location is at its default of zero.

Here are some Monte Carlo results:
* about 3 times faster with fixed location
* mean squared error about half that of the current version for
sample sizes around 100 to 200

time stats.gamma 41.375
time   new gamma 13.1560001373
 sample size = 200,  number of iterations = 500

         with fixed location        with estimated location
          shape       scale       shape       location    scale
bias   [ 0.0371193  -0.10947458 -0.08416937  0.94610597  0.77749813]
errvar [ 0.06138217  4.47976327  0.19587426  6.46472802  8.44292793]
mse    [ 0.06276001  4.49174795  0.20295874  7.35984453  9.04743127]
maxabs [ 1.31184619   7.43273778   2.0578926   12.45965912   8.23351093]
mad    [ 0.19163682  1.65387385  0.35096994  2.13576654  2.36227371]


The data, rvsg, were generated with parameters [2.5, 0.0, 20]

>>> stats.gamma.fit(rvsg)         # unconstrained current stats version
array([  2.41650867,   1.25529111,  20.06830571])
>>> gamma.fit_fr(rvsg, 1)          # unconstrained new version
array([  2.41650867,   1.25529111,  20.06830571])
>>> gamma.fit_fr(rvsg)              # unconstrained new version with method of moments starting values
array([  2.41650354,   1.25536201,  20.06831229])


>>> gamma.fit_fr(rvsg, frozen=[np.nan,0.0,np.nan])     # fix location at 0
array([  2.60144762,  19.12416175])
>>> gamma.fit_fr(rvsg, frozen=[2.5,0.0,np.nan])   # fix shape and location,  estimate scale
array([ 19.90018213])
>>> gamma.fit_fr(rvsg, frozen=[2.5,0.0,20])    # fix all parameters
array([], dtype=float64)
>>> gamma.fit_fr(rvsg, frozen=[np.nan,0.0,20])   # fix location and scale, estimate shape
array([ 2.5077181])

Given the constrained estimates, we can compare the log-likelihood
values at the constrained and unconstrained ML estimates. This can be
used to construct likelihood ratio tests:

>>> gamma.nnlf([2.5,0.0,20], rvsg)   # neg. log-likelihood at true values
941.50470025907089
>>> gamma.nnlf([  2.41650867,   1.25529111,  20.06830571], rvsg)   # neg. log-likelihood at unconstrained estimates
941.30423824721584
>>> gamma.nnlf([  2.60144762, 0.0,  19.12416175], rvsg)  # neg. log-likelihood at constrained estimates
941.41052750929691
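
For illustration, a likelihood ratio test of the restriction loc = 0
could be built from these numbers. This is only a rough sketch; the
chi-square approximation with one degree of freedom assumes the usual
regularity conditions hold:

from scipy import stats
lr = 2 * (941.41052750929691 - 941.30423824721584)  # 2*(nnlf_constrained - nnlf_unconstrained)
pval = stats.chi2.sf(lr, 1)   # one restriction: location fixed at 0

The p-value is large (well above any conventional level), so the
restriction is not rejected, as expected since the data were generated
with location zero.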

I picked this format for the frozen mask mostly because it makes it
easy and fast to switch between the reduced and the full parameter
vectors:

# select the parameters that need to be estimated:
x0  = np.array(x0)[np.isnan(frmask)]

# insert the reduced set of estimated parameters into the full parameter vector
theta = frmask.copy()
theta[np.isnan(frmask)] = thetashort
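
A small self-contained round trip (with made-up numbers) shows how the
mask works:

import numpy as np

frmask = np.array([np.nan, 0.0, np.nan])  # estimate shape and scale, freeze loc at 0
x0full = np.array([2.0, 0.0, 15.0])       # full starting vector (shape, loc, scale)
x0 = x0full[np.isnan(frmask)]             # reduced vector for the optimizer: [2., 15.]

thetashort = np.array([2.6, 19.1])        # say, values returned by the optimizer
theta = frmask.copy()
theta[np.isnan(frmask)] = thetashort      # back to the full vector: [2.6, 0.0, 19.1]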

I would appreciate comments, especially if someone has an opinion
about the API of the added functionality. These changes to the fit
method will be backward compatible (except when different starting
values result in different estimates).

Adding starting values and a log-likelihood function to each
individual distribution is work and will take some time, but adding
partially frozen parameters to the generic fit method is relatively
simple.

My working file with the gamma distribution is attached.

Josef
-------------- next part --------------

import numpy as np
from scipy import stats, special, optimize
from scipy.stats import distributions


class gamma_gen(distributions.rv_continuous):
    def _rvs(self, a):
        return np.random.standard_gamma(a, self._size)
    def _pdf(self, x, a):
        return x**(a-1)*np.exp(-x)/special.gamma(a)
    def _loglike(self, x, a):
        return (a-1)*np.log(x) - x - special.gammaln(a)
    def _cdf(self, x, a):
        return special.gammainc(a, x)
    def _ppf(self, q, a):
        return special.gammaincinv(a,q)
    def _stats(self, a):
        return a, a, 2.0/np.sqrt(a), 6.0/a
    def _entropy(self, a):
        return special.psi(a)*(1-a) + 1 + special.gammaln(a)

    def _nnlf_(self, x, *args):   # inherited version for comparison
        return -np.sum(np.log(self._pdf(x, *args)),axis=0)

    def _nnlf(self, x, *args):   # overwrite in subclass to use _loglike directly
        return -np.sum(self._loglike(x, *args),axis=0)
    
    def _fitstart(self, x):
        # method of moments estimators as starting values (not verified
        # against the literature):
        # the skewness of the gamma is 2/sqrt(a), so a = 4/skew**2;
        # the variance is a*scale**2, so scale = std/sqrt(a)
        loc = np.min([x.min(),0])
        a = 4/stats.skew(x)**2
        scale = np.std(x) / np.sqrt(a)
        return (a, loc, scale)

    def nnlf_fr(self, thetash, x, frmask):
        # new frozen version
        # - sum (log pdf(x, theta),axis=0)
        #   where theta are the parameters (including loc and scale)
        #
        try:
            if frmask is not None:
                theta = frmask.copy()
                theta[np.isnan(frmask)] = thetash
            else:
                theta = thetash
            loc = theta[-2]
            scale = theta[-1]
            args = tuple(theta[:-2])
        except IndexError:
            raise ValueError, "Not enough input arguments."
        if not self._argcheck(*args) or scale <= 0:
            return np.inf
        x = np.array((x-loc) / scale)
        cond0 = (x <= self.a) | (x >= self.b)
        if (np.any(cond0)):
            return np.inf
        else:
            N = len(x)
            return self._nnlf(x, *args) + N*np.log(scale)

    def fit_fr(self, data, *args, **kwds):
        loc0, scale0 = map(kwds.get, ['loc', 'scale'],[0.0, 1.0])
        Narg = len(args)
        
        if Narg == 0 and hasattr(self, '_fitstart'):
            x0 = self._fitstart(data)
        elif Narg > self.numargs:
            raise ValueError, "Too many input arguments."
        else:
            args += (1.0,)*(self.numargs-Narg)
            # location and scale are at the end
            x0 = args + (loc0, scale0)
        
        if 'frozen' in kwds:
            frmask = np.array(kwds['frozen'])
            if len(frmask) != self.numargs+2:
                raise ValueError, "Incorrect number of frozen arguments."
            else:
                # keep starting values for not frozen parameters
                x0  = np.array(x0)[np.isnan(frmask)]
        else:
            frmask = None
            
        return optimize.fmin(self.nnlf_fr, x0,
                    args=(np.ravel(data), frmask), disp=0)

    
gamma = gamma_gen(a=0.0,name='gamma',longname='A gamma',
                  shapes='a',extradoc="""

Gamma distribution

For a = integer, this is the Erlang distribution, and for a=1 it is the
exponential distribution.

gamma.pdf(x,a) = x**(a-1)*exp(-x)/gamma(a)
for x >= 0, a > 0.
"""
                  )
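
# quick sanity check (illustrative, with arbitrary values): the direct
# _loglike shortcut should agree with np.log of the pdf
xcheck = np.array([0.5, 1.0, 5.0])
assert np.allclose(gamma._loglike(xcheck, 2.5), np.log(gamma._pdf(xcheck, 2.5)))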


rvsg = stats.gamma.rvs(2.5,scale=20,size=1000)
print rvsg.min()

print gamma.fit(rvsg)
print gamma.fit_fr(rvsg, frozen=[np.nan,0.0,np.nan])


import time

niter = 500
ssize = 200

t0 = time.time()
result1 = []
np.random.seed(64398721)
for ii in range(niter):
    rvsg = stats.gamma.rvs(2.5,scale=20,size=ssize)
    result1.append(np.hstack(stats.gamma.fit(rvsg)))
                   
t1 = time.time()
result2 = []
np.random.seed(64398721)
for ii in range(niter):
    rvsg = stats.gamma.rvs(2.5,scale=20,size=ssize)
    result2.append(gamma.fit_fr(rvsg, frozen=[np.nan,0.0,np.nan]))
    #result2.append(gamma.fit_fr(rvsg,1)) # this is equivalent to old

t2 = time.time()
print 'time stats.gamma', t1-t0
print 'time   new gamma', t2-t1
resarr1 = np.array(result1)
resarr2 = np.array(result2)
resarr = np.hstack([resarr2, resarr1])
ptrue = np.array([2.5,20.0,2.5,0.0,20.0])  # true params: fixed-loc fit, then full fit

print ' sample size = %d,  number of iterations = %d' % (ssize, niter)
print '         with fixed location        with estimated location'
print '          shape       scale       shape       location    scale'
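# deviations from the true parameter vector; note mse = errvar + bias**2
# exactly, since np.var uses ddof=0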
bias = np.mean((resarr - ptrue), axis=0)
errvar = np.var((resarr - ptrue), axis=0)
maxabs = np.max(np.abs(resarr - ptrue), axis=0)
mad = np.mean(np.abs(resarr - ptrue), axis=0)
mse = np.mean((resarr - ptrue)**2, axis=0)
print 'bias  ', bias
print 'errvar', errvar
print 'mse   ', mse
print 'maxabs', maxabs
print 'mad   ', mad

