[SciPy-user] bug in stats.pdfapprox/stats.pdf_moment and new Gram-Charlier distribution

josef.pktd@gmai... josef.pktd@gmai...
Sun Feb 1 11:02:38 CST 2009


I wanted to create a new distribution by wrapping stats.pdfapprox and
stats.pdf_moment in stats.morestats.py.

However, these two function do not create a normal expansion if the
first four moments are given. The inner loop in stats.pdf_moment is
never entered when four moments are given. As a consequence the pdf
that is returned is the unexpanded normal distribution. (There is also
a small mistake in stats.pdfapprox in how the variance is calculated.)

I didn't find any information which type of expansion is used by
pdf_moment. I assume it is Gram-Charlier, but I didn't find any
formulas to make sense out of the inner loop that calculates the
coefficients (for multiplying with the Hermite polynomials).

If someone could provide a (understandable) reference for these
calculations or figure out what the loop is supposed to do, then we
could correct the expansion for the general case.


Since I couldn't fix `pdf_moment`, I wrote a new function that
calculates the pdf for the Gram-Charlier expansion when the first four
moments (or mean, variance, skew, kurtosis) are given. This uses the
explicit formula for this expansion, and doesn't allow for higher
order expansion.

pdf_mvsk: get pdf of G-Ch normal expansion using mean, variance, skew,
and excess kurtosis

This I wrapped in a subclass of _distributionsrv_continuous: NormExpan_gen
It works in the examples that I tried but is not fully tested or cleaned up yet.

attachment:
* try_pdfapprox.py shows problem with current function
* distr_gch.py  new expansion pdf, and NormExpan distribution

I also wrote a skew normal and skew t distribution (as defined by
Azzalini, A. & Capitanio, A., univariate only), which is not attached.

Josef
-------------- next part --------------
from scipy import stats, special
from scipy.stats import distributions
import numpy as np


def mvsk2cm(args):
    mu,sig,sk,kur = args
    # Get central moments
    cnt = [None]*4
    cnt[0] = mu
    cnt[1] = sig #*sig
    cnt[2] = sk * sig**1.5
    cnt[3] = (kur+3.0) * sig**2.0
    return cnt


rvs = stats.norm.rvs(5,size=(2,100)).max(axis=0)
mvsk = stats.describe(rvs)[2:]
print 'sample: mu,sig,sk,kur'
print mvsk

mc = mvsk2cm(mvsk)

pdffn1 = stats.pdfapprox(rvs)
print '\npdf approximation from sample'
print 'pdf at mean-1, mean+1', mc[0]-1,mc[0]+1
print pdffn1([mc[0]-1,mc[0]+1])

pdffn2 = stats.pdf_moments(mc)
print '\npdf approximation from moments'
print 'pdf at mean-1, mean+1', mc[0]-1,mc[0]+1
print pdffn2([mc[0]-1,mc[0]+1])
-------------- next part --------------
'''Gram-Charlier distribution, four-moment expansion of normal distribution

'''

from scipy import stats, special
from scipy.stats import distributions
import numpy as np


def mvsk2cm(*args):
    mu,sig2,sk,kur = args
    # Get central moments
    cnt = [None]*4
    cnt[0] = mu
    cnt[1] = sig2 #*sig  wrong in stats.pdfapprox
    cnt[2] = sk * sig2**1.5
    cnt[3] = (kur+3.0) * sig2**2.0
    return cnt

def mc2mvsk(args):
    mc, mc2, mc3, mc4 = args
    skew = mc3 / mc2**1.5
    kurt = mc4 / mc2**2.0 - 3.0
    return (mc, mc2, skew, kurt)

def pdf_mvsk(mvsk):
    """Return the Gaussian expanded pdf function given the list of 1st, 2nd
    moment and skew and Fisher (excess) kurtosis.

    

    Parameters
    ----------
    mvsk : list of mu, mc2, skew, kurt
        distribution is matched to these four moments

    Returns
    -------
    pdffunc : function
        function that evaluates the pdf(x), where x is the non-standardized
        random variable.


    Notes
    -----

    Changed so it works only if four arguments are given. Uses explicit
    formula, not loop.

    This implements a Gram-Charlier expansion of the normal distribution
    where the first 2 moments coincide with those of the normal distribution
    but skew and kurtosis can deviate from it.

    In the Gram-Charlier distribution it is possible that the density
    becomes negative. This is the case when the deviation from the
    normal distribution is too large.

    

    References
    ----------
    http://en.wikipedia.org/wiki/Edgeworth_series
    Johnson N.L., S. Kotz, N. Balakrishnan: Continuous Univariate
    Distributions, Volume 1, 2nd ed., p.30    
    """
    N = len(mvsk)
    if N < 4:
        raise ValueError, "Four moments must be given to" + \
              "approximate the pdf."

    mu, mc2, skew, kurt = mvsk
    
    totp = poly1d(1)
    sig = sqrt(mc2)
    if N > 2:
        Dvals = stats.morestats._hermnorm(N+1)
        C3 = skew/6.0
        C4 = kurt/24.0
        # Note: Hermite polynomial for order 3 in _hermnorm is negative
        # instead of positive
        totp = totp -  C3*Dvals[3] +  C4*Dvals[4]

    def pdffunc(x):
        xn = (x-mu)/sig
        return totp(xn)*np.exp(-xn*xn/2.0)/np.sqrt(2*np.pi)/sig
    return pdffunc


class NormExpan_gen(distributions.rv_continuous):
    def __init__(self,args, **kwds):
        distributions.rv_continuous.__init__(self,
            name = 'Normal Expansion distribution', 
            extradoc = '''
        The distribution is defined as the Gram-Charlier expansion of
        the normal distribution using the first four moments. The pdf
        is given by

        pdf(x) = (1+ skew/6.0 * H(xc,3) + kurt/24.0 * H(xc,4))*normpdf(xc)

        where xc = (x-mu)/sig is the standardized value of the random variable
        and H(xc,3) and H(xc,4) are Hermite polynomials.
        
        Note: This distribution has to be parameterized during
        initialization and instantiation, and does not have a shape
        parameter after instantiation (similar to frozen distribution
        except for location and scale.) Location and scale can be used
        as with other distributions, however note, that they are relative
        to the initialized distribution.
        '''  )

        mode = kwds.get('mode', 'sample')

        if mode == 'sample':                            
            mu,sig2,sk,kur = stats.describe(args)[2:]
            self.mvsk = (mu,sig2,sk,kur)
            cnt = mvsk2cm(mu,sig2,sk,kur)
        elif mode == 'mvsk':
            cnt = mvsk2cm(args)
            self.mvsk = args
        elif mode == 'centmom':
            cnt = args
            self.mvsk = mc2mvsk(cnt)
        else:
            raise ValueError, "mode must be 'mvsk' or centmom"
            
        self.cnt = cnt
        self._pdf = pdf_mvsk(self.mvsk)

    def _munp(self,n):
        # use pdf integration with _mom0_sc if only _pdf is defined.
        # default stats calculation uses ppf
        return self._mom0_sc(n)

    def _stats_skip(self):
        # skip for now to force numerical integration of pdf for testing
        return self.mvsk


if __name__ == '__main__':
    
    rvs = skewnorm.rvs(5,size=100)
    normexpan = NormExpan_gen(rvs, mode='sample')

    smvsk = stats.describe(rvs)[2:]
    print 'sample: mu,sig2,sk,kur'
    print smvsk

    dmvsk = normexpan.stats(moments='mvsk')
    print 'normexpan: mu,sig2,sk,kur'
    print dmvsk
    print 'mvsk diff distribution - sample'
    print np.array(dmvsk) - np.array(smvsk)
    print '\nnormexpan attributes mvsk'
    print mc2mvsk(normexpan.cnt)
    print normexpan.mvsk

    print '\nusing methods'
    print normexpan.rvs(size=5)  #slow
    print normexpan.cdf([-1,0,1,2])
    print normexpan.pdf([-1,0,1,2])
    print normexpan.ppf([0,0.1,0.5,0.9,0.95,1])
    print normexpan.sf([0,0.1,0.5,0.9,0.95,1])


More information about the SciPy-user mailing list