[SciPy-user] goodness-of-fit test - power discrepancy

josef.pktd@gmai... josef.pktd@gmai...
Mon Apr 27 10:48:09 CDT 2009


A while ago, I wrote a function for power discrepancy which is a
generalization of the chi-square test for goodness-of-fit for discrete
samples or probabilities.

If someone wants to try it, I would welcome comments, before adding it
to scipy.stats.

Josef
-------------- next part --------------
'''

Author: Josef Perktold and pymc authors

'''

import numpy as np
from numpy.linalg.linalg import LinAlgError
import numpy.linalg as nplinalg
from numpy.testing import assert_almost_equal, assert_equal, assert_array_almost_equal

from scipy import linalg, stats

#JP: note: Freeman-Tukey statistics is a goodness-of-fit test for
#    discrete data,
##    other gof tests
##    The test statistics:
##    Pearson?s Chi-Square,
##    the Kolmogorov-Smirnov test statistic for discrete data,
##    the Log-Likelihood Ratio,
##    the Freeman-Tukey and
##    the Power Divergence statistic with ?=?.


def discrepancy(observed, simulated, expected):
    """Calculates Freeman-Tukey statistics (Freeman and Tukey 1950) as
    a measure of discrepancy between observed and simulated data. This
    is a convenient method for assessing goodness-of-fit (see Brooks et al. 2000).

    D(x|\theta) = \sum_j (\sqrt{x_j} - \sqrt{e_j})^2

    :Parameters:
      observed : Iterable of observed values
      simulated : Iterable of simulated values
      expected : Iterable of expeted values

    :Returns:
      D_obs : Discrepancy of observed values
      D_sim : Discrepancy of simulated values
    
    Notes:
    ------
    this is the pymc version

    """

    D_obs = np.sum([(np.sqrt(x)-np.sqrt(e))**2 for x,e in zip(observed, expected)])
    D_sim = np.sum([(np.sqrt(s)-np.sqrt(e))**2 for s,e in zip(simulated, expected)])

    return D_obs, D_sim


def powerdiscrepancy(o, e, lambd=0.0, axis=0):
    """Calculates power discrepancy, a class of goodness-of-fit tests
    as a measure of discrepancy between observed and expected data.

    This contains several goodness-of-fit tests as special cases, see the
    describtion of lambd, the exponent of the power discrepancy. The pvalue
    is based on the asymptotic chi-square distribution of the test statistic.

    freeman_tukey:
    D(x|\theta) = \sum_j (\sqrt{x_j} - \sqrt{e_j})^2

    Parameters
    ----------
      o : Iterable of observed values
      e : Iterable of expeted values
      lambd : float or string
         * float : exponent `a` for power discrepancy
         * 'loglikeratio': a = 0
         * 'freeman_tukey': a = -0.5
         * 'pearson': a = 1
         * 'modified_loglikeratio': a = -1
         * 'cressie_read': a = 2/3

    Returns
    -------
      D_obs : Discrepancy of observed values
      pvalue : pvalue


    References
    ----------
    Cressie, Noel  and Timothy R. C. Read, Multinomial Goodness-of-Fit Tests,
        Journal of the Royal Statistical Society. Series B (Methodological),
        Vol. 46, No. 3 (1984), pp. 440-464

    Campbell B. Read: Freeman-Tukey chi-squared goodness-of-fit statistics,
        Statistics & Probability Letters 18 (1993) 271-278

    Nobuhiro Taneichi, Yuri Sekiya, Akio Suzukawa, Asymptotic Approximations
        for the Distributions of the Multinomial Goodness-of-Fit Statistics
        under Local Alternatives, Journal of Multivariate Analysis 81, 335?359 (2002)
    Steele, M. 1,2, C. Hurst 3 and J. Chaseling, Simulated Power of Discrete
        Goodness-of-Fit Tests for Likert Type Data

    Examples
    --------

    >>> observed = np.array([ 2.,  4.,  2.,  1.,  1.])
    >>> expected = np.array([ 0.2,  0.2,  0.2,  0.2,  0.2])

    for checking correct dimension with multiple series

    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd='freeman_tukey',axis=1)
    (array([[ 2.745166,  2.745166]]), array([[ 0.6013346,  0.6013346]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected,axis=1)
    (array([[ 2.77258872,  2.77258872]]), array([[ 0.59657359,  0.59657359]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd=0,axis=1)
    (array([[ 2.77258872,  2.77258872]]), array([[ 0.59657359,  0.59657359]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd=1,axis=1)
    (array([[ 3.,  3.]]), array([[ 0.5578254,  0.5578254]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd=2/3.0,axis=1)
    (array([[ 2.89714546,  2.89714546]]), array([[ 0.57518277,  0.57518277]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, expected, lambd=2/3.0,axis=1)
    (array([[ 2.89714546,  2.89714546]]), array([[ 0.57518277,  0.57518277]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 2.89714546,  2.89714546]]), array([[ 0.57518277,  0.57518277]]))

    each random variable can have different total count/sum

    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 2.89714546,  5.79429093]]), array([[ 0.57518277,  0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), 10*expected, lambd=2/3.0, axis=0)
    Traceback (most recent call last):
      ...
    ValueError: observed and expected need to have the samenumber of observations, or e needs to add to 1
    >>> powerdiscrepancy(np.column_stack((2*observed,2*observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 5.79429093,  5.79429093]]), array([[ 0.21504648,  0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((2*observed,2*observed)), 20*expected, lambd=2/3.0, axis=0)
    (array([[ 5.79429093,  5.79429093]]), array([[ 0.21504648,  0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), np.column_stack((10*expected,20*expected)), lambd=2/3.0, axis=0)
    (array([[ 2.89714546,  5.79429093]]), array([[ 0.57518277,  0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), np.column_stack((10*expected,20*expected)), lambd=-1, axis=0)
    (array([[ 2.77258872,  5.54517744]]), array([[ 0.59657359,  0.2357868 ]]))


    """
    o = np.array(o)
    e = np.array(e)

    if np.isfinite(lambd) == True:  # check whether lambd is a number
        a = lambd
    else:
        if   lambd == 'loglikeratio': a = 0
        elif lambd == 'freeman_tukey': a = -0.5
        elif lambd == 'pearson': a = 1
        elif lambd == 'modified_loglikeratio': a = -1
        elif lambd == 'cressie_read': a = 2/3.0
        else:
            raise ValueError, 'lambd has to be a number or one of ' + \
                    'loglikeratio, freeman_tukey, pearson, ' +\
                    'modified_loglikeratio or cressie_read'

    n = np.sum(o, axis=axis)
    nt = n
    if n.size>1:
        n = np.atleast_2d(n)
        if axis == 1:
            nt = n.T     # need both for 2d, n and nt for broadcasting
        if e.ndim == 1:
            e = np.atleast_2d(e)
            if axis == 0:
                e = e.T

    if np.all(np.sum(e, axis=axis) == n):
        p = e/(1.0*nt)
    elif np.all(np.sum(e, axis=axis) == 1):
        p = e
        e = nt * e
    else:
        raise ValueError, 'observed and expected need to have the same' \
                          'number of observations, or e needs to add to 1'
    k = o.shape[axis]
    if e.shape[axis] != k:
        raise ValueError, 'observed and expected need to have the same' \
                          'number of bins'

    # Note: taken from formulas, to simplify cancel n
    if a == 0:   # log likelihood ratio
        D_obs = 2*n * np.sum(o/(1.0*nt) * np.log(o/e), axis=axis)
    elif a == -1:  # modified log likelihood ratio
        D_obs = 2*n * np.sum(e/(1.0*nt) * np.log(e/o), axis=axis)
    else:
        D_obs = 2*n/a/(a+1) * np.sum(o/(1.0*nt) * ((o/e)**a - 1), axis=axis)

    return D_obs, stats.chi2.sf(D_obs,k-1)


def itemfreq2d(a, scores=None, axis=0):
    # JP: from scipy.stats.stats.py with changes
    """Returns a 2D array of item frequencies.

    Column 1 contains item values, column 2 to n contain their respective counts.

    Parameters
    ----------
    a : array 1D or 2D
        data, either columns or rows represent variables, see describtion
        for axis
    scores : array_like (optional)
        Contains list or array of items for which frequencies are found. If
        scores is None, then the frequency is found for those items which occur
        at least once. If scores are given, then also items that do not occur
        in the data are listed in the frequency table
    axis : 0, 1 or None
        If axis = 0, then the frequency count is calculated for each column.
        If axis = 1, then the frequency count is calculated for each row.
        If axis = None, then the frequency count is calculated for the entire
            data, (array is raveled first).

    Returns
    -------
    A 2D frequency table (col [0]=scores, col [1:n]=frequencies)

    >>> rvs = np.array([[4, 6, 3], [3, 6, 4], [4, 7, 6], [6, 1, 6]])
    >>> itemfreq2d(rvs,range(1,8),axis=1)
    array([[ 1.,  0.,  0.,  0.,  1.],
           [ 2.,  0.,  0.,  0.,  0.],
           [ 3.,  1.,  1.,  0.,  0.],
           [ 4.,  1.,  1.,  1.,  0.],
           [ 5.,  0.,  0.,  0.,  0.],
           [ 6.,  1.,  1.,  1.,  2.],
           [ 7.,  0.,  0.,  1.,  0.]])
    >>> itemfreq2d(rvs,range(1,8),axis=0)
    array([[ 1.,  0.,  1.,  0.],
           [ 2.,  0.,  0.,  0.],
           [ 3.,  1.,  0.,  1.],
           [ 4.,  2.,  0.,  1.],
           [ 5.,  0.,  0.,  0.],
           [ 6.,  1.,  2.,  2.],
           [ 7.,  0.,  1.,  0.]])
    >>> itemfreq2d(rvs,axis=1)
    array([[ 1.,  0.,  0.,  0.,  1.],
           [ 3.,  1.,  1.,  0.,  0.],
           [ 4.,  1.,  1.,  1.,  0.],
           [ 6.,  1.,  1.,  1.,  2.],
           [ 7.,  0.,  0.,  1.,  0.]])
    >>> itemfreq2d(rvs,axis=0)
    array([[ 1.,  0.,  1.,  0.],
           [ 3.,  1.,  0.,  1.],
           [ 4.,  2.,  0.,  1.],
           [ 6.,  1.,  2.,  2.],
           [ 7.,  0.,  1.,  0.]])
    >>> itemfreq2d(rvs[:,0],range(1,8))
    array([[ 1.,  0.],
           [ 2.,  0.],
           [ 3.,  1.],
           [ 4.,  2.],
           [ 5.,  0.],
           [ 6.,  1.],
           [ 7.,  0.]])
    >>> itemfreq2d(rvs[:,0])
    array([[ 3.,  1.],
           [ 4.,  2.],
           [ 6.,  1.]])
    >>> itemfreq2d(rvs,axis=None)
    array([[ 1.,  1.],
           [ 3.,  2.],
           [ 4.,  3.],
           [ 6.,  5.],
           [ 7.,  1.]])
    >>> itemfreq2d(rvs,range(1,8),axis=None)
    array([[ 1.,  1.],
           [ 2.,  0.],
           [ 3.,  2.],
           [ 4.,  3.],
           [ 5.,  0.],
           [ 6.,  5.],
           [ 7.,  1.]])
    """
    if axis is None:
        a = a.ravel()
        axis = 0
    if a.ndim == 1:
        k = 1
    elif a.ndim == 2:
        k = a.shape[1-axis]
    if scores is None:
        #scores = stats._support.unique(a.ravel())
        scores = np.unique(a.ravel())
        scores = np.sort(scores)
    freq = np.zeros((len(scores),k))
    for i in range(len(scores)):
        freq[i] = np.add.reduce(np.equal(a,scores[i]),axis=axis)
    #return np.array(stats._support.abut(scores, freq))
    return np.array(np.column_stack((scores, freq)))


if __name__ == '__main__':
    import doctest
    doctest.testmod(verbose=True)

    #rvsunif = stats.randint.rvs(1,5,size=10)
    rvs0 = np.array([3, 2, 5, 2, 1, 2, 4, 3, 2, 1])
    # get requencies with old version
    freq = {};
    for s in range(1,6): freq[s]=0
    freq.update(stats.itemfreq(rvs0))
    # get requencies with new version
    observed = itemfreq2d(rvs0,range(1,6),axis=0)[:,1]
    expected = np.ones(5)/5.0

    print powerdiscrepancy(observed, expected, lambd=0)
    print 'the next two are identical Pearson chisquare'
    print powerdiscrepancy(observed, expected, lambd=1)
    print stats.chisquare(observed, 10*expected)
    assert_array_almost_equal(powerdiscrepancy(observed, expected, lambd=1),
                        stats.chisquare(observed, 10*expected))
    print 'the next two are identical freeman_tukey'
    print powerdiscrepancy(observed, expected, lambd='freeman_tukey')
    print discrepancy(observed, observed, 10*expected)[0]*4
    assert_array_almost_equal(powerdiscrepancy(observed, expected, lambd='freeman_tukey')[0],
                        discrepancy(observed, observed, 10*expected)[0]*4)
    powerdiscrepancy(np.column_stack((observed,observed)), 10*expected[:,np.newaxis], lambd='freeman_tukey')
    powerdiscrepancy(np.column_stack((observed,2*observed)), np.column_stack((10*expected,20*expected)), lambd=-1, axis=0)
    #print powerdiscrepancy(np.array(sorted(freq.items())), [1/4.0]*4, lambd=0)

##    print powerdiscrepancy(np.array(sorted(freq.items()))[:,1], np.asarray([1/4.0]*4), lambd=0)
##    print 'the next two are identical Pearson chisquare'
##    print powerdiscrepancy(np.array(sorted(freq.items()))[:,1], np.asarray([1/4.0]*4), lambd=1)
##    print stats.chisquare(np.array(sorted(freq.items()))[:,1], 10*np.asarray([1/4.0]*4))
##    print 'the next two are identical freeman_tukey'
##    print powerdiscrepancy(np.array(sorted(freq.items()))[:,1], np.asarray([1/4.0]*4), lambd='freeman_tukey')
##    print discrepancy(np.array(sorted(freq.items()))[:,1],np.array(sorted(freq.items()))[:,1], 10*np.asarray([1/4.0]*4) )[0]*4



More information about the SciPy-user mailing list