[SciPy-dev] An interesting exercise - reproduce R analysis using python.

josef.pktd@gmai... josef.pktd@gmai...
Tue Jan 27 14:14:02 CST 2009


On Tue, Jan 27, 2009 at 5:45 AM, Alex Holcombe <alexh@psych.usyd.edu.au> wrote:
>  <josef.pktd <at> gmail.com> writes:
>
>>
>> Your code is stylistically pretty difficult to read, for someone used
>> to python code. I recommend giving a look at
>> http://www.python.org/dev/peps/pep-0008/. Also I had problems
>> following all the (temporary) variables, so I tried to come up with
>> something that is easier to read.
>>
>
> Wow, your versions are all much nicer than mine, thank you!  If anyone wants to
> use these (as I do), note that in version 1 and 1b you'd need to substitute in
> 'keep' for '[1,4]'.
> I would consider this code to be basic functionality for any data analysis
> package.
>

I looked up the explanation for the pivot table, and I rewrote my
version 1b to optionally produce a table for each statistic instead of
a flat table. I added a flat2nd function, a flat-to-multidimensional
conversion function; see attachment.

I don't know what all the uses for this are, but this could be
extended to include a callback function to calculate arbitrary
statistics on the data for the unique factors.

See if this version works as you would expect a pivot table to work.

Josef
-------------- next part --------------
# only tested for 2D, i.e. 2 explanatory variables
# no pretty print


import numpy as np
from numpy.testing import assert_array_equal, assert_equal




def ptable(data, dv, keep, outformat='flat'):
    '''calculate basic statistics for pivot table

    Mean, standard deviation and count for a dependent variable
    conditional on some explanatory variables while ignoring other
    explanatory variables.

    This works only for discrete values of explanatory variables

    Parameters
    ----------
    data : 2D array
        assumes variables are in columns and observations in rows
    dv : int
        column index of dependent variable
    keep : array_like int
        column indices of explanatory variables
    outformat : (optional)
        * 'flat' (default) :
          return 2D array with unique values of explanatory variables
          in first columns and statistics in later columns
        * 'table' :
          return one multidimensional table per statistic, with one
          axis per explanatory variable (see `flat2nd`)

    Returns
    -------
    statarr : 2D array  if outformat = 'flat'
        one row per observed combination of the explanatory variables,
        sorted by combination; columns are the `len(keep)` explanatory
        values followed by mean, standard deviation and count
    (uns, mmean, mstd, mcount) : tuple  if outformat = 'table'
        `uns` is the list of unique values per explanatory variable as
        returned by `flat2nd`; `mmean`, `mstd` and `mcount` are the
        tables of mean, standard deviation and count. Combinations that
        do not occur in `data` are NaN.

    Raises
    ------
    ValueError
        if `outformat` is neither 'flat' nor 'table'

    Notes
    -----
    The standard deviation is computed with ``ndarray.std()`` and its
    default arguments.

    '''
    if outformat not in ('flat', 'table'):
        raise ValueError("outformat can only be 'flat' or 'table'")

    data = np.asarray(data)
    # a list (not a tuple) so that row[keep] selects several columns
    keep = list(keep)
    K = len(keep)

    # build dictionary with unique combination as keys
    #   and corresponding row indices as values
    catdata = {}
    for index, row in enumerate(data):
        catdata.setdefault(tuple(row[keep]), []).append(index)

    # calculate statistics for each combination (key)
    stat = []
    for k, v in sorted(catdata.items()):
        values = data[v, dv]
        stat.append(list(k) + [values.mean(), values.std(), len(v)])

    # convert result statistic to numpy array; the reshape is a no-op for
    # non-empty results and keeps the array 2D when there are no rows
    statarr = np.array(stat).reshape(-1, K + 3)

    if outformat == 'flat':
        return statarr

    # convert flat table to multidimensional: the explanatory columns
    # plus one statistic column at a time
    factors = list(range(K))
    mmean, uns = flat2nd(statarr[:, factors + [K]])
    mstd, uns = flat2nd(statarr[:, factors + [K + 1]])
    mcount, uns = flat2nd(statarr[:, factors + [K + 2]])
    return uns, mmean, mstd, mcount
        


def flat2nd(x):
    '''convert flat table to multidimensional table

    Assumes rows on first K columns are jointly unique.
    Flat table does not need to have complete, i.e. rectangular, values
    for explanatory variables. Missing elements are filled with NaN.

    Parameters
    ----------
    x : array (N,K+1)
        flat table [x1,x2,y]: K columns of explanatory variables
        followed by one column of values

    Returns
    -------
    res : array
        contains variable of last column in input reformatted to have
        K dimensions, with the position along each axis given by the
        sorted unique values of the corresponding explanatory variable

    uns : list of K 1D arrays
        element i of uns is 1D array of values of the explanatory variable
        for the ith axis of `res`

    Raises
    ------
    ValueError
        if `x` is not 2D or has fewer than two columns

    Example
    -------
    >>> mex = np.array([[ 11.,   1.,   1.],
    ...                 [ 11.,   2.,   2.],
    ...                 [ 12.,   1.,   3.],
    ...                 [ 12.,   2.,   4.]])
    >>> res, uns = flat2nd(mex)
    >>> res
    array([[1., 2.],
           [3., 4.]])
    >>> uns
    [array([11., 12.]), array([1., 2.])]

    example with unequal dimension and not rectangular

    >>> mex = np.array([[ 11.,   1.,   1.],
    ...                 [ 11.,   2.,   2.],
    ...                 [ 12.,   1.,   3.],
    ...                 [ 12.,   2.,   4.],
    ...                 [ 13.,   2.,   5.],])
    >>> res, uns = flat2nd(mex)
    >>> res
    array([[ 1.,  2.],
           [ 3.,  4.],
           [nan,  5.]])
    >>> uns
    [array([11., 12., 13.]), array([1., 2.])]
    '''
    x = np.asarray(x)
    if x.ndim != 2 or x.shape[1] < 2:
        raise ValueError("x must be a 2D array with at least two columns")

    uns = []      # unique values of each explanatory variable
    unirs = []    # for each row, index of its value within the uniques
    dims = []     # number of unique values, i.e. length of each axis
    for ii in range(x.shape[1] - 1):
        un, unir = np.unique(x[:, ii], return_inverse=True)
        uns.append(un)
        unirs.append(unir)
        dims.append(len(un))

    # start from an all-NaN table so that missing combinations stay NaN
    res = np.nan * np.ones(dims)
    # a tuple of K index arrays addresses one cell of `res` per input row
    res[tuple(unirs)] = x[:, -1]
    return res, uns



def test_flat2multi():
    '''check flat2nd on a complete (rectangular) 2x2 flat table'''
    flat = np.array([[ 11.,   1.,   1.],
                     [ 11.,   2.,   2.],
                     [ 12.,   1.,   3.],
                     [ 12.,   2.,   4.]])
    expected_table = np.array([[ 1.,  2.], [ 3.,  4.]])
    expected_levels = [np.array([ 11.,  12.]), np.array([ 1.,  2.])]

    table, levels = flat2nd(flat)

    assert_array_equal(table, expected_table)
    assert_equal(levels, expected_levels)


if __name__ == '__main__':
    # run the self-test first, then print two small demonstrations
    test_flat2multi()

    # random demo data: 10 observations of 5 integer-valued variables
    data = np.random.randint(1,3, size=(10,5))
    data[:,1] += 10   # shift column 1 so its values differ from column 4
    keep = [1, 4]     # index in data of explanatory variable under consideration
    dv = 0            # index in data of dependent variable
    statn = ptable(data, dv, keep, outformat='flat')
    print(statn)
    uns, mmean, mstd, mcount = ptable(data, dv, keep, outformat='table')
    print(uns)
    print(mmean)
    print(mstd)
    print(mcount)

    # non-rectangular example: the combination (13., 1.) is missing
    mex = np.array([[ 11.,   1.,   1.],
                    [ 11.,   2.,   2.],
                    [ 12.,   1.,   3.],
                    [ 12.,   2.,   4.],
                    [ 13.,   2.,   5.],])
    res, uns = flat2nd(mex)
    print(uns)
    print(res)


More information about the Scipy-dev mailing list