# [SciPy-dev] An interesting exercise - reproduce R analysis using python.

josef.pktd@gmai... josef.pktd@gmai...
Tue Jan 27 14:14:02 CST 2009

```On Tue, Jan 27, 2009 at 5:45 AM, Alex Holcombe <alexh@psych.usyd.edu.au> wrote:
>  <josef.pktd <at> gmail.com> writes:
>
>>
>> Your code is stylistically pretty difficult to read, for someone used
>> to python code. I recommend taking a look at
>> http://www.python.org/dev/peps/pep-0008/. Also I had problems
>> following all the (temporary) variables, so I tried to come up with
>> something that is easier to read.
>>
>
> Wow, your versions are all much nicer than mine, thank you!  If anyone wants to
> use these (as I do), note that in version 1 and 1b you'd need to substitute in
> 'keep' for '[1,4]'.
> I would consider this code to be basic functionality for any data analysis
> package.
>

I looked up the explanation for the pivot table, and I rewrote my
version 1b to optionally produce a table for each statistic instead of
a flat table. I added a flat2nd function, a flat-to-multidimensional
conversion function. See attachment.

I don't know what all the uses for this are, but this could be
extended to include a callback function to calculate arbitrary
statistics on the data for the unique factors.

See if this version works as you would expect a pivot table to work.

Josef
-------------- next part --------------
# only tested for 2D, i.e. 2 explanatory variables
# no pretty print

import numpy as np
from numpy.testing import assert_array_equal, assert_equal

def ptable(data, dv, keep, outformat='flat'):
    '''Calculate basic statistics for a pivot table.

    Mean, standard deviation and count for a dependent variable
    conditional on some explanatory variables while ignoring other
    explanatory variables.

    This works only for discrete values of explanatory variables.

    Parameters
    ----------
    data : 2D array
        assumes variables are in columns and observations in rows
    dv : int
        column index of dependent variable
    keep : array_like int
        column indices of explanatory variables
    outformat : (optional)
        * 'flat' (default) :
          return 2D array with unique values of explanatory variables
          in first columns and statistics (mean, std, count) in the
          last three columns
        * 'table' :
          return one multidimensional table per statistic, built with
          `flat2nd`

    Returns
    -------
    statarr : 2D array, if outformat = 'flat'
        one row per unique combination of the `keep` columns, sorted by
        that combination; shape (0, len(keep) + 3) for empty `data`
    (uns, mmean, mstd, mcount) : if outformat = 'table'
        `uns` is the list of unique values per explanatory variable,
        the others are the tables of mean, std and count

    Raises
    ------
    ValueError
        if `outformat` is neither 'flat' nor 'table'
    '''
    # fail fast on a bad option instead of after all the work is done
    if outformat not in ('flat', 'table'):
        raise ValueError("outformat can only be 'flat' or 'table'")

    data = np.asarray(data)
    # an index array makes row[keep] a column selection for any
    # array_like `keep` (a tuple would otherwise be read as a
    # multi-dimensional index)
    keep = np.atleast_1d(keep).astype(int)
    nkeep = len(keep)

    # build dictionary with unique combination as keys
    #   and corresponding row indices as values
    catdata = {}
    for index, row in enumerate(data):
        catdata.setdefault(tuple(row[keep]), []).append(index)

    # calculate statistics for each combination (key)
    stat = []
    for key, rows in sorted(catdata.items()):
        values = data[rows, dv]
        stat.append(list(key) + [values.mean(), values.std(), len(rows)])

    # convert result statistic to numpy array; the reshape only matters
    # for empty input, where it keeps the result two-dimensional
    statarr = np.array(stat).reshape(-1, nkeep + 3)

    if outformat == 'flat':
        return statarr

    # outformat == 'table': convert flat table to multidimensional,
    # one table per statistic column
    key_cols = list(range(nkeep))
    mmean, uns = flat2nd(statarr[:, key_cols + [nkeep]])
    mstd, _ = flat2nd(statarr[:, key_cols + [nkeep + 1]])
    mcount, _ = flat2nd(statarr[:, key_cols + [nkeep + 2]])
    return uns, mmean, mstd, mcount

def flat2nd(x):
    '''Convert flat table to multidimensional table.

    Assumes rows on first K columns are jointly unique.
    Flat table does not need to have complete, i.e. rectangular, values
    for explanatory variables. Missing elements are filled with NaN.

    Parameters
    ----------
    x : array (N, K+1)
        flat table [x1, x2, ..., y]; the first K columns hold the values
        of the explanatory variables, the last column the variable to
        be tabulated

    Returns
    -------
    res : array
        contains variable of last column in input reformatted to have
        K dimensions, with axis i indexed by the sorted unique values
        of column i
    uns : list of K 1D arrays
        element i of uns is 1D array of values of the explanatory
        variable for the ith axis of `res`

    Example
    -------
    >>> mex = np.array([[ 11.,   1.,   1.],
    ...                 [ 11.,   2.,   2.],
    ...                 [ 12.,   1.,   3.],
    ...                 [ 12.,   2.,   4.]])
    >>> res, uns = flat2nd(mex)
    >>> res
    array([[1., 2.],
           [3., 4.]])
    >>> uns
    [array([11., 12.]), array([1., 2.])]

    example with unequal dimension and not rectangular

    >>> mex = np.array([[ 11.,   1.,   1.],
    ...                 [ 11.,   2.,   2.],
    ...                 [ 12.,   1.,   3.],
    ...                 [ 12.,   2.,   4.],
    ...                 [ 13.,   2.,   5.]])
    >>> res, uns = flat2nd(mex)
    >>> res
    array([[ 1.,  2.],
           [ 3.,  4.],
           [nan,  5.]])
    >>> uns
    [array([11., 12., 13.]), array([1., 2.])]
    '''
    x = np.asarray(x)

    uns = []      # unique values per explanatory column
    unirs = []    # per column: position of each row's value within uns[i]
    dims = []     # number of unique values per column = shape of result
    for ii in range(x.shape[1] - 1):
        # np.unique replaces the removed np.unique1d
        un, unir = np.unique(x[:, ii], return_inverse=True)
        uns.append(un)
        unirs.append(unir)
        dims.append(len(un))

    # start from all-NaN so combinations absent from x stay NaN
    res = np.full(dims, np.nan)
    # a tuple of K index arrays addresses one cell of res per input row
    res[tuple(unirs)] = x[:, -1]
    return res, uns

def test_flat2multi():
    '''Check flat2nd on a complete (rectangular) 2 x 2 flat table.'''
    flat_rows = [[ 11.,   1.,   1.],
                 [ 11.,   2.,   2.],
                 [ 12.,   1.,   3.],
                 [ 12.,   2.,   4.]]
    table, levels = flat2nd(np.array(flat_rows))

    # last column laid out with first column along rows, second along columns
    expected_table = np.array([[ 1.,  2.], [ 3.,  4.]])
    expected_levels = [np.array([ 11.,  12.]), np.array([ 1.,  2.])]
    assert_array_equal(table, expected_table)
    assert_equal(levels, expected_levels)

if __name__ == '__main__':
    # run the self-test first, then demo both output formats
    test_flat2multi()

    # random integer demo data: 10 observations, 5 variables with values 1 or 2
    data = np.random.randint(1, 3, size=(10, 5))
    data[:, 1] += 10  # shift column 1 so its values differ from the others
    keep = [1, 4]     # index in data of explanatory variable under consideration
    dv = 0            # index in data of dependent variable

    # single-argument print(...) behaves the same on Python 2 and Python 3
    statn = ptable(data, dv, keep, outformat='flat')
    print(statn)
    uns, mmean, mstd, mcount = ptable(data, dv, keep, outformat='table')
    print(uns)
    print(mmean)
    print(mstd)
    print(mcount)

    # non-rectangular example: the combination (13, 1) is missing
    mex = np.array([[ 11.,   1.,   1.],
                    [ 11.,   2.,   2.],
                    [ 12.,   1.,   3.],
                    [ 12.,   2.,   4.],
                    [ 13.,   2.,   5.]])
    res, uns = flat2nd(mex)
    print(uns)
    print(res)
```