[SciPy-user] Ols for np.arrays and masked arrays

josef.pktd@gmai... josef.pktd@gmai...
Fri Jan 16 10:52:58 CST 2009


I tried to see how masked arrays and nans, and a sequence of input
arguments, can be handled in a regression (linear model)

The attached file works, but array dimension handling and
concatenation look pretty messy if we want column orientation
instead of rows. I left some of the unsuccessful attempts as comments,
if someone can propose a better way. Also this is the first time, that
I use masked arrays, and I'm not sure I found the best way. At the end
of the file are my test cases.

I treat masked arrays or nans by removing all observations with masked
or nan values for the internal calculation, but keep the mask around,
for the case when it is needed for some output. I looked at
np.ma.polyfit, which uses a dummy fill value (0) before calling least
squares, but in general it will be difficult to find "neutral" fill
values.

Does this look like a reasonable way to write functions or classes
that can handle plain np.arrays and masked arrays at the same time
with minimal overhead? I haven't looked at extending it to record,
structured arrays.

Josef

class Regression(object):
    def __init__(self,y,*args,**kwds):
        '''
        Parameters
        ----------

        y: array, 1D or 2D
            variable of regression. if 2D,  it needs to be one column
        args: sequence of 1D or 2D arrays
            one or several arrays; if 2D, then the interpretation is that each
            column represents one variable and rows are observations
        kwds: addconst (default: True)
            if True then a constant is added to the regressor

        return
        ------
        class instance with regression results

        Notes
        -----

        Observation (rows) that contain masked values in a masked array or nans
        in any of the regressors or in y will be dropped for the calculation.
        Arrays that correspond to observation, e.g. estimated y (yhat)
        or residuals, are returned as masked arrays if masked values or nans are
        in the input data.

        Usage
        -----

        estm = Ols(y, x1, x2)
        estm.b  # estimated parameter vector

        example polynomial
        estv = Ols(y, np.vander(x[:,0],3), x[:,1], addconst=False)
        estv.b  # estimated parameter vector
        estv.yhat # fitted values
-------------- next part --------------
import numpy as np
import numpy.ma as ma
from scipy import linalg
#from numpy.testing import assert_almost_equal
from numpy.ma.testutils import assert_almost_equal


def compressmiss(y, design, axis=0):
    '''Drop observations (rows) that are masked in y or in any design column.

    Parameters
    ----------
    y : 2D array or MaskedArray, shape (nobs, 1)
        dependent variable, single column
    design : 2D array or MaskedArray, shape (nobs, nvars)
        design matrix; a masked entry anywhere in a row drops that row
    axis : int
        unused, kept for interface compatibility

    Returns
    -------
    y_ok : 2D ndarray, shape (nobs_ok, 1)
        y with masked rows removed, column orientation preserved
    design_ok : 2D ndarray, shape (nobs_ok, nvars)
        design with the same rows removed
    mask : 1D bool array, shape (nobs,)
        True for each dropped observation
    '''
    # getmaskarray always returns a full boolean array (never the nomask
    # scalar), so the column slice and mask_or are safe even when one of
    # the inputs carries no masked values.
    rowmask = ma.getmaskarray(ma.mask_rows(design))[:, :1]
    mask = ma.mask_or(ma.getmaskarray(y), rowmask).ravel()
    ok = ~mask
    # y[ok, 0] rather than y[ok, :] so ma.compressed sees a 1D array
    y, design = y[ok, 0], design[ok, :]
    # ma.compressed flattens, so restore the (nobs_ok, 1) column shape
    return ma.compressed(y)[:, np.newaxis], ma.compress_rows(design), mask

    

class Regression(object):
    '''Linear model base class that accepts plain, masked, or nan input.

    Parameters
    ----------
    y : array, 1D or 2D
        dependent variable; if 2D it must be a single column
    args : sequence of 1D or 2D arrays
        one or several arrays of regressors; in a 2D array each column
        represents one variable and rows are observations
    kwds : addconst (default: True)
        if True, a constant column is prepended to the design matrix

    Returns
    -------
    class instance with regression results

    Notes
    -----
    Observations (rows) that contain masked values or nans in y or in
    any regressor are dropped for the calculation.  Arrays that
    correspond to observations, e.g. fitted values (yhat), are returned
    as masked arrays when the input contains masked values or nans.

    Subclasses must implement ``estimate``, which is expected to set the
    parameter vector ``self.b``.

    Usage
    -----
    estm = Ols(y, x1, x2)
    estm.b     # estimated parameter vector

    example polynomial
    estv = Ols(y, np.vander(x[:, 0], 3), x[:, 1], addconst=False)
    estv.b     # estimated parameter vector
    estv.yhat  # fitted values
    '''
    def __init__(self, y, *args, **kwds):
        self.addconst = kwds.pop('addconst', True)
        # np.c_ converts masked arrays to plain arrays, encoding masked
        # entries as nan (for float data); simple to read, but builds
        # intermediate matrices
        design = np.c_[args]
        if self.addconst:
            design = np.c_[np.ones(y.shape), design]
        if isinstance(y, ma.MaskedArray) or isinstance(design, ma.MaskedArray) \
                or np.any(np.isnan(y)) or np.any(np.isnan(design)):
            design = ma.masked_array(design, np.isnan(design))
            self.y, self.design, self.mask = compressmiss(y, design)
            self.masked = True
            # keep a masked copy of y with the combined mask around for
            # output that is aligned with the original observations
            self.ymasked = ma.masked_array(y, self.mask)
        else:
            self.y = y
            self.design = design
            self.mask = None

        self.estimate()
        yhat = np.dot(self.design, self.b)
        # 'is not None': comparing an ndarray mask with '== None' is
        # elementwise and its truth value is ambiguous
        if self.mask is not None:
            # re-expand fitted values to the original length, leaving
            # the dropped observations masked
            self.yhat = self.ymasked.copy()
            self.yhat[~self.mask] = yhat
        else:
            self.yhat = yhat

    def estimate(self):
        # to be overridden by subclasses; must set self.b
        pass

    def get_yhat(self):
        # placeholder, not yet implemented
        pass

    def summary(self):
        # placeholder, not yet implemented
        pass

    def predict(self, x):
        # placeholder, not yet implemented
        pass


class Ols(Regression):
    '''Ordinary least squares estimation; see Regression for the
    constructor signature and input handling.'''

    def __init__(self, y, *args, **kwds):
        # name the class explicitly: super(self.__class__, self) would
        # recurse infinitely if Ols were ever subclassed
        super(Ols, self).__init__(y, *args, **kwds)

    def estimate(self):
        '''Solve the least squares problem for the stored y and design.

        Sets self.b (parameter vector), self.resid (residual sum of
        squares from lstsq) and self.sigma (singular values of the
        design matrix).
        '''
        y, x = self.y, self.design
        self.b, self.resid, rank, self.sigma = linalg.lstsq(x, y)

    def predict(self, x):
        '''Return prediction for new regressors x.

        todo: add prediction error, confidence interval
        '''
        if self.addconst:
            # prepend the constant column, mirroring the fitted design
            x = np.c_[np.ones(x.shape[0]), x]
        return np.dot(x, self.b)



if __name__ == '__main__':

    import numpy as np
    #from olsexample import ols

    def generate_data(nobs):
       x = np.random.randn(nobs,2)
       btrue = np.array([[5,1,2]]).T
       y = np.dot(x, btrue[1:,:]) + btrue[0,:] + 0.5 * np.random.randn(nobs,1)
       return y,x

    y,x = generate_data(15)

    #benchmark no masked arrays, and one 2D array for x
    est = Ols(y[1:,:],x[1:,:])  # initialize and estimate with ols, constant added by default
    print 'ols estimate'
    est.estimate()
    print est.b.T
    #print np.array([[5,1,2]])  # true coefficients

    ynew,xnew = generate_data(3)
    ypred = est.predict(xnew)

    print '    ytrue        ypred        error'
    print np.c_[ynew, ypred, ynew - ypred]


    #case masked array y
    ym = y.copy()
    ym[0,:] = np.nan
    ym = ma.masked_array(ym, np.isnan(ym))
    estm1 = Ols(ym,x)
    print estm1.b.T
    print estm1.yhat.shape
    print 'yhat'
    print estm1.yhat[:10,:]
    assert_almost_equal(estm1.yhat[1:,:], est.yhat)

    #masked y and several x args, addconst=False
    estm2 = Ols(ym,np.ones(ym.shape),x[:,0],x[:,1],addconst=False)
    print estm2.b.T
    assert_almost_equal(estm2.b, estm1.b)
    assert_almost_equal(estm2.yhat, estm1.yhat)

    #masked y and several x args, 
    estm3 = Ols(ym,x[:,0],x[:,1])
    print estm2.b.T
    assert_almost_equal(estm3.b, estm1.b)
    assert_almost_equal(estm3.yhat, estm1.yhat)

    #masked array in y and one x variable
    x_0 = x[:,0].copy()  # is copy necessary?
    x_0[0] = np.nan
    x_0 = ma.masked_array(x_0, np.isnan(x_0))
    estm4 = Ols(ym,x_0,x[:,1])
    print estm4.b.T
    assert_almost_equal(estm4.b, estm1.b)
    assert_almost_equal(estm4.yhat, estm1.yhat)

    #masked array in one x variable, but not in y
    x_0 = x[:,0].copy()  # is copy necessary?
    x_0[0] = np.nan
    x_0 = ma.masked_array(x_0, np.isnan(x_0))
    estm5 = Ols(y,x_0,x[:,1])   #, addconst=False)
    print estm5.b.T
    assert_almost_equal(estm5.b, estm1.b)
    assert_almost_equal(estm5.yhat, estm1.yhat) 
    #assert np.all(estm5.yhat == estm1.yhat)

    #example polynomial
    print 'example with one polynomial x added'
    estv = Ols(y,np.vander(x[:,0],3), x[:,1], addconst=False)
    print estv.b.T
    print estv.yhat


More information about the SciPy-user mailing list