[SciPy-user] Multiple regression

John Hunter jdhunter at ace.bsd.uchicago.edu
Thu Apr 22 12:01:26 CDT 2004


>>>>> "Jean-Baptiste" == Jean-Baptiste Cazier <jcazier at decode.is> writes:

    Jean-Baptiste> Sæl!  I saw in a post on comp.lang.python that
    Jean-Baptiste> SciPy is supposed to include multiple regression
    Jean-Baptiste> tools.  However, when I look at stat_test I see
    Jean-Baptiste> that this is not the case.

    Jean-Baptiste> Am I looking in the wrong place?

    Jean-Baptiste> Could anybody please point me to a multiple
    Jean-Baptiste> regression module in Python?

Here is the algorithm I use, which I implemented from the discussion
at MathWorld.  I validated it against MATLAB's regress function.
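
For reference, these are the quantities the regress function below
computes (my summary of the code, in standard textbook notation):

    B            = (X^T X)^{-1} X^T y
    SS_{total}   = y^T y - N \bar{y}^2
    SS_{regress} = B^T X^T y - N \bar{y}^2
    SS_{error}   = SS_{total} - SS_{regress}
    R^2          = SS_{regress} / SS_{total}
    F            = (SS_{regress}/df_{regress}) / (SS_{error}/df_{error})
    p            = 1 - F_{cdf}(F; df_{regress}, df_{error})

with df_{regress} = number of predictors (excluding the constant column)
and df_{error} = N - 1 - df_{regress}.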

# fcdf is in different places in different scipy versions
try: from scipy.stats import fcdf
except ImportError:
    from scipy.stats import f
    fcdf = f.cdf

from Numeric import matrixmultiply, ones, Float, divide, transpose, reshape,\
     array, sqrt
from LinearAlgebra import linear_least_squares, inverse
from MLab import randn, mean, sum, cov
from Matrix import Matrix

def regress(X,y):
    """
    Perform a multiple linear regression of y onto X.  X is a num
    observations by (num variables +1) array.  X should contain a
    column of ones to generate the intercept.  y is a num observations
    array of independent variables

    return value B, residuals, stats

    B: the regression coeffients;  Ypred = B.T * X.T
    residuals: array( y - Ypred.T)
    stats = Rsquared, F, p
    
    """

    # regression coeffs are given by (Xt*X)^-1 * Xt * y
    N = X.shape[0]
    y.shape = N, 1
    X = Matrix(X)
    Y = Matrix(y)
    Xt = X.T
    Xt_X_i = Matrix(inverse(Xt*X) )
    B = Xt_X_i*Xt*Y

    Ypred = B.T * Xt
    residuals = array(Y-Ypred.T)
    CF = N*mean(y)**2     # correction factor

    SStotal = float(Y.T*Y-CF)
    SSregress =  float(B.T * Xt * Y - CF)
    SSerror =  SStotal - SSregress

    Rsquared = SSregress/SStotal

    dfTotal = N-1
    dfRegress = len(B)-1
    dfError = dfTotal - dfRegress

    F = SSregress/dfRegress / (SSerror/dfError)
    prob = 1-fcdf(F, dfRegress, dfError)

    stats = Rsquared, F, prob
    return B, residuals, stats
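
For anyone on a newer SciPy/NumPy stack, here is a rough sketch of the
same computation written against numpy and scipy.stats instead of
Numeric/LinearAlgebra/MLab.  The name regress_np and the use of lstsq
are my choices, not part of the function above; treat it as a port
under those assumptions rather than a drop-in replacement.

import numpy as np
from scipy.stats import f as f_dist

def regress_np(X, y):
    # Same inputs/outputs as regress() above: X is (N, nvars+1) with a
    # column of ones, y is length N.  Returns B, residuals, (R^2, F, p).
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    N = X.shape[0]

    # coefficients via least squares rather than forming (X'X)^-1 explicitly
    B, _, _, _ = np.linalg.lstsq(X, y, rcond=None)

    ypred = X @ B
    residuals = y - ypred

    CF = N * y.mean()**2                        # correction factor
    SStotal = (y.T @ y).item() - CF
    SSregress = (B.T @ X.T @ y).item() - CF
    SSerror = SStotal - SSregress

    Rsquared = SSregress / SStotal
    dfRegress = len(B) - 1                      # predictors, excluding the constant
    dfError = (N - 1) - dfRegress

    F = (SSregress / dfRegress) / (SSerror / dfError)
    prob = f_dist.sf(F, dfRegress, dfError)     # sf == 1 - cdf

    return B, residuals, (Rsquared, F, prob)

Called on the same X and y as in the demo below, this should return the
same stats tuple up to floating-point error.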


    

fhx = file('independent.dat', 'w')
fhy = file('dependent.dat', 'w')

# X is number observations by (number vars + constant)
X = randn( 200,6 )    

# The last column should be ones for the constant term
X[:,5] = ones((200,), typecode=Float)

y = randn( 200 )   # the dependent var

# save the data to files for comparison with MATLAB's regress
for row in X: print >>fhx, ' '.join([str(val) for val in row])
print >>fhy, '\n'.join([str(val) for val in y])
fhx.close()
fhy.close()

# c are the coefficients
# ss is the sum of squared residuals
# sv are the singular values of X
c, ss, rank, sv = linear_least_squares(X,y)

# below is just a demo to show how the results can be used to generate
# a prediction
p =  matrixmultiply(X,c)
ss1 = sum( (y-p)**2) # should equal 'ss'


B, resid, stats = regress(X,y)
print stats  # validated against multiple_regress_demo.m
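
As an optional cross-check on the stats tuple, here is a minimal sketch
using statsmodels' OLS, assuming statsmodels is installed (it is not
part of the setup used above) and the same X/y layout as in the demo:

import numpy as np
import statsmodels.api as sm

X = np.random.randn(200, 6)
X[:, 5] = 1.0                                   # constant column, as above
y = np.random.randn(200)

res = sm.OLS(y, X).fit()
print(res.params)                               # corresponds to B
print(res.rsquared, res.fvalue, res.f_pvalue)   # corresponds to stats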


