[SciPy-user] Multiple regression
John Hunter
jdhunter at ace.bsd.uchicago.edu
Thu Apr 22 12:01:26 CDT 2004
>>>>> "Jean-Baptiste" == Jean-Baptiste Cazier <jcazier at decode.is> writes:
Jean-Baptiste> Sæl ! I saw in some post in comp.lang.python that
Jean-Baptiste> SciPy is supposed to have multiple regression tools
Jean-Baptiste> included. However, when I look at the stat_test I see
Jean-Baptiste> that this is not the case.
Jean-Baptiste> Am I looking at the wrong place ?
Jean-Baptiste> Could anybody please point me to a multiple
Jean-Baptiste> regression module in Python?
Here is the algorithm I use, which I implemented from the discussion
at mathworld. I validated this against matlab's regress function.
# fcdf is in different places in different scipy versions
try: from scipy.stats import fcdf
except ImportError:
from scipy.stats import f
fcdf = f.cdf
from Numeric import matrixmultiply, ones, Float, divide, transpose, reshape,\
array, sqrt
from LinearAlgebra import linear_least_squares, inverse
from MLab import randn, mean, sum, cov
from Matrix import Matrix
def regress(X,y):
    """
    Perform a multiple linear regression of y onto X.

    X is a (num observations) by (num variables + 1) array; X should
    contain a column of ones to generate the intercept.  y is a length
    num-observations array of the dependent variable.

    Returns B, residuals, stats where

      B         : the regression coefficients; Ypred = B.T * X.T
      residuals : array(y - Ypred.T)
      stats     : (Rsquared, F, p)
    """
    # regression coeffs are given by (Xt*X)^-1 * Xt * y
    N = X.shape[0]
    # Use reshape() to get a column vector instead of assigning y.shape
    # in place: the in-place assignment mutated the caller's array as a
    # side effect, leaving y with shape (N, 1) after the call.
    y = reshape(y, (N, 1))
    X = Matrix(X)
    Y = Matrix(y)
    Xt = X.T
    Xt_X_i = Matrix(inverse(Xt*X) )
    B = Xt_X_i*Xt*Y
    Ypred = B.T * Xt
    residuals = array(Y-Ypred.T)
    CF = N*mean(y)**2 # correction factor
    SStotal = float(Y.T*Y-CF)
    SSregress = float(B.T * Xt * Y - CF)
    SSerror = SStotal - SSregress
    Rsquared = SSregress/SStotal
    # degrees of freedom for the F statistic
    dfTotal = N-1
    dfRegress = len(B)-1
    dfError = dfTotal - dfRegress
    F = SSregress/dfRegress / (SSerror/dfError)
    # p-value: upper tail of the F distribution
    prob = 1-fcdf(F, dfRegress, dfError)
    stats = Rsquared, F, prob
    return B, residuals, stats
fhx = file('independent.dat', 'w')
fhy = file('dependent.dat', 'w')
# X is number observations by (number vars + constant)
X = randn( 200,6 )
# The last column should be ones for the constant term
X[:,5] = ones((200,), typecode=Float)
y = randn( 200 ) # the dependent var
#save the data to file for comparison with matlab's regress
for row in X: print >>fhx, ' '.join([str(val) for val in row])
print >>fhy, '\n'.join([str(val) for val in y])
fhx.close()
fhy.close()
# c are the coeffiecients
# ss is the sum of squared residuals
# sv are the singular values of X
c, ss, rank, sv = linear_least_squares(X,y)
# below is just a demo to show how the results can be used to generate
# a prediction
p = matrixmultiply(X,c)
ss1 = sum( (y-p)**2) # should equal 'ss'
B, resid, stats = regress(X,y)
print stats # validated against multiple_regress_demo.m
More information about the SciPy-user
mailing list