# [SciPy-user] Multiple regression

John Hunter jdhunter at ace.bsd.uchicago.edu
Thu Apr 22 12:01:26 CDT 2004

>>>>> "Jean-Baptiste" == Jean-Baptiste Cazier <jcazier at decode.is> writes:

Jean-Baptiste> Sæl !  I saw in a post on comp.lang.python that
Jean-Baptiste> Scipy is supposed to have multiple regression tools
Jean-Baptiste> included.  However, when I look at the stat_test I
Jean-Baptiste> see that this is not the case.

Jean-Baptiste> Am I looking at the wrong place ?

Jean-Baptiste> Could anybody please point me to a multiple
Jean-Baptiste> regression module in python ?

Here is the algorithm I use, which I implemented from the discussion
at mathworld.  I validated this against matlab's regress function.

# fcdf has moved between scipy versions: old releases exported a bare
# fcdf() function, newer ones expose the F distribution object instead.
# Try the old name first and fall back to the distribution's cdf method.
try:
    from scipy.stats import fcdf
except ImportError:
    from scipy.stats import f
    fcdf = f.cdf

from Numeric import matrixmultiply, ones, Float, divide, transpose, reshape,\
array, sqrt
from LinearAlgebra import linear_least_squares, inverse
from MLab import randn, mean, sum, cov

from Matrix import Matrix

def regress(X, y):
    """
    Perform a multiple linear regression of y onto X.

    Parameters
    ----------
    X : array, shape (N, k)
        N observations by k regressors.  X should contain a column of
        ones to generate the intercept term.
    y : array-like, shape (N,) or (N, 1)
        The dependent variable (the original docstring mislabelled this
        as "independent").  The input is copied; it is NOT reshaped in
        place, unlike the original, which clobbered the caller's
        ``y.shape``.

    Returns
    -------
    B : array, shape (k, 1)
        Regression coefficients, so that Ypred = dot(X, B).
    residuals : array, shape (N, 1)
        y - Ypred.
    stats : tuple
        (Rsquared, F, p) where F is the overall regression F statistic
        with (k-1, N-k) degrees of freedom and p its upper-tail
        probability.

    Notes
    -----
    Coefficients are the normal-equations solution (X'X)^-1 X'y,
    following the MathWorld least-squares discussion; validated against
    matlab's regress().  Rewritten from the defunct Numeric /
    LinearAlgebra / Matrix modules onto numpy + scipy.
    """
    # Local imports keep this function self-contained and avoid touching
    # the legacy Numeric imports at the top of the file.
    import numpy as np
    from scipy.stats import f as f_dist

    X = np.asarray(X, dtype=float)
    # Copy + reshape instead of the original in-place ``y.shape = N, 1``.
    Y = np.asarray(y, dtype=float).reshape(-1, 1)
    N = X.shape[0]

    # Regression coefficients: B = (X'X)^-1 X'y.
    Xt = X.T
    XtX_inv = np.linalg.inv(np.dot(Xt, X))
    B = np.dot(np.dot(XtX_inv, Xt), Y)

    Ypred = np.dot(X, B)
    residuals = np.asarray(Y - Ypred)
    CF = N * Y.mean() ** 2            # correction factor

    SStotal = float(np.dot(Y.T, Y) - CF)
    SSregress = float(np.dot(np.dot(B.T, Xt), Y) - CF)
    SSerror = SStotal - SSregress

    Rsquared = SSregress / SStotal

    dfTotal = N - 1
    dfRegress = len(B) - 1            # k-1: intercept does not count
    dfError = dfTotal - dfRegress

    F = (SSregress / dfRegress) / (SSerror / dfError)
    prob = 1 - f_dist.cdf(F, dfRegress, dfError)

    stats = Rsquared, F, prob
    return B, residuals, stats

# Demo: fit 200 random observations and save the data so the result can
# be cross-checked against matlab's regress() (multiple_regress_demo.m).
fhx = open('independent.dat', 'w')
fhy = open('dependent.dat', 'w')

# Design matrix: 200 observations by (5 variables + constant column).
X = randn(200, 6)

# The final column is all ones, producing the intercept term.
X[:, 5] = ones((200,), typecode=Float)

# Dependent variable.
y = randn(200)

# Write both data sets out, one observation per line.
for row in X:
    fhx.write(' '.join([str(val) for val in row]) + '\n')
fhy.write('\n'.join([str(val) for val in y]) + '\n')
fhx.close()
fhy.close()

# c: the least-squares coefficients
# ss: the sum of squared residuals
# sv: the singular values of X
c, ss, rank, sv = linear_least_squares(X, y)

# Show how the coefficients yield a prediction; the residual sum of
# squares recomputed from it should equal 'ss'.
p = matrixmultiply(X, c)
ss1 = sum((y - p) ** 2)

B, resid, stats = regress(X, y)
print(stats)  # validated against multiple_regress_demo.m