In diesem Skript geht es um lineare Regressionen mit mehreren Prädiktoren. Angelehnt ist das Skript an Ledolter, Johannes. 2013. Business analytics and data mining with R. Hoboken, New Jersey: Wiley.
Die Daten sind dieselben wie schon in "Know Your Data".
# Read the survey extract (recs2009_public.csv) and the accompanying layout
# file (public_layout.csv). `meta` is loaded for reference only; nothing in
# the visible part of this script reads it.
df <- read.csv("recs2009_public.csv")
meta <- read.csv("public_layout.csv")

# Modelling frame: eleven candidate predictors followed by the response KWH
# in the last column.
dfm <- df[
  ,
  c(
    "AIA_Zone", "YEARMADE", "TOTROOMS", "AGERFRI1", "WASHLOAD", "TVCOLOR",
    "NUMPC", "EQUIPAGE", "HHSEX", "HHAGE", "MONEYPY", "KWH"
  ),
  drop = FALSE
]
# Example plots: response KWH against two individual predictors
# (TOTROOMS, then WASHLOAD).
plot(KWH ~ TOTROOMS, data = dfm)
plot(KWH ~ WASHLOAD, data = dfm)
# Full model: regress KWH on every other column of dfm.
fit <- lm(KWH ~ ., data = dfm)
summary(fit)

# Pairwise correlation matrix of all columns (original note: the predictors
# are correlated with each other).
cor(dfm)
# Optional scatterplot matrix, left disabled:
# pairs(dfm)
# Search over predictor subsets to compare candidate models.
# Requires the "leaps" package.
library(leaps)

# Split the modelling frame into predictors and response by column name, so
# the split stays correct if the column order of dfm ever changes.
X <- dfm[, names(dfm) != "KWH", drop = FALSE]
y <- dfm[["KWH"]]

# nbest = 2 records the two best subsets of each size; nvmax = ncol(X) allows
# subsets up to the full set of predictors.
out <- summary(regsubsets(X, y, nbest = 2, nvmax = ncol(X)))

# One row per recorded model: the term-inclusion indicators from out$which,
# followed by explicitly named fit statistics (unnamed columns would print
# without headers).
tab <- cbind(out$which, rsq = out$rsq, adjr2 = out$adjr2, cp = out$cp)
tab
# cp = Mallows' Cp statistic
# "If a model with p<k regressors is already adequate, its values of
# the Cp-statistic should be about p+1. It is larger (usually quite a
# bit larger) than p+1 if a model with p regressors cannot explain the
# relationship." (Ledolter 2013: 42)
# Reduced model: keep a subset of the predictors plus the response KWH.
# Columns are selected by name instead of by position so the choice is
# readable and does not silently change if dfm's column order changes.
# NOTE(review): presumably these are the predictors suggested by the subset
# search; confirm against the printed table.
dfm.select <- dfm[
  ,
  c(
    "AIA_Zone", "YEARMADE", "TOTROOMS", "WASHLOAD",
    "TVCOLOR", "NUMPC", "HHAGE", "KWH"
  ),
  drop = FALSE
]
fit2 <- lm(KWH ~ ., data = dfm.select)
summary(fit2)

# Show the lm diagnostic plots on one 2x2 page, then restore the previous
# graphics parameters so later plots are not forced into the 2x2 layout.
old_par <- par(mfrow = c(2, 2))
plot(fit2)
par(old_par)
Kommentare
Kommentar veröffentlichen