Using
the #CARET package by Max Kuhn, we find out that #randomForest is better than the mean.
During
the annual meeting of the DVPW #methods section, Andreas Graefe argued, the
mean would beat more complex ensemble methods in forecasting. At least, when it
comes to a classical prediction scenario, this is not true.
I
created a simulated data set, defined different models (linear and tree-based)
relying on different sets of information. Then I averaged the results. Against
this “mean-ensemble”, I send a plain and simple #randomForest in the beauty
contest. In addition, I build another model that takes the mean of all models
including the random forest. Finally, I build a model that takes all the
results of all other models as predictors but combines them again with a random
forest.
The
whole procedure was repeated 100 times.
As
we can see in the graphic, the simple random forest beats the mean in every
error type.
For the mean absolute percentage error (MAPE), the linear
model beats the random forest, probably, because there is one very good but
random fit in the sets.
In addition, we can take a look at how often each model had
the lowest MAPE:
Here the meta-random forest is best, followed by the simple
random forest. The mean-models are never(!) the ones with the lowest mean.
Sorry Andreas, the mean is not very good.
But! This is not a forecasting test. In this scenario, we are in a predictive analytics environment, where we have 800 observations to train our models. And we are averaging the errors over 200 predictions for each model. This should be kept in mind.
And here comes the code:
Wed Jun 10 11:23:46 2015
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tree)
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## a few functions:
## mean squared (prediction) error
MSE <- function(y,yhat)
{
mean((y-yhat)**2)
}
## mean absolute (prediction) error
MAE <- function(y,yhat)
{
mean(abs(y-yhat))
}
## mean absolute percentage (prediction) error
MAPE <- function(y,yhat,percent=TRUE)
{
if(percent){
100*mean(abs( (y-yhat)/y ))
} else {
mean(abs( (y-yhat)/y ))
}
}
# initialisze an empty data-frame
dfRes <- data.frame(MSE1=numeric(0),MSE2=numeric(0),MSE3=numeric(0),MSE4=numeric(0),MSE5=numeric(0),MSE6=numeric(0),
MSERF=numeric(0),MSEM=numeric(0),MSEMRF=numeric(0), MSERFm=numeric(0),MSERFrfm=numeric(0),
MAE1=numeric(0),MAE2=numeric(0),MAE3=numeric(0),MAE4=numeric(0),MAE5=numeric(0),MAE6=numeric(0),
MAERF=numeric(0),MAEM=numeric(0),MAEMRF=numeric(0), MAERFm=numeric(0),MAERFrfm=numeric(0),
MAPE1=numeric(0),MAPE2=numeric(0),MAPE3=numeric(0),MAPE4=numeric(0),MAPE5=numeric(0),MAPE6=numeric(0),
MAPERF=numeric(0),MAPEM=numeric(0),MAPEMRF=numeric(0), MAPERFm=numeric(0),MAPERFrfm=numeric(0))
colnames(dfRes) <- c("MSE1","MSE2","MSE3","MSE4","MSE5","MSE6","MSERF", "MSEM","MSEMRF", "MSERFm", "MSERFrfm",
"MAE1","MAE2","MAE3","MAE4","MAE5","MAE6","MAERF", "MAEM","MAEMRF", "MAERFm", "MAERFrfm",
"MAPE1","MAPE2","MAPE3","MAPE4","MAPE5","MAPE6","MAPERF", "MAPEM","MAPEMRF", "MAPERFm", "MAPERFrfm")
# Start the simulation: Everything runs in a for-loop, 100 times
for(i in 1:100){
# SLC14_1 is a function to simulate data sets. Here, we take
# 5 variables with pure noise, 5 correlating variables and add
# autoregression to the correlation
df <- SLC14_1(1000,noiseVars = 5, corrVars = 5, corrType = "AR1")
# split the data in 800 training and 200 test observations
sel <- sample(1000, 800)
tr <- df[sel,]
te <- df[-sel,]
# find the best single predictor
best <- which.max(abs(sapply(tr[,-31], cor, tr$y)))
#model 1: regression, 1 good variable (not the best), randomly selected
vars <- c(1:20)[-best]
varSel <- sample(vars,1)
fit1Tr <- tr[,c(31,varSel)]
colnames(fit1Tr)[2]<- "x"
fit1 <- lm(y~x, data=fit1Tr)
## extract point forecasts
fit1Te <- te[,c(31,varSel)]
colnames(fit1Te)[2]<- "x"
yHat1 <- predict(fit1, newdata=fit1Te)
MSE1 <- MSE(te$y,yHat1)
MAE1 <- MAE(te$y,yHat1)
MAPE1 <- MAPE(te$y,yHat1)
#model 2: regression, "take the best"
fit2Tr <- tr[,c(31,best)]
colnames(fit2Tr)[2]<- "x"
fit2 <- lm(y~x, data=fit2Tr)
## extract point forecasts
fit2Te <- te[,c(31,best)]
colnames(fit2Te)[2]<- "x"
yHat2 <- predict(fit2, newdata=fit2Te)
MSE2 <- MSE(te$y,yHat2)
MAE2 <- MAE(te$y,yHat2)
MAPE2 <- MAPE(te$y,yHat2)
#model 3: regression, all
fit3 <- lm(y~. , data=tr)
## extract point forecasts
yHat3 <- predict(fit3, newdata=te)
MSE3 <- MSE(te$y,yHat3)
MAE3 <- MAE(te$y,yHat3)
MAPE3 <- MAPE(te$y,yHat3)
#model 4: tree, 1 good variable, randomly selected
fit4Tr <- tr[,c(31,varSel)]
colnames(fit4Tr)[2]<- "x"
fit4 <- tree(y~x, data=fit1Tr)
## extract point forecasts
fit4Te <- te[,c(31,varSel)]
colnames(fit4Te)[2]<- "x"
yHat4 <- predict(fit4, newdata=fit1Te)
MSE4 <- MSE(te$y,yHat4)
MAE4 <- MAE(te$y,yHat4)
MAPE4 <- MAPE(te$y,yHat4)
#model 5: tree, "take the best"
fit5Tr <- tr[,c(31,best)]
colnames(fit5Tr)[2]<- "x"
fit5 <- tree(y~x, data=fit5Tr)
## extract point forecasts
fit5Te <- te[,c(31,best)]
colnames(fit5Te)[2]<- "x"
yHat5 <- predict(fit5, newdata=fit5Te)
MSE5 <- MSE(te$y,yHat5)
MAE5 <- MAE(te$y,yHat5)
MAPE5 <- MAPE(te$y,yHat5)
#model 6: tree, all
fit6 <- tree(y~. , data=tr)
## extract point forecasts
yHat6 <- predict(fit6, newdata=te)
MSE6 <- MSE(te$y,yHat6)
MAE6 <- MAE(te$y,yHat6)
MAPE6 <- MAPE(te$y,yHat6)
### simple mean
yHatMean <- apply(cbind(yHat1,yHat2,yHat3,yHat4,yHat5,yHat6),1, mean)
MSEM <- MSE(te$y,yHatMean)
MAEM <- MAE(te$y,yHatMean)
MAPEM <- MAPE(te$y,yHatMean)
## random Forest as Model
fitRF <- randomForest(y~., data=tr, ntree=200)
yHatRF <- predict(fitRF, newdata=te)
MSERF <- MSE(te$y,yHatRF)
MAERF <- MAE(te$y,yHatRF)
MAPERF <- MAPE(te$y,yHatRF)
### mean, including RF
yHatMeanRf <- apply(cbind(yHat1,yHat2,yHat3,yHat4,yHat5,yHat6, yHatRF),1, mean)
MSEMRF <- MSE(te$y,yHatMeanRf)
MAEMRF <- MAE(te$y,yHatMeanRf)
MAPEMRF <- MAPE(te$y,yHatMeanRf)
# rf as meta ensemble
rfTr <-cbind.data.frame(tr$y, predict(fit1), predict(fit2), predict(fit3), predict(fit4), predict(fit5), predict(fit6))
colnames(rfTr) <- c("y", "pred1", "pred2", "pred3", "pred4", "pred5", "pred6")
fitRFmeta <- randomForest(y~., data=rfTr, ntree=200)
rfTe <- cbind.data.frame(te$y, yHat1, yHat2, yHat3, yHat4, yHat5, yHat6)
colnames(rfTe) <- c("y", "pred1", "pred2", "pred3", "pred4", "pred5", "pred6")
yHatRFmeta <- predict(fitRFmeta, newdata=rfTe)
MSERFm <- MSE(te$y,yHatRFmeta)
MAERFm <- MAE(te$y,yHatRFmeta)
MAPERFm <- MAPE(te$y,yHatRFmeta)
# rf as meta ensemble including RF
rfTr <-cbind.data.frame(tr$y, predict(fit1), predict(fit2), predict(fit3), predict(fit4), predict(fit5), predict(fit6), predict(fitRF))
colnames(rfTr) <- c("y", "pred1", "pred2", "pred3", "pred4", "pred5", "pred6", "predRF")
fitRFmeta2 <- randomForest(y~., data=rfTr, ntree=200)
rfTe <- cbind.data.frame(te$y, yHat1, yHat2, yHat3, yHat4, yHat5, yHat6, yHatRF)
colnames(rfTe) <- c("y", "pred1", "pred2", "pred3", "pred4", "pred5", "pred6", "predRF")
yHatRFmeta2 <- predict(fitRFmeta2, newdata=rfTe)
MSERFrfm <- MSE(te$y,yHatRFmeta2)
MAERFrfm <- MAE(te$y,yHatRFmeta2)
MAPERFrfm <- MAPE(te$y,yHatRFmeta2)
dfRes <- rbind(dfRes, c(MSE1,MSE2,MSE3,MSE4,MSE5,MSE6,
MSERF, MSEM,MSEMRF, MSERFm, MSERFrfm,
MAE1,MAE2,MAE3,MAE4,MAE5,MAE6,
MAERF, MAEM,MAEMRF, MAERFm, MAERFrfm,
MAPE1,MAPE2,MAPE3,MAPE4,MAPE5,MAPE6,
MAPERF, MAPEM,MAPEMRF, MAPERFm, MAPERFrfm))
colnames(dfRes) <- c("MSE1","MSE2","MSE3","MSE4","MSE5","MSE6",
"MSERF", "MSEM","MSEMRF", "MSERFm", "MSERFrfm",
"MAE1","MAE2","MAE3","MAE4","MAE5","MAE6",
"MAERF", "MAEM","MAEMRF", "MAERFm", "MAERFrfm",
"MAPE1","MAPE2","MAPE3","MAPE4","MAPE5","MAPE6",
"MAPERF", "MAPEM","MAPEMRF", "MAPERFm", "MAPERFrfm")
}
summary(dfRes)
## MSE1 MSE2 MSE3 MSE4
## Min. :292.7 Min. :274.1 Min. :252.0 Min. :240.4
## 1st Qu.:386.8 1st Qu.:359.3 1st Qu.:320.5 1st Qu.:373.8
## Median :424.3 Median :386.8 Median :349.0 Median :416.0
## Mean :422.7 Mean :391.7 Mean :355.0 Mean :416.2
## 3rd Qu.:452.2 3rd Qu.:429.5 3rd Qu.:387.2 3rd Qu.:453.5
## Max. :605.1 Max. :545.0 Max. :518.6 Max. :593.8
## MSE5 MSE6 MSERF MSEM
## Min. :289.7 Min. :160.3 Min. :154.6 Min. :224.5
## 1st Qu.:364.2 1st Qu.:241.3 1st Qu.:193.5 1st Qu.:299.4
## Median :400.9 Median :260.6 Median :218.2 Median :330.4
## Mean :402.1 Mean :267.4 Mean :221.4 Mean :329.3
## 3rd Qu.:436.2 3rd Qu.:294.4 3rd Qu.:242.3 3rd Qu.:357.8
## Max. :563.6 Max. :372.8 Max. :325.0 Max. :472.9
## MSEMRF MSERFm MSERFrfm MAE1
## Min. :213.0 Min. :156.2 Min. :154.1 Min. :12.99
## 1st Qu.:279.3 1st Qu.:224.4 1st Qu.:193.7 1st Qu.:14.70
## Median :309.3 Median :237.8 Median :219.0 Median :15.49
## Mean :307.4 Mean :244.7 Mean :221.6 Mean :15.49
## 3rd Qu.:333.4 3rd Qu.:267.7 3rd Qu.:239.5 3rd Qu.:16.29
## Max. :443.1 Max. :345.1 Max. :303.4 Max. :18.10
## MAE2 MAE3 MAE4 MAE5
## Min. :12.37 Min. :12.07 Min. :11.81 Min. :12.69
## 1st Qu.:14.16 1st Qu.:13.15 1st Qu.:14.72 1st Qu.:14.39
## Median :14.83 Median :13.85 Median :15.45 Median :14.99
## Mean :14.78 Mean :13.86 Mean :15.37 Mean :15.01
## 3rd Qu.:15.52 3rd Qu.:14.38 3rd Qu.:16.20 3rd Qu.:15.64
## Max. :16.87 Max. :16.64 Max. :18.02 Max. :17.12
## MAE6 MAERF MAEM MAEMRF
## Min. : 9.656 Min. : 9.211 Min. :11.23 Min. :10.93
## 1st Qu.:11.862 1st Qu.:10.502 1st Qu.:12.97 1st Qu.:12.57
## Median :12.253 Median :11.120 Median :13.58 Median :13.12
## Mean :12.397 Mean :11.032 Mean :13.58 Mean :13.11
## 3rd Qu.:13.010 3rd Qu.:11.517 3rd Qu.:14.27 3rd Qu.:13.81
## Max. :14.420 Max. :12.937 Max. :15.63 Max. :15.07
## MAERFm MAERFrfm MAPE1 MAPE2
## Min. : 9.666 Min. : 9.423 Min. : 154.6 Min. : 124.3
## 1st Qu.:11.272 1st Qu.:10.609 1st Qu.: 257.2 1st Qu.: 221.2
## Median :11.789 Median :11.107 Median : 335.0 Median : 302.6
## Mean :11.770 Mean :11.151 Mean : 440.6 Mean : 411.1
## 3rd Qu.:12.250 3rd Qu.:11.737 3rd Qu.: 495.8 3rd Qu.: 410.2
## Max. :14.140 Max. :13.096 Max. :2246.0 Max. :2492.2
## MAPE3 MAPE4 MAPE5 MAPE6
## Min. : 131.0 Min. : 142.5 Min. : 109.2 Min. : 95.31
## 1st Qu.: 203.5 1st Qu.: 259.4 1st Qu.: 227.3 1st Qu.: 182.55
## Median : 267.0 Median : 331.9 Median : 313.2 Median : 242.61
## Mean : 367.8 Mean : 427.4 Mean : 417.1 Mean : 292.30
## 3rd Qu.: 371.5 3rd Qu.: 484.0 3rd Qu.: 434.1 3rd Qu.: 333.36
## Max. :2353.3 Max. :2258.4 Max. :2239.4 Max. :1951.07
## MAPERF MAPEM MAPEMRF MAPERFm
## Min. : 104.1 Min. : 125.0 Min. : 120.9 Min. : 116.9
## 1st Qu.: 156.4 1st Qu.: 213.8 1st Qu.: 203.8 1st Qu.: 175.6
## Median : 207.2 Median : 289.1 Median : 278.8 Median : 232.1
## Mean : 280.5 Mean : 377.4 Mean : 362.1 Mean : 291.1
## 3rd Qu.: 295.5 3rd Qu.: 392.8 3rd Qu.: 376.4 3rd Qu.: 301.4
## Max. :1685.3 Max. :2243.8 Max. :2162.6 Max. :2056.1
## MAPERFrfm
## Min. : 111.6
## 1st Qu.: 160.7
## Median : 219.5
## Mean : 275.8
## 3rd Qu.: 289.3
## Max. :1771.0
# plot the mean of the errortypes
#png("BeautyContestErrors.png", type="cairo", res=300, unit="cm", width=30, height=30)
par(mfrow=c(3,1))
barplot(sapply(dfRes, mean)[1:11], main="Mean Squared Error",
col = c(rep("grey", 6), "darkorange", "darkblue", rep("grey", 3)))
barplot(sapply(dfRes, mean)[12:22], main="Mean Absolute Error",
col = c(rep("grey", 6), "darkorange", "darkblue", rep("grey", 3)))
barplot(sapply(dfRes, mean)[23:33], main="Mean Absolute Percentage Error",
col = c(rep("grey", 6), "darkorange", "darkblue", rep("grey", 3)))
#dev.off()
dfRes$minMape <- apply(dfRes[,23:33],1,function(x) colnames(dfRes[,23:33])[which.min(x)])
dfRes$minMape <- factor(dfRes$minMape, levels=c(colnames(dfRes)[23:33]))
# How often has which model the lowest MAPE?
#png("BeautyContestMape.png", type="cairo", res=300, unit="cm", width=35, height=10)
plot(dfRes$minMape)
#dev.off()
Kommentare
Kommentar veröffentlichen