# Below is the R code for the random forest prediction (the group variable was added here):
# Load the two input files: `df` is used below to train the classifier,
# `WC` holds the matches that are predicted later in the script.
df <- read.csv("WorldCup2014Test.csv")
WC <- read.csv("WorldCup2014.csv")
# I am setting all NAs to 0. This might be a bad idea, but it works.
df[is.na(df)] <- 0
# We want to run a random forest as classifier.
# First, we code a response variable Y from the difference between
# column 4 and column 3: "w" (win) if positive, "d" (draw) if zero,
# "l" (loss) otherwise.
df$Y <- ifelse(df[, 4] - df[, 3] > 0, "w",
               ifelse(df[, 4] - df[, 3] == 0, "d", "l"))
df$Y <- as.factor(df$Y)
library(randomForest)
# We want to auto tune the random forest: this requires the predictors
# to be in a matrix.
y <- as.matrix(df$Y)
# Drop columns 1-6 and column 76 from the predictors.
# NOTE(review): column 76 is presumably the Y column appended above
# (i.e. the CSV has 75 columns) -- confirm against the data file.
x <- as.matrix(df[, -c(1, 2, 3, 4, 5, 6, 76)])
# The response is passed as a factor, which is what makes randomForest fit
# a classifier. The former `type="pob"` argument was a typo that neither
# tuneRF nor randomForest defines, so it was silently ignored; it is removed.
# doBest = TRUE makes tuneRF return the forest fitted with the best mtry.
rf.tune <- tuneRF(x = x, y = df$Y, doBest = TRUE)
# The random forest includes a confusion matrix
rf.tune$confusion
# The model is poor in draws, but quite good in predicting wins
# Let's predict the World Cup!
# As for the training data, every NA is replaced by 0.
WC[is.na(WC)] <- 0
# Same column drop as for the training predictors, plus column 77:
# columns 76 and 77 are kept out of the predictors and re-attached to the
# result table below.
xte <- as.matrix(WC[, -c(1, 2, 3, 4, 5, 6, 76, 77)])
# Predicted class for every row of WC
pred.rf <- predict(rf.tune, xte)
# We can also get the probabilities (one column per class of Y)...
WC.rf <- predict(rf.tune, xte, type = "prob")
# Now we can build a new data frame with teams, groups and probabilities:
# columns 1-2 and 76-77 of WC, followed by the class-probability columns.
WCres.rf <- cbind(WC[, c(1, 2, 76, 77)], WC.rf)
# View() opens a data viewer and is only useful in an interactive session,
# so it is skipped when the script runs non-interactively (e.g. Rscript).
if (interactive()) {
  View(WCres.rf)
}
# Transform the teams to factors
WCres.rf[, 1] <- as.factor(WCres.rf[, 1])
WCres.rf[, 2] <- as.factor(WCres.rf[, 2])
# In the data home or away matters. But this is not meaningful in the WC.
# The next data frame tries to work around this problem by stacking two
# views of every match: one with column 3 as group and column 7 as
# probability, one with column 4 as group and column 6 as probability.
# NOTE(review): columns 5-7 are the class probabilities; with all three
# outcomes present in training they should be ordered d, l, w, so column 7
# would be P("w") and column 6 P("l") -- confirm.
WCres1 <- WCres.rf[, c(1, 2, 3, 7)]
colnames(WCres1) <- c("Team1", "Team2", "Group", "Prob")
WCres2 <- WCres.rf[, c(1, 2, 4, 6)]
colnames(WCres2) <- c("Team1", "Team2", "Group", "Prob")
WCres <- rbind(WCres1, WCres2)
write.csv(WCres, "WCWinningProbs.csv", row.names = FALSE)
# WCres has no column called "Team": `WCres$Team` partially matches both
# Team1 and Team2 and therefore yields NULL, which made the plot below fail.
# Add an explicit Team column -- the first team for the column-7 rows, the
# second team for the column-6 rows, mirroring the group-phase code further
# down. It is added after write.csv() so the CSV layout stays unchanged.
WCres$Team <- factor(c(as.character(WCres1$Team1),
                       as.character(WCres2$Team2)))
# And this gives us a plot of the winning probabilities for all teams!
png("WCrfBP.png", 1400, 600)
plot(WCres$Prob ~ WCres$Team)
dev.off()
# And if we take only games from teams out of the same group
# we have the winning probabilities for the group phase.
# Columns 3 and 4 of WCres.rf are the two group columns (taken from
# WC[, 76:77]); they are compared as character so that factor columns with
# different level sets do not raise an error. which() drops rows where the
# comparison is NA, as subset() would.
same_group <- as.character(WCres.rf[[3]]) == as.character(WCres.rf[[4]])
WCresGroups <- WCres.rf[which(same_group), ]
# First team with its group and the column-7 probability ...
WCres1 <- WCresGroups[, c(1, 3, 7)]
colnames(WCres1) <- c("Team", "Group", "Prob")
# ... and second team with its group and the column-6 probability.
# (The original referenced the undefined object `WCresGroup` here.)
WCres2 <- WCresGroups[, c(2, 4, 6)]
colnames(WCres2) <- c("Team", "Group", "Prob")
WCres <- rbind(WCres1, WCres2)
png("WCrfGroupBP.png", 1400, 600)
plot(WCres$Prob ~ WCres$Team)
dev.off()
Kommentare
Kommentar veröffentlichen