Dies ist ein kleines R-Skript, das ich im Kurs "Data Mining für SozialwissenschaftlerInnen" nutze. Angelehnt ist es an Ledolter, Johannes. 2013. Business analytics and data mining with R. Hoboken, New Jersey: Wiley.
Ich verwende aber den Residential Energy Consumption Survey (RECS).
# Load the RECS 2009 survey data and its variable layout from local CSV files.
# Without these reads `data` resolves to the utils::data() function and `meta`
# is undefined, so every call below would fail.
# The files are expected to be local copies of
#   http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv
#   http://www.eia.gov/consumption/residential/data/2009/csv/public_layout.csv
data <- read.csv("recs2009_public.csv")
meta <- read.csv("public_layout.csv")

# Browse the full table, then only its top-left 10 x 10 corner.
View(data)
View(data[1:10, 1:10])

# Inspect ten randomly drawn rows.
sa <- sample(seq_len(nrow(data)), 10)
View(data[sa, ])

# Dimensions, variable names and per-column summaries.
dim(data)
colnames(data)
summary(data)

# Browse the variable layout table.
View(meta)
# Summarise KWH overall and separately for each value of HHSEX.
summary(data$KWH)
by(data$KWH, data$HHSEX, summary)

# Frequency table of HHSEX.
sex.tbl <- table(data$HHSEX)
sex.tbl

library(lattice)

# Plot the table built above; the previous code passed the undefined name
# `sex`, which stopped the script with "object 'sex' not found".
barchart(sex.tbl, ylab = "Sex of Householder", col = "black")

# Cross-tabulate HHAGE by HHSEX.
age.sex.tbl <- table(Age = data$HHAGE, Sex = data$HHSEX)
age.sex.tbl

# Restart the graphics device. dev.off() raises an error when only the null
# device (number 1) is active, so close a device only if one is open.
if (dev.cur() > 1L) {
  dev.off()
}
trellis.device()

barchart(age.sex.tbl)
# NOTE(review): with groups = FALSE the panels appear to be split by Sex and
# the x axis to show Age -- confirm that the xlab below is the intended label.
barchart(age.sex.tbl, horizontal = FALSE, groups = FALSE,
         xlab = "Sex of Householder", col = "black")
# Convert YEARMADERANGE to a factor and replace its levels with readable
# labels; the labels are assigned in the order of the existing levels.
data$YEARMADERANGE <- as.factor(data$YEARMADERANGE)
ylevels <- c(
  "Before 1950",
  "1950 to 1959",
  "1960 to 1969",
  "1970 to 1979",
  "1980 to 1989",
  "1990 to 1999",
  "2000 to 2004",
  "2005 to 2009"
)
levels(data$YEARMADERANGE) <- ylevels

# KWH within each YEARMADERANGE panel: histogram of the raw values,
# histogram of the logged values, and a dot plot.
histogram(~ KWH | YEARMADERANGE, data = data, col = "black")
histogram(~ log(KWH) | YEARMADERANGE, data = data, col = "black")
dotplot(~ KWH | YEARMADERANGE, data = data, col = "black")

# KWH against YEARMADE: plain scatter plot and smoothed density scatter.
xyplot(data$KWH ~ data$YEARMADE, col = "black")
smoothScatter(data$KWH ~ data$YEARMADE)

# Box plots of KWH per YEARMADERANGE; the lattice version adds one panel
# per value of UR.
boxplot(data$KWH ~ data$YEARMADERANGE)
bwplot(KWH ~ YEARMADERANGE | factor(UR), data = data)
# Convert MONEYPY to a factor and label its levels with income brackets;
# the labels are assigned in the order of the existing levels.
data$MONEYPY <- as.factor(data$MONEYPY)
moneyLevels <- c(
  "Less than $2,500", "$2,500 to $4,999",
  "$5,000 to $7,499", "$7,500 to $9,999",
  "$10,000 to $14,999", "$15,000 to $19,999",
  "$20,000 to $24,999", "$25,000 to $29,999",
  "$30,000 to $34,999", "$35,000 to $39,999",
  "$40,000 to $44,999", "$45,000 to $49,999",
  "$50,000 to $54,999", "$55,000 to $59,999",
  "$60,000 to $64,999", "$65,000 to $69,999",
  "$70,000 to $74,999", "$75,000 to $79,999",
  "$80,000 to $84,999", "$85,000 to $89,999",
  "$90,000 to $94,999", "$95,000 to $99,999",
  "$100,000 to $119,999", "$120,000 or More"
)
levels(data$MONEYPY) <- moneyLevels

# Mean of log(KWH) for every combination of a YEARMADE interval (24 intervals
# produced by cut) and a MONEYPY level; NAs are dropped before averaging.
tbl <- with(
  data,
  tapply(
    log(KWH),
    INDEX = list(cut(YEARMADE, breaks = 24), MONEYPY),
    FUN = mean,
    na.rm = TRUE
  )
)

library(RColorBrewer)

# Palette generator that spline-interpolates the 11-colour "Spectral" scheme.
brewer.div <- colorRampPalette(brewer.pal(11, "Spectral"),
                               interpolate = "spline")

# Heat map of the table of means, with the x-axis labels rotated by 90 degrees.
levelplot(
  tbl,
  scales = list(x = list(rot = 90)),
  main = "Energy Consumption by Income and Building Year",
  col.regions = brewer.div(200)
)
Im Prinzip gilt also: Je mehr (!) Geld die Haushalte zur Verfügung haben und je neuer (!) die Häuser sind, umso mehr Energie wird verbraucht!
Ich verwende aber den Residential Energy Consumption Survey (RECS).
# One-time download of the RECS 2009 public file and a local cache of it:
# data = read.csv("http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv")
# write.csv(data, "recs2009_public.csv")
# Load the cached copy. This call used to be fused onto the end of the comment
# line above, so it never ran and `data` was never created.
data <- read.csv("recs2009_public.csv")

# Browse the full table, then only its top-left 10 x 10 corner.
View(data)
View(data[1:10, 1:10])

# Inspect ten randomly drawn rows.
sa <- sample(seq_len(nrow(data)), 10)
View(data[sa, ])

# Dimensions, variable names and per-column summaries.
dim(data)
colnames(data)
summary(data)

# One-time download of the variable layout and a local cache of it:
# meta = read.csv("http://www.eia.gov/consumption/residential/data/2009/csv/public_layout.csv")
# write.csv(meta, "public_layout.csv")
# Also have a look at the codebook:
# http://www.eia.gov/consumption/residential/data/2009/xls/recs2009_public_codebook.xlsx
# Load the cached layout. This call was likewise fused onto the codebook URL
# comment and never ran.
meta <- read.csv("public_layout.csv")
View(meta)
# Summarise KWH overall and separately for each value of HHSEX.
summary(data$KWH)
by(data$KWH, data$HHSEX, summary)

# Frequency table of HHSEX.
sex.tbl <- table(data$HHSEX)
sex.tbl

library(lattice)

# Plot the table built above; the previous code passed the undefined name
# `sex`, which stopped the script with "object 'sex' not found".
barchart(sex.tbl, ylab = "Sex of Householder", col = "black")

# Cross-tabulate HHAGE by HHSEX.
age.sex.tbl <- table(Age = data$HHAGE, Sex = data$HHSEX)
age.sex.tbl

# Restart the graphics device. dev.off() raises an error when only the null
# device (number 1) is active, so close a device only if one is open.
if (dev.cur() > 1L) {
  dev.off()
}
trellis.device()

barchart(age.sex.tbl)
# NOTE(review): with groups = FALSE the panels appear to be split by Sex and
# the x axis to show Age -- confirm that the xlab below is the intended label.
barchart(age.sex.tbl, horizontal = FALSE, groups = FALSE,
         xlab = "Sex of Householder", col = "black")
# Make YEARMADERANGE a factor, then overwrite its levels (in their existing
# order) with human-readable period labels.
data[["YEARMADERANGE"]] <- as.factor(data[["YEARMADERANGE"]])
ylevels <- c(
  "Before 1950", "1950 to 1959",
  "1960 to 1969", "1970 to 1979",
  "1980 to 1989", "1990 to 1999",
  "2000 to 2004", "2005 to 2009"
)
levels(data[["YEARMADERANGE"]]) <- ylevels

# One panel per YEARMADERANGE level: KWH histogram, log(KWH) histogram and a
# KWH dot plot.
histogram(~ KWH | YEARMADERANGE, data = data, col = "black")
histogram(~ log(KWH) | YEARMADERANGE, data = data, col = "black")
dotplot(~ KWH | YEARMADERANGE, data = data, col = "black")

# KWH versus YEARMADE, first as points, then as a smoothed density scatter.
xyplot(data$KWH ~ data$YEARMADE, col = "black")
smoothScatter(data$KWH ~ data$YEARMADE)

# KWH box plots by YEARMADERANGE (base graphics), then the lattice variant
# conditioned on UR.
boxplot(data$KWH ~ data$YEARMADERANGE)
bwplot(KWH ~ YEARMADERANGE | factor(UR), data = data)
# Make MONEYPY a factor, then overwrite its levels (in their existing order)
# with income-bracket labels.
data[["MONEYPY"]] <- as.factor(data[["MONEYPY"]])
moneyLevels <- c(
  "Less than $2,500",
  "$2,500 to $4,999",
  "$5,000 to $7,499",
  "$7,500 to $9,999",
  "$10,000 to $14,999",
  "$15,000 to $19,999",
  "$20,000 to $24,999",
  "$25,000 to $29,999",
  "$30,000 to $34,999",
  "$35,000 to $39,999",
  "$40,000 to $44,999",
  "$45,000 to $49,999",
  "$50,000 to $54,999",
  "$55,000 to $59,999",
  "$60,000 to $64,999",
  "$65,000 to $69,999",
  "$70,000 to $74,999",
  "$75,000 to $79,999",
  "$80,000 to $84,999",
  "$85,000 to $89,999",
  "$90,000 to $94,999",
  "$95,000 to $99,999",
  "$100,000 to $119,999",
  "$120,000 or More"
)
levels(data[["MONEYPY"]]) <- moneyLevels

# Table of mean log(KWH): rows are 24 intervals of YEARMADE produced by cut,
# columns are the MONEYPY levels. NAs are removed before taking the mean.
tbl <- tapply(
  log(data$KWH),
  INDEX = list(cut(data$YEARMADE, breaks = 24), data$MONEYPY),
  FUN = mean,
  na.rm = TRUE
)

library(RColorBrewer)

# Colour ramp built by spline interpolation over the 11 "Spectral" colours.
brewer.div <- colorRampPalette(
  brewer.pal(11, "Spectral"),
  interpolate = "spline"
)

# Draw the table as a level plot using 200 colours from the ramp; x-axis
# labels are rotated by 90 degrees.
levelplot(tbl,
          scales = list(x = list(rot = 90)),
          main = "Energy Consumption by Income and Building Year",
          col.regions = brewer.div(200))
Norman Simon Rodriguez hat auf seinem Blog einen interessanten kleinen Beitrag dazu, wie man den Mittelwert in einem Boxplot ergänzt:
http://mathsuser.blogspot.de/2014/05/20-including-mean-in-boxplots-using-r.html