library(RCurl)
library(reshape)
library(htmltab)
library(ggplot2)
library(stringr)
# Download the results page and parse the 7th HTML table found in it.
# NOTE(review): ssl.verifyPeer=FALSE presumably disables TLS certificate
# verification for this request - confirm that is still intended.
theurl <- getURL("http://en.wiki.x.io/wiki/Results_of_the_Republican_Party_presidential_primaries,_2016", ssl.verifyPeer=FALSE)
table <- htmltab(theurl, which=7)
# Keep rows 3-61 and every column except the last, then name the first two.
# seq_len() is used because `1:ncol(table)-1` parses as `(1:ncol(table)) - 1`,
# i.e. 0:(n-1); that selects the intended columns only because R silently
# drops a zero index.
df <- table[3:61, seq_len(ncol(table) - 1)]
names(df)[1:2] <- c("Date", "State")
# Extract delegate counts: convert every column from the 3rd onwards from
# text containing "<number> delegate" into a numeric count. Cells without
# such a phrase become 0.
for (i in seq_len(ncol(df))[-(1:2)]) {
  # First "<number> delegate" phrase in each cell (NA when absent).
  # str_extract() returns a character vector directly, so no list has to be
  # coerced back to text before the digits are pulled out.
  awarded <- str_extract(df[[i]], "[0-9,]+ delegate")
  digits <- str_extract(awarded, "[0-9,]+")
  # Strip thousands separators so "1,237" parses as 1237 instead of NA.
  counts <- as.numeric(str_replace_all(digits, ",", ""))
  # Zero-fill only this column; the Date and State columns are left alone.
  counts[is.na(counts)] <- 0
  df[[i]] <- counts
}
# Collapse columns 7-14 into a single "Others" column holding their row-wise
# total (NAs ignored), leaving a seven-column frame.
minor_totals <- rowSums(df[, 7:14], na.rm = TRUE)
df <- df[, 1:6]
df[[7]] <- minor_totals
names(df)[7] <- "Others"
# Extract state names: delete the first match of each pattern below from the
# State column (column 2), applying the patterns in the order listed.
contest_patterns <- c(
  "Binding[A-Za-z ]+",
  "Territorial[A-Za-z ]+",
  "State[A-Za-z ]+",
  "District Co[A-Za-z ]+",
  "Conv[A-Za-z ]+"
)
for (contest_pattern in contest_patterns) {
  df[[2]] <- str_replace(df[[2]], contest_pattern, "")
}
# Parse the contest dates. Only month and day are read from the table, and
# as.Date() substitutes the *current* year when the year is unspecified, so
# the 2016 election year is supplied explicitly to keep the dates correct
# whenever the script is run.
# NOTE(review): %b matches month abbreviations in the session locale -
# confirm the script is run under an English locale.
df$Date <- as.Date(paste("2016", df$Date), format = "%Y %b %d")
# Order contests chronologically (Date is already a Date; no re-parsing).
df <- df[order(df$Date), ]
# Remove contests that haven't happened: keep rows whose delegate columns
# (3-7) do not sum to zero.
df <- df[rowSums(df[, 3:7], na.rm = TRUE) != 0, ]
# Build two per-date tables with one row per distinct contest date:
#   df2 - delegates in each of columns 3-7 of df summed over that date
#   df3 - running total of those sums up to and including that date
df2 <- data.frame(Date = unique(df$Date))
df3 <- df2
for (candidate in names(df)[3:7]) {
  # seq_along() keeps this safe when there are no dates at all, and which()
  # drops NA comparisons so a missing date contributes an empty (zero) sum.
  daily <- vapply(
    seq_along(df2$Date),
    function(j) sum(df[[candidate]][which(df$Date == df2$Date[j])]),
    numeric(1)
  )
  df2[[candidate]] <- daily
  # cumsum() replaces re-summing the first j entries for every j.
  df3[[candidate]] <- cumsum(daily)
}
# Reshape both per-date tables to long form, keeping Date as the identifier,
# so that the candidate becomes a variable.
mdata2 <- melt(df3, id.vars = "Date")
mdata <- setNames(
  melt(df2, id.vars = "Date"),
  c("Date", "Candidate", "Delegates")
)
# Attach the running totals (third column of the melted df3) alongside the
# per-date counts.
mdata$Cumulative <- mdata2[, 3]
results <- mdata
# Nomination threshold, drawn below as a horizontal reference line at 1237.
nomination <- data.frame(yintercept = 1237, nomination = factor(1237))
# Manual colour scale: hex codes and the legend labels passed with them.
colors <- c("#283681", "#DAA520", "#29AB87", "#C60E3B", "#000000")
labels <- c("Trump", "Cruz", "Kasich", "Rubio", "Other")
# Generate the plot one layer at a time: base mapping, threshold line,
# per-candidate paths and points, then scales and labels.
d <- ggplot(
  results,
  aes(x = Date, y = Cumulative, group = Candidate, colour = Candidate)
)
d <- d + geom_hline(
  data = nomination,
  aes(yintercept = yintercept, linetype = nomination),
  show.legend = TRUE
)
d <- d + geom_path()
d <- d + geom_point()
d <- d + scale_color_manual(values = colors, labels = labels)
d <- d + scale_linetype_manual(values = c("dashed"))
d <- d + labs(
  x = "Date",
  y = "Delegates",
  title = "Cumulative delegate count",
  linetype = "Nomination"
)
# Display plot
d