I pointed to the great guys at MCFC Analytics and Opta that having the table they provided in the appendix as Excel files (or whatever manageable format) would be a great time saver.
And they promptly provided me with what I asked for.
I'm not sure about the policy about putting that Excel file here, thus I won't do that.
However I'm pretty sure you can obtain it from them.
Below, an updated version of my code.
A few notes.
- I converted the .xlsx file Opta sent me to an .xls file, because I had some problem with the XLSX R package.
- The code can be run without the Excel file: I noted the code you should skip in that case.
- The code now parses some match info, like the teams, the players (as suggested in a comment), the scoring order. It does not grab everything in the F7 dataset, but it should be easily modifiable (or just ask in the comments).
- I wrote some code to add info to the events data set, like what team is involved and the score at that moment. Due to my familiarity with US sport it's away first... I know in soc... ahem... football should be the other way around.
library(XML) library(plyr) library(reshape) library(gdata) f7 <- "c:/download/mcfc/Bolton_ManCityF7.xml" #file path & name (f7) f24 <- "c:/download/mcfc/Bolton_ManCityF24.xml" #(f24) #in case you have event and qualifier descriptions in xls file... (otherwhise comment the following 2 lines) evNames <- read.xls("c:/download/mcfc/Event Definitions - Excel file.xls", sheet=1, as.is=T) quNames <- read.xls("c:/download/mcfc/Event Definitions - Excel file.xls", sheet=2, as.is=T) #utility function grabAll <- function(XML.parsed, field){ parse.field <- xpathSApply(XML.parsed, paste("//", field, "[@*]", sep="")) results <- t(sapply(parse.field, function(x) xmlAttrs(x))) if(typeof(results)=="list"){ do.call(rbind.fill, lapply(lapply(results, t), data.frame, stringsAsFactors=F)) } else { as.data.frame(results, stringsAsFactors=F) } } #team parsing gameParse <- xmlInternalTreeParse(f7) teamParse <- xpathSApply(gameParse, "//TeamData") teamParse2 <- xpathSApply(gameParse, "//Team/Name") teamInfo <- data.frame( team_id = sapply(teamParse, function(x) xmlGetAttr(node=x, "TeamRef")) , team_side = sapply(teamParse, function(x) xmlGetAttr(node=x, "Side")) , team_name = sapply(teamParse2, function(x) xmlValue(x)) , stringsAsFactors=F ) #players parsing playerParse <- xpathSApply(gameParse, "//Team/Player") lineupParse <- xpathSApply(gameParse, "//Team") NPlayers <- sapply(lineupParse, function(x) sum(names(xmlChildren(x)) == "Player")) playerInfo <- data.frame( player_id = sapply(playerParse, function(x) xmlGetAttr(node=x, "uID")) , team_id = c(rep(teamInfo$team_id[1], NPlayers[1]), rep(teamInfo$team_id[2], NPlayers[2])) , position = sapply(playerParse, function(x) xmlGetAttr(node=x, "Position")) , first_name = sapply(playerParse, function(x) xmlValue(xmlChildren(xmlChildren(x)$PersonName)$First)) , last_name = sapply(playerParse, function(x) xmlValue(xmlChildren(xmlChildren(x)$PersonName)$Last)) ) #scoring order goalInfo <- grabAll(teamParse[[1]], "Goal") goalInfo$TimeStamp <- as.POSIXct(goalInfo$TimeStamp, format="%Y%m%dT%H%M%S") scoringOrderInfo <- goalInfo[order(goalInfo$TimeStamp), c("TimeStamp", "uID")] scoringOrderInfo$team_id <- substr(gsub("g", "t", scoringOrderInfo$uID), 1, 3) scoringOrderInfo <- merge(scoringOrderInfo, teamInfo) scoringOrderInfo$Away <- 0 scoringOrderInfo$Home <- 0 for(i in 1: dim(scoringOrderInfo)[1]){ dt <- subset(scoringOrderInfo, TimeStamp <= scoringOrderInfo$TimeStamp[i]) scoringOrderInfo[i,c("Away", "Home")] <- table(dt$team_side) } scoringOrderInfo$Score <- paste(scoringOrderInfo$Away, scoringOrderInfo$Home, sep="-") scoringOrderInfo <- scoringOrderInfo[order(scoringOrderInfo$TimeStamp),] #Play-by-Play Parsing pbpParse <- xmlInternalTreeParse(f24) eventInfo <- grabAll(pbpParse, "Event") eventParse <- xpathSApply(pbpParse, "//Event") NInfo <- sapply(eventParse, function(x) sum(names(xmlChildren(x)) == "Q")) QInfo <- grabAll(pbpParse, "Q") EventsExpanded <- as.data.frame(lapply(eventInfo[,1:2], function(x) rep(x, NInfo)), stringsAsFactors=F) QInfo <- cbind(EventsExpanded, QInfo) names(QInfo)[c(1,3)] <- c("Eid", "Qid") QInfo$value <- ifelse(is.na(QInfo$value), -1, QInfo$value) Qual <- cast(QInfo, Eid ~ qualifier_id) #comment the following loop if you have commented the xls files loading at the beginning for(i in names(Qual)[-1]){ txt <- quNames[which(quNames$id==as.integer(i)), "name"] txt <- gsub('[[:space:]]+$', '', txt) lbl <- tolower(gsub("-", "_", gsub(" ", "_", txt, fixed=T), fixed=T)) names(Qual)[which(names(Qual)==i)] <- lbl } #final data set events <- merge(eventInfo, Qual, by.x="id", by.y="Eid", all.x=T, suffixes=c("", "Q")) #adjustment of variables events$TimeStamp <- as.POSIXct(events$timestamp, format="%Y-%m-%dT%H:%M:%S") events$x <- as.double(events$x) events$y <- as.double(events$y) events$Score <- cut(events$TimeStamp, c(min(events$TimeStamp), scoringOrderInfo$TimeStamp, max(events$TimeStamp)+1), c("0-0", scoringOrderInfo$Score)) events$team_id <- paste("t", events$team_id, sep="") events <- merge(events, teamInfo)