##################################### #### CAAP Evaluation script V1.1 #### ##################################### ## Use rindex package if available #rindex.avail <- require(rindex) rindex.avail <- FALSE scanText <- function(string, what = character(0), ...){ tc <- textConnection(string) result <- scan(tc, what = what, quiet = TRUE, ...) close(tc) result } readGoldStandard <- function(filename) { al <- list(); scount <- 0; if (!file.exists(filename)) return(NA) tb <- readLines(filename,n=-1); l<-1 while (l <= length(tb)) { ## skip empty lines and comments while (l < length(tb) && ((tb[l] == "") || (regexpr(paste("^", "#", sep = ""), tb[l])[[1]] != -1))) l <- l+1 if (l > length(tb)) break; tmp <- scanText(tb[l]) l<-l+1 L <- length(tmp) if ((L %% 5) != 0) stop('Line ',l,' of file ',filename,' did not contain n*5 elements. \n') LE <- L/5 int.idx <- seq(3,(LE*5) -2 ,by=5) rt.idx <- seq(4,(LE*5) -1 ,by=5) mz.idx <- seq(5,(LE*5) ,by=5) int <- as.numeric(tmp[int.idx]) rt <- as.numeric(tmp[rt.idx]) mz <- as.numeric(tmp[mz.idx]) if ((length(int) != length(rt)) || (length(int) != length(mz)) || (length(int) != LE)) stop(filename,'..something went wrong during parsing consenus data ! \n') scount <- scount + 1 al[[scount]] <- cbind(int,rt,mz) } al } readGoldStandardWithNames <- function(filename) { al <- list(); scount <- 0; if (!file.exists(filename)) return(NA) tb <- readLines(filename,n=-1); l<-1 while (l <= length(tb)) { ## skip empty lines and comments while (l < length(tb) && ((tb[l] == "") || (regexpr(paste("^", "#", sep = ""), tb[l])[[1]] != -1))) l <- l+1 if (l > length(tb)) break; tmp <- scanText(tb[l]) l<-l+1 L <- length(tmp) if ((L %% 5) != 0) stop('Line ',l,' of file ',filename,' did not contain n*5 elements. \n') LE <- L/5 ## number of features in this line name.idx <- seq(1,(LE*5) -4 ,by=5) # score.idx <- seq(2,(LE*5) -3 ,by=5) int.idx <- seq(3,(LE*5) -2 ,by=5) rt.idx <- seq(4,(LE*5) -1 ,by=5) mz.idx <- seq(5,(LE*5) ,by=5) names <- tmp[name.idx] int <- as.numeric(tmp[int.idx]) rt <- as.numeric(tmp[rt.idx]) mz <- as.numeric(tmp[mz.idx]) if ((length(int) != length(rt)) || (length(int) != length(mz)) || (length(int) != LE)) stop(filename,'..something went wrong during parsing consenus data ! \n') scount <- scount + 1 m <- cbind(int,rt,mz) rownames(m) <- names al[[scount]] <- m } al } readConsensusTool <- function(filename) { al <- list(); nr <- 0; if (!file.exists(filename)) return(NA) tb <- readLines(filename,n=-1); l<-1 while (l <= length(tb)) { ## skip empty lines and comments while (l < length(tb) && ((tb[l] == "") || (regexpr(paste("^", "#", sep = ""), tb[l])[[1]] != -1))) l <- l+1 if (l > length(tb)) break; tmp <- scanText(tb[l]) l<-l+1 L <- length(tmp) if ((L %% 3) != 0) stop('Line ',l,' of file ',filename,' did not contain n*3 elements. \n') LE <- L/3 int.idx <- seq(1,(LE*3) -2 ,by=3) rt.idx <- seq(2,(LE*3) -1 ,by=3) mz.idx <- seq(3,(LE*3) ,by=3) int <- as.numeric(tmp[int.idx]) rt <- as.numeric(tmp[rt.idx]) mz <- as.numeric(tmp[mz.idx]) if ((length(int) != length(rt)) || (length(int) != length(mz)) || (length(int) != LE)) stop(filename,'..something went wrong during parsing consenus data ! \n') nr <- nr + 1 al[[nr]] <- cbind(int,rt,mz,nr) } al } eval <- function(goldfile,toolfile,round.rt = 2,round.mz = 2,verbose=0, debug.file=NULL) { gold <- readGoldStandard(goldfile) tool.list <- readConsensusTool(toolfile); ## build dictionary for tool tool <- do.call("rbind",tool.list) rownames(tool) <- paste(tool[,"int"],round(tool[,"rt"],round.rt),round(tool[,"mz"],round.mz)) ## create Index (hybrid static indexing) if package rindex is available if (rindex.avail) ind <- index(rownames(tool)) recall <- total.FP <- prec <- 0 N <- length(gold);lp <- -1; if (!is.null(debug.file)) debug.file.con <- file(debug.file,"w") for (i in 1:N) { ## for each gold_i perc <- round((i/N) * 100) ## show progress in percent if ((perc %% 10 == 0) && (perc != lp)) { cat(perc,round(recall / i,4),round(prec / i,4),'\n');flush.console(); lp <- perc } gold.i <- gold[[i]] rownames(gold.i) <- paste(gold.i[,"int"],round(gold.i[,"rt"],round.rt),round(gold.i[,"mz"],round.mz)) ## = |gold_i| NGold <- nrow(gold.i) ## find all matching tool.ij match.idx <- which(rownames(tool) %in% rownames(gold.i)) N.tool.ij <- length(match.idx) if (length(match.idx) > 0 ) { ## any hits ? tool.i <- tool[match.idx,,drop=FALSE] ## tool matches if (!is.matrix(tool.i)) ## R workaround: convert if tool.i is vector instead of matrix tool.i <- data.frame(t(tool.i)) group.idx <- unique(tool.i[,"nr"]) ## group indices tool.all <- tool[tool[,"nr"] %in% group.idx,] ## all tool consensus elements (including FP) in matching groups ## remove singletons tool.all.nfeat <- apply(tool.all,1,function(x) length(which(tool.all[,"nr"] == x["nr"])) ) tool.all <- tool.all[tool.all.nfeat > 1,,drop=FALSE] ## sincwe we removed features, we now have tp adjust tool.i and N.tool.ij tool.i.idx <- tool.i[,"nr"] %in% unique(tool.all[,"nr"]) tool.i <- tool.i[tool.i.idx,,drop=FALSE] N.tool.ij <- length(unique(tool.i[,"nr"])) if (N.tool.ij == 0) next ## = |\gold_i \cap \bigcup\limits_{j=1}^{M_i} \tool_{ij}| NToolMatching <- nrow(tool.i) ## |\bigcup\limits_{j=1}^{M_i} \tool_{ij}| NToolAll <- nrow(tool.all) if (NToolAll > NToolMatching) { ## count false positives FP <- NToolAll - N.tool.ij ## also count the total false positives ## \frac{\Big|\gold_i \cap \bigcup\limits_{j=1}^{M_i} \tool_{ij} \Big|}{|\bigcup\limits_{j=1}^{M_i} \tool_{ij}|} prec.i <- NToolMatching / NToolAll } else { FP <- 0 prec.i <- 1 } recall.i <- 0 M.i <- N.tool.ij ## number of groups of consensus elements (matching gold.i) in tool if (M.i > 0) recall.i <- ( NToolMatching / NGold ) / M.i if (!is.null(debug.file)) cat(i, NGold, NToolMatching, NToolAll, prec.i, recall.i, "\n", file=debug.file.con) ## \frac{1}{M_i} \frac{\Big|\gold_i \cap \bigcup\limits_{j=1}^{M_i} \tool_{ij} \Big|}{|\gold_i |} recall <- recall + recall.i prec <- prec + prec.i total.FP <- total.FP + FP } else { if (verbose>1) cat("No matching tool.ig for gold.i ; i=",i," \n") if (verbose>2) cat(rownames(gold.i)," \n") } } if (!is.null(debug.file)) close(debug.file.con) return(list(recall= recall / N,precision= prec / N,totalfp=total.FP )) }