Monday, August 8, 2011

Percentiles for plotting box and whiskers

# This calculates percentile values and the median, then writes them out to a text file. The percentiles are set to the 2.5th, 25th, 75th and 97.5th; the median (i.e. the 50th percentile) is computed separately.
# It was only recently that I realized that the median is just the 50th percentile, so you can skip the whole "create 2 dataframes and merge before filling in the blank rows" thing by including 0.5 in the probs of the by() line below. After that you just need to fill in any dummy rows necessary.
# the objective here is to produce a dataframe with a _specific_ number of rows (ie: cases). This can be repeated as many times as necessary.
# this is so when you plot it later, all your graphs will have identical x-axis(es?) when you do a big-ass cluster of graphs (see associated gnuplot script)

# Agt.Smith is the name of the dataframe that you wanna use for this. If necessary:
# Agt.Smith <- infile # uncomment if you want to use this, but do i really even have to note this???


# Build a per-category summary of Var (four percentiles plus the median, one
# row per level of CatVar), optionally pad it with blank rows, and write it
# out as a tab-separated text file.
# Requires a data frame `Agt.Smith` with columns `Var` and `CatVar`.

# Keep only the two columns we need, under fixed names.
Var.Agt.Smith <- data.frame(Var = Agt.Smith$Var, CatVar = Agt.Smith$CatVar)
Var.Agt.Smith$CatVar <- factor(Var.Agt.Smith$CatVar)

# One vector of four quantiles per level of CatVar.
Var.percentile <- by(
  Var.Agt.Smith$Var, Var.Agt.Smith$CatVar, quantile,
  probs = c(0.025, 0.25, 0.75, 0.975), na.rm = TRUE
) # change the percentiles as you will.

# One row per level of CatVar; the row count is taken from the by() result
# instead of a hand-typed number.
df1 <- data.frame(
  matrix(unlist(Var.percentile), nrow = length(Var.percentile), byrow = TRUE),
  row.names = names(Var.percentile)
)
colnames(df1) <- c("alpha", "bravo", "charlie", "delta") # these are just generic names for the percentiles calculated above. Feel free to change to something more sensible.

# Median per level, joined to the percentiles on the level name.
Var.median <- tapply(Var.Agt.Smith$Var, Var.Agt.Smith$CatVar, median, na.rm = TRUE)
df2 <- data.frame(Var.median)
Var.Agt.Smith <- merge(df1, df2, by = "row.names")

#blnk.rw <- data.frame(Row.names=c("missing_rowname1","missing_rowname2"),"alpha"=c(NA,NA),bravo=c(NA,NA),charlie=c(NA,NA),delta=c(NA,NA),Var.median=c(NA,NA))# - OPTIONAL - if you need to fill in empty rows
# Only append the filler rows when blnk.rw has actually been defined (line
# above uncommented); otherwise the table is used as-is.
if (exists("blnk.rw")) {
  Var.Agt.Smith <- rbind(Var.Agt.Smith, blnk.rw)
}

# Sort by row name so that any filler rows land in their correct position.
Var.Agt.Smith2 <- Var.Agt.Smith[order(Var.Agt.Smith$Row.names), ]
Var.Agt.Smith2

# Running case index 1..(number of rows), derived from the table itself.
Var.Agt.Smith2$dummy.var <- seq_len(nrow(Var.Agt.Smith2))

# The final output has 7 columns: the row names (used to insert dummy cases
# into the correct position when necessary), the 4 percentiles, the median and
# the case index, _in that order_. The leading "#" on the first name starts the
# header line with a hash. Feel free to rearrange them in another dataframe if
# it suits you.
colnames(Var.Agt.Smith2) <- c(
  "#Row.names", "alpha", "bravo", "charlie", "delta", "Var.median", "dummy.var"
)
write.table(Var.Agt.Smith2, "path/to/file/Var/Varoutfile.txt", sep = "\t", quote = FALSE, row.names = FALSE)

No comments:

Post a Comment