GCG Super Script - Construction Version.R

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

#  Made By Derek J. Debrauske, 2021
#  ddebrauske@gmail.com
#  ddebrauske@wisc.edu

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

#Update, install and attach GCG package.

remove.packages("GCG")#remove GCG package from R root folder so you can update it.
detach(package:GCG)#remove GCG package from workspace so you can update it.
devtools::install_github("ddebrauske/GCG", force=TRUE)#install GCG package from github
library(GCG) #add package to current R environment


#run "help()" commands to see more information on each function
#run "head()" commands to preview data


#Optional - Set the working directory to the folder you'd like to work from
setwd("C:/Users/Derek Debrauske/Dropbox/R/Projects/20210330 GCG superscript testing/20210303 Chemgen validation R2/")


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##Act I: Gathering data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~
#1. Step through plate layout input file, edited from template
#     This function returns a long data frame, combining all of the information from your plate layouts. Each row of this data frame refers to one well.
layout <- PlateLayout("Plate_Layout.csv") #change this path to refer to th elocation of your Plate_layout file.

#~~~~~~~~~~~~
#2. Do the same with the blank input file.
#     Returns a similar, long dataframe with wells as rows.
blank <- PlateBlank("Plate_Blanks.csv")  #change this path to refer to th elocation of your Plate_blank file.

#~~~~~~~~~~~~
#3. Combine all of the above information into one table.
layout.blanks <- CombineLayoutBlank(layoutDF = layout, blankDF = blank)

#~~~~~~~~~~~~
#Optional step -- if your plate reader data is exported from your plate reader as a workbook with many sheets, you can use this function to separate these sheets into separate .csv files to be used in the next steps. (uncomment to use)

# ExcelToCSV(path = "", out_dir = "")
# list.files("")

#~~~~~~~~~~~~
#4. Import plate reader data from .csv files into R.
Timepoint.data <- Import(path = "Plate_reader_data/", plate.reader.type = "spark", read.interval = 60)

#~~~~~~~~~~~~
#5. attach information from layout and blank to plate reader data. this function back-subtracts the blank values from the OD600 of corresponding wells.
# You can run this one of two ways:
#     1. Without back subtracting blanks: use only
#data.combined.no.blank <- TimeseriesLayoutBlank(timepoint.df = Timepoint.data, layout.df = layout)

#     2. With Back-subtracting blanks:
data.combined <- TimeseriesLayoutBlank(timepoint.df= Timepoint.data, layout.blank.df = layout.blanks)

#5.1 subset out all strains labeled as "ddH2O" these are my border wells
data.combined <- subset(data.combined, Strain != "ddH2O")
data.combined <- subset(data.combined, Time != 2880)


#~~~~~~~~~~~~
#6. Summarize technical replicates
#this finds mean, SE of biological replicates. if there are technical replicates, they are averaged first, then mean and SE is calculated from these means.
Tech.Rep.Summary <- SummarizeTechReps(data.combined)

#~~~~~~~~~~~~
#7. Summarize Biological Replicates

Bio.Rep.Summary <- SummarizeBioReps(Tech.Rep.Summary)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##Act II: Plotting Curves
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~
# Plot all replicate data as a wrap of all conditions
help("GCGplot_wrap")
p <- GCGplot_wrap(Bio.Rep.Summary, path= "./", graphic.title = "ChemGen Validation R2")
print(p)

#~~~~~~~~~~~~
# Plot individual conditions
help("GCGplot_conds")
GCGplot_conds(Bio.Rep.Summary, graphic.title ="ChemGen Validation R2", path= "./")#see results in folder


#~~~~~~~~~~~~
#plot all wells to spot-check plates.
help("GCGplot_matrices")
GCGplot_matrices(data.combined, path= "./" , graphic.title = "ChemGen Validation R2" )#see results in folder


#~~~~~~~~~~~~
#plot each biological rep as a separate facet_wrap
help("GCGplot_bioreps")
GCGplot_bioreps(data.combined, path = "./", graphic.title = "ChemGen Validation R2")#see results in folder


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ACT III: find eAUC of every curve in experiment
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#combine info into one column header, separate afterwards. 
data.combined.wide <- matrix(NA)
data.combined.wide <- data.combined.no.ddH2O

data.combined.wide[, ncol(data.combined.wide) + 1 ] <- mapply(paste, sep= "@", data.combined.wide$Strain , data.combined.wide$Condition, data.combined.wide$Bio_Rep,data.combined.wide$plate.name, data.combined.wide$Coordinate)
colnames(data.combined.wide)[8] <- "Strain.Cond.Rep.Plate.Coord"
colnames(data.combined.wide)[3] <- "time"

data.combined.wide <- data.combined.wide[c(3,4,8)]

data.combined.wide <- as.data.frame(tidyr::pivot_wider(data.combined.wide, values_from = OD600, names_from = "Strain.Cond.Rep.Plate.Coord"))#separate data combined into wide table

data.combined.wide$time <- data.combined.wide$time/60

#separate Growthcurver wide format dataframe into individual vectors, plot eAUC
#this loop is based on source code from growthcurver(line 138 in growthcurver/R/summarize-growth-by-plate.R )


eAUC_out <- as.data.frame(matrix(NA, 0,2))
colnames(eAUC_out) <- c("ID", "eAUC")

names <- colnames(data.combined.wide)
for( i in 1:length(names)){
  if(names[i] != "time"){
    
    x <- data.combined.wide$time
    n <- length(x)
    y <- data.combined.wide[,i]
    ID <- names[i]
    eAUC.i <- sum((x[2:n] - x[1:n-1]) * (y[2:n] + y[1:n-1]) /  2)
    
    data.i <- c(ID,eAUC.i)
    
    eAUC_out <- rbind(eAUC_out,data.i)
  }  
}    
colnames(eAUC_out) <- c("ID", "eAUC")
eAUC_out$eAUC <- as.numeric(eAUC_out$eAUC)

eAUC_out <- tidyr::separate(data=eAUC_out,col = ID, into = c("Strain", "Condition", "Bio_Rep", "Plate", "Coord"), sep= "@" )

eAUC_techreps <- plyr::ddply(eAUC_out, c("Strain", "Condition", "Bio_Rep"), plyr::summarise,
                             N    = sum(!is.na(eAUC)),
                             mean = mean(eAUC,na.rm=TRUE),
                             sd   = sd(eAUC,na.rm=TRUE),
                             se   = sd / sqrt(N))

eAUC_techreps <-  dplyr::rename("eAUC" = "mean", .data =  eAUC_techreps)

eAUC_bioreps <- plyr::ddply(eAUC_techreps, c("Strain", "Condition"), plyr::summarise,
                            N    = sum(!is.na(eAUC)),
                            mean = mean(eAUC,na.rm=TRUE),
                            sd  = sd(eAUC,na.rm=TRUE),
                            se   = sd / sqrt(N))
eAUC_bioreps <-  dplyr::rename("eAUC" = "mean", .data =  eAUC_bioreps)

#~~~~~~~~~~~~~~~~
#Plot Emperical AUC
p<- ggplot2::ggplot(eAUC_techreps, ggplot2::aes(x=Strain, y=eAUC, group=Strain, colour=Strain))+
  ggplot2::facet_wrap(~Condition)+
  ggplot2::geom_point(size= 3)+
  ggplot2::stat_summary(fun = mean, geom="point", shape=15, color="black", fill="black")+
  ggplot2::stat_summary(fun.data = ggplot2::mean_se, geom="errorbar", color= "black", width= 0.3)+
  ggplot2::theme(axis.text.x= ggplot2::element_blank())+
  ggplot2::labs(title= "Emperical AUC",
                x="Strain",
                y="AUC",
                ggplot2::element_text(size=15, face="bold"))
print(p)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##Act IV: Analyzing Results Using GrowthCurver and other functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# #~~~~~~~~~~~~
# # Convert data.combined plate reader data into growthcurver format.this is only if you want to go straight from data.combined.
# help("Growthcurver_convert")
# d <- Growthcurver_convert(data.combined) #output is /.growthcurverfile.csv
#
# #~~~~~~~~~~~~
# #Use Growthcurver to analyze raw data
# gc_plate <- growthcurver::SummarizeGrowthByPlate(d)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##Act V: Analyzing Biological replicates individually with Growthcurver and findgr
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~
#Analyze biological replicates seperately with Growthcurver

data.combined.gcr <- matrix(data= NA)
data.combined.gcr <- Tech.Rep.Summary

data.combined.gcr[, ncol(data.combined.gcr) + 1 ] <- mapply(paste, sep= "@", data.combined.gcr$Strain , data.combined.gcr$Condition, data.combined.gcr$Bio_Rep) #combines all strain, cond, bio rep infomation into one identifying column
data.combined.gcr <- data.combined.gcr[, c(ncol(data.combined.gcr),3:(ncol(data.combined.gcr)-1))] #reorders columns to be easier to look at , ID being in column 1
colnames(data.combined.gcr)[1] <- "Strain.Cond.Rep"
colnames(data.combined.gcr)[2] <- "time" #needs to be lowercase for growthcurver format
data.combined.gcr <- data.combined.gcr[c(1,2,5)]
data.combined.gcr.wide <- as.data.frame(tidyr::pivot_wider(data.combined.gcr, values_from = OD600, names_from = "Strain.Cond.Rep"))


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ACT VI: find growth rate
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  

#@@@ Under Construction
#~~~~~~~~~~~~~~~~
#Find maximum growth rate of curve using ln() transformation and sliding window


#This script contains a function that can be used to find the growth rate of
#a set of points grown on the Biotek plate reader.

#x = a vector of data points, e.g. c(0.1, 0.1, 0.2, 0.25, 0.3, ...)
#t = a vector of time points, e.g. c(0, 15, 30, 45, 60, ...)
#plottitle = the title to go on the plot of the fit
#int = the number of time steps that should be used to fit the line
#r2.cutoff = how stringent the fit needs to be --> 1 is the max.


d <- data.combined.gcr.wide

summary1 <- as.data.frame(matrix(NA, nrow=ncol(d)-1, ncol=5))
summary1[1] <- colnames(d)[2:ncol(d)]
colnames(summary1) <- c("Condition", "m", "r2", "lag", "note")

#source("C:/Users/ddebr/Dropbox/R/Functions/find_gr.R")
path= "C:/Users/ddebr/Dropbox/R/Projects/20210330 GCG superscript testing/20210303 Chemgen validation R2/Figures/GR/" #path where you would like to save figures


for(j in 2:ncol(d)){
  
  jpeg(filename = paste(path, if(FALSE %in% grepl("%", colnames(d)[j])){
    colnames(d)[j] #if there is a "%" in what will be the file name, replace with "percent"
  }else{
    sub( "%", " percent", colnames(d)[j])}, ".jpeg"))
  
  x = as.vector(d[[j]])
  t=d$time
  plottitle = colnames(d)[j]
  r2.cutoff = 0.995
  int = 5
  
  
  #findgr = function(x, t, plottitle, int=8, r2.cutoff=0.997) {
  x = as.numeric(x)
  n = length(x)
  mat = NULL
  max = c(0,0,0,NA)
  
  #are x and t the same length?
  if (length(x) != length(t)) {
    cat("Error: Your data and time are not the same length.\n")
    stop()
    
  }
  
  #is the line basically flat?
  fit = lm(x~t)
  m = abs(coefficients(fit)[[2]])
  if (m < 0.00001) {
    max=c(0,0,0,NA)
    lag=NA
    summary1$note[j-1] <- paste(summary1$note[j-1], "no growth")
    
    plot(t, log(x), pch=20, xlab="time", ylab="ln(OD600)", main=plottitle)
    mtext("no growth", side=3, line=-1, at=0, cex=0.8, adj=0)
    print(paste(j, "-- no growth"))
  }else{#if not, find a slope
    x[which(x <= 0)] = 0.001 	#transform values < 0
    
    x = log(x)
    for (i in 1:(n-int)) {
      fit = lm(x[i:(i+int)]~t[i:(i+int)])		#linear regression on log transformed data.
      m = coefficients(fit)[[2]]
      b = coefficients(fit)[[1]]
      r2 = summary(fit)$r.squared
      mat = rbind(mat, c(i, b, m, r2))
    }
    mat = mat[which(mat[,4] > r2.cutoff),] #only include slopes greater than the R2 cutoff.
    
    if(is.matrix(mat) && dim(mat)[1] != 0 && dim(mat)){ #DD modified 20210331 -- added this point to ignore data that is not within the R2 cutoff. before adding this, code would break due to having a matrix subset with no dimensions.
      
      
      max = mat[which.max(mat[,3]),]
      par(las=1, mar=c(5, 4, 4, 4) + 0.1)
      plot(t,x, type="n", pch=20, xlab="time", ylab="ln(OD600)", main=plottitle)
      usr.old = par("usr")
      
      #how long is this in exponential growth?
      fit.line = sapply(t, function(x) max[3]*x+max[2])
      resid = fit.line-x
      residper = abs(resid/fit.line)
      resid.mat = rbind(t, fit.line, x, resid, residper)
      resid.mat = resid.mat[,which(resid.mat[5,] < 0.05)]
      lag = resid.mat[1,1]
      
      
      mtext(paste("lag =",round(lag,2)),side=3, line=-3, at=0, cex=0.8, adj=0)
      time.in.exp = resid.mat[1,ncol(resid.mat)]-resid.mat[1,1]
      abline(v=lag, col="cadet blue", lty=2)
      abline(v=resid.mat[1,ncol(resid.mat)], col="cadet blue", lty=2)
      
      
      #plotting instantaneous growth rate   -- DD uncommented
      dx = diff(x)/(t[2]-t[1])
      par(usr=c(par("usr")[1:2],min(dx)*1.05, max(dx)*1.05))
      points(t[1:(length(t)-1)],dx, pch=18, type="o", col="dark grey", lty=1)
      axis(4, col.axis="dark grey", col.ticks="dark grey")
      mtext("delta(x)/delta(t)", side=4, line=3, col="dark grey", las=3)
      
      #plot
      par(usr=usr.old)
      points(t,x,pch=20)
      abline(lm(x[max[1]:(max[1]+int-1)]~t[max[1]:(max[1]+int-1)]), col="red", lty=2, lwd=2)
      points(t[max[1]:(max[1]+int-1)], x[max[1]:(max[1]+int-1)], col="red")
      mtext(paste("m =",round(max[3],3)), side=3, line=-1, at=0, cex=0.8, adj=0)
      mtext(paste("r2 =",round(max[4],4)), side=3, line=-2, at=0, cex=0.8, adj=0)
      #mtext(colnames(d)[j])
      
    }else{
      summary1$note[j-1] <- paste(summary1$note[j-1],"- below r2 cutoff")
    }
  }
  gr.data <- c("m"=max[3], "r2"=max[4], "lag"=lag)
  
  
  #gr.data<- findgr(x = as.vector(d[[i]]), t=d$time, plottitle = NA, r2.cutoff = 0.994, int = 3)
  summary1$m[j-1] <- gr.data[[1]]
  summary1$r2[j-1] <- gr.data[[2]]
  summary1$lag[j-1] <- gr.data[[3]]
  
  
  dev.off() #saves JPEG device.
}


# #@@@@@@@@@@@@@@@@@@@@@@@
# source("C:/Users/ddebr/Dropbox/R/Functions/find_gr.R")
#
# summary1 <- as.data.frame(matrix(NA, nrow=ncol(d)-1, ncol=4))
# summary1[1] <- colnames(d)[2:ncol(d)]
# colnames(summary1) <- c("Condition", "m", "r2", "lag")
#
# for(j in 2:ncol(d)){
#
#   gr.data<- findgr(x = as.vector(d[[j]]), t=d$time, plottitle = NA, r2.cutoff = 0.990, int = 3)
#   summary1$m[j-1] <- gr.data[[1]]
#   summary1$r2[j-1] <- gr.data[[2]]
#   summary1$lag[j-1] <- gr.data[[3]]
#
# }
#


#~~~~~~~~~~~~~~~~~
#add information from layout.blanks to plate.summary
# plate_summary$Strain <- NA
# plate_summary$Condition <- NA
#
#
# #write for() loop pulling condition and strain information from layout.blanks
# #copied and edited from combine layout.blank
# for(i in 1:nrow(plate_summary)){ #step through layout blanks row by row
#   for(j in 1:nrow(layout.blanks)){ #step through blank.out until you find a matching coordinate&&plate#
#     summ.coord <- plate_summary$coordinate[i]
#     layout.coord <- layout.blanks$Coordinate[j]
#
#     summ.plate.number <- as.character(plate_summary$plate[i])
#     layout.plate.number <- as.character(layout.blanks$Plate[j])
#
#
#     if(layout.coord == summ.coord && layout.plate.number == summ.plate.number){ #if they match, add OD600 value to the new column of row i
#       plate_summary$Strain[i] <- layout.blanks$Strain[j]
#       plate_summary$Condition[i] <- layout.blanks$Condition[j]
#     }
#   }
# }
#

#for(i in 1:length(dev.list())){dev.off()}
setwd("C:/Users/Derek Debrauske/Documents/R/GCG/")
setwd("C:/Users/ddebr/R/GCG/")
devtools::document()


usethis::use_package("openxlsx")


#  ______  __ __    ___        ___  ____   ___
# |      ||  |  |  /  _]      /  _]|    \ |   \
# |      ||  |  | /  [_      /  [_ |  _  ||    \
# |_|  |_||  _  ||    _]    |    _]|  |  ||  D  |
#   |  |  |  |  ||   [_     |   [_ |  |  ||     |
#   |  |  |  |  ||     |    |     ||  |  ||     |
#   |__|  |__|__||_____|    |_____||__|__||_____|