diff --git a/NAMESPACE b/NAMESPACE index 1b11067..51a2ed8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,7 +25,6 @@ export(vcf.stat) export(vst) export(vstPermutation) importFrom(R.utils,gzip) -importFrom(colorspace,rainbow_hcl) importFrom(colorspace,terrain_hcl) importFrom(data.table,fread) importFrom(grDevices,as.raster) @@ -45,6 +44,13 @@ importFrom(graphics,par) importFrom(graphics,polygon) importFrom(graphics,rasterImage) importFrom(graphics,text) +importFrom(parallel,clusterExport) +importFrom(parallel,detectCores) +importFrom(parallel,makeCluster) +importFrom(parallel,parApply) +importFrom(parallel,parLapply) +importFrom(parallel,parSapply) +importFrom(parallel,stopCluster) importFrom(qgraph,qgraph) importFrom(stats,as.dist) importFrom(stats,chisq.test) diff --git a/NEWS.md b/NEWS.md index f386063..4b65c60 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ # rCNV 1.3.0 (third update) +* parallelization enabled +* dupValidate function revised +* per site Fis added to deviant detection * vstPermutation function added * maf modified to remove multi-allelic sites * FIT correction added diff --git a/R/TMM_normalz.R b/R/TMM_normalz.R index ff01211..b399567 100644 --- a/R/TMM_normalz.R +++ b/R/TMM_normalz.R @@ -54,7 +54,7 @@ TMM <- function(obs, ref, logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutoff } -TMMex <- function(obs,ref, logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutoff=-1e10) +TMMex <- function(obs,ref, logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutoff=-1e10,cl) # Gordon Smyth # adopted from edgeR package (Robinson et al. 2010) { @@ -132,18 +132,18 @@ TMMex <- function(obs,ref, logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutof ## quantile normalization according to Robinson MD, and Oshlack A (2010) # qn1 -quantile_normalisation <- function(df,het.table=NULL,verbose=verbose){ - df_rank <- apply(df,2,rank,ties.method="min") +quantile_normalisation <- function(df,het.table=NULL,verbose=verbose,cl){ + df_rank <- parApply(cl=cl,df,2,rank,ties.method="min") df_sorted <- data.frame(apply(df, 2, sort,na.last=TRUE)) - df_mean <- apply(df_sorted, 1, mean,na.rm=TRUE) + df_mean <- parApply(cl=cl,df_sorted, 1, mean,na.rm=TRUE) if(!is.null(het.table)){ het.table<-het.table[,-c(1:4)] } if(verbose){ message("\ncalculating normalized depth") - df_final <- lapply_pb(1:ncol(df_rank), index_to_mean, my_mean=df_mean,indx=df_rank,al=het.table) + df_final <- parLapply(cl=cl,1:ncol(df_rank), index_to_mean, my_mean=df_mean,indx=df_rank,al=het.table) } else { - df_final <- lapply(1:ncol(df_rank), index_to_mean, my_mean=df_mean,indx=df_rank,al=het.table) + df_final <- parLapply(cl=cl,1:ncol(df_rank), index_to_mean, my_mean=df_mean,indx=df_rank,al=het.table) } return(df_final) } @@ -309,7 +309,7 @@ cpm.normal <- function(het.table, method=c("MedR","QN","pca","TMM","TMMex"), message("calculating normalization factor") pb <- txtProgressBar(min = 0, max = pb_Total, width = 50, style = 3) } - # tdep<-rCNV:::apply_pb(het.table[,-c(1:4)],2,function(tmp){ + # tdep<-parApply(cl=cl,(het.table[,-c(1:4)],2,function(tmp){ y1 <- y2 <- matrix(NA_integer_, dm[1], pb_Total) for(i in seq_len(pb_Total)){ if (verbose) setTxtProgressBar(pb, i) @@ -353,7 +353,7 @@ cpm.normal <- function(het.table, method=c("MedR","QN","pca","TMM","TMMex"), out <- paste0(y1, ",", y2) attributes(out) <- attributes(tdep) } else if(method=="QN"){ - out <- do.call(cbind,quantile_normalisation(tdep,het.table,verbose=verbose)) + out <- do.call(cbind,quantile_normalisation(tdep,het.table,verbose=verbose,cl=cl)) } else if(method=="pca"){ if(verbose){message("\ncalculating normalized depth")} new.mat <- t(tdep) ### check the direction to confirm if this step need to be done diff --git a/R/bias.det.R b/R/bias.det.R index c738105..a79da56 100644 --- a/R/bias.det.R +++ b/R/bias.det.R @@ -61,6 +61,7 @@ get.pvals<-function(x,df,p.cal){ #' \code{hetTgen} (non-normalized) #' @param x.norm a data frame of normalized allele coverage, output of #' \code{cpm.normal}. If not provided, calculated using \code{X}. +#' @param Fis numeric. Inbreeding coefficient calculated using \code{h.zygosity()} function #' @param method character. method to be used for normalization #' (see \code{cpm.normal} details). Default \code{TMM} #' @param logratioTrim numeric. percentage value (0 - 1) of variation to be @@ -73,9 +74,11 @@ get.pvals<-function(x,df,p.cal){ #' @param plot.allele.cov logical, plot comparative plots of allele depth #' coverage in homozygotes and heterozygotes #' @param verbose logical, whether to print progress +#' @param parallel logical. whether to parallelize the process #' @param \dots further arguments to be passed to \code{plot} #' #' @importFrom stats pchisq pnorm na.omit +#' @importFrom parallel parApply detectCores parLapply stopCluster #' #' @details #' Allele information generated here are individual SNP based and presents the @@ -105,43 +108,19 @@ get.pvals<-function(x,df,p.cal){ #' AI<-allele.info(ADtable,x.norm=ADnorm)} #' #' @export -allele.info<-function(X,x.norm=NULL,method=c("MedR","QN","pca","TMM","TMMex"),logratioTrim = 0.3,sumTrim = 0.05,Weighting = TRUE,Acutoff = -1e+10,plot.allele.cov=TRUE,verbose = TRUE,...){ +allele.info<-function(X,x.norm=NULL,Fis,method=c("MedR","QN","pca","TMM","TMMex"),logratioTrim = 0.3,sumTrim = 0.05,Weighting = TRUE,Acutoff = -1e+10,plot.allele.cov=TRUE,verbose = TRUE,parallel=FALSE,...){ method=match.arg(method) if(is.null(x.norm)){ x.norm<-cpm.normal(X,method=method,logratioTrim=logratioTrim,sumTrim = sumTrim,Weighting = Weighting,Acutoff = Acutoff,verbose = verbose) } if(!inherits(x.norm,"list")){x.norm<-list(AD=x.norm)} if(inherits(x.norm,"list")){x.norm<-x.norm$AD} - if(verbose){ - message("calculating probability values of alleles") - p.cal<-apply_pb(x.norm[,-c(1:4)],1,function(snp1){ - if(is.character(unname(unlist(snp1[1])))){ - y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) - y[,1]<-as.integer(y[,1]) - y[,2]<-as.integer(y[,2])} else {y<-snp1} - rs<-rowSums(y) - rs[rs==0L]<-NA - cv<-sd(unlist(rs),na.rm = TRUE)/mean(unlist(rs),na.rm = TRUE) - rr1<-y[,2]/rs - snp1het<-y[-which(rr1 == 0 | rr1 == 1 | is.na(rr1)==TRUE),] - homalt<-sum(rr1==1,na.rm=TRUE) - homref<-sum(rr1==0,na.rm=TRUE) - covrefhomo<-sum(y[c(rr1 == 0,na.rm = TRUE),],na.rm = TRUE) - covalthomo<-sum(y[c(rr1 == 1,na.rm = TRUE),],na.rm = TRUE) - covalt<-sum(y[,2],na.rm = TRUE) - covref<-sum(y[,1],na.rm = TRUE) - NHet<-nrow(snp1het) - if(NHet>3){ - p.all<-(covalt/(NHet+(2*homalt)))/((covalt/(NHet+(2*homalt)))+(covref/(NHet+(2*homref)))) - p.sum<-sum(snp1het[,2])/sum(snp1het) - ll <-data.frame(p.all,p.sum,mean.a.homo=covalthomo/(2*homalt),mean.r.homo=covrefhomo/(2*homref),mean.a.het=sum(snp1het[,2])/NHet,mean.r.het=sum(snp1het[,1])/NHet,cv=cv) - } else { - ll<-NA - } - return(ll) - }) - } else { - p.cal<-apply(x.norm[,-c(1:4)],1,function(snp1){ + + if(parallel){ + numCores<-detectCores()-1 + cl<-makeCluster(cl) + + p.cal<-parApply(cl=cl,x.norm[,-c(1:4)],1,function(snp1){ if(is.character(unname(unlist(snp1[1])))){ y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) y[,1]<-as.integer(y[,1]) @@ -167,121 +146,66 @@ allele.info<-function(X,x.norm=NULL,method=c("MedR","QN","pca","TMM","TMMex"),lo } return(ll) }) - } - if(is.list(p.cal)){ - p.cal<-do.call(rbind,p.cal) - } else { - p.cal<-t(p.cal) - } - p.cal[p.cal=="NaN"]<-0 - if(plot.allele.cov){ - p.list<-list(...) - if(is.null(p.list$pch)) p.list$pch=19 - if(is.null(p.list$cex)) p.list$cex=0.6 - if(is.null(p.list$col)) p.list$col<-makeTransparent(colorspace::heat_hcl(12,h=c(0,-100),c=c(40,80),l=c(75,40),power=1)[11]) - if(is.null(p.list$lcol)) p.list$lcol="tomato" - par(mfrow=c(2,2)) - par(mar=c(4,5,2,2)) - plot(p.cal$mean.a.homo,p.cal$mean.r.homo,pch=p.list$pch,cex=p.list$cex, - col=p.list$col,xlab="Mean coverage of \nalt. allele in homozygotes", - ylab="Mean coverage of \n ref. allele in homozygotes",cex.lab=0.8) - abline(0,1,col=p.list$lcol) - plot(p.cal$mean.a.het,p.cal$mean.r.het,pch=p.list$pch,cex=p.list$cex, - col=p.list$col,xlab="Mean coverage of \nalt. allele in heterozygotes", - ylab="Mean coverage of \n ref. allele in heterozygotes",cex.lab=0.8) - abline(0,1,col=p.list$lcol) - plot(p.cal$mean.a.het,p.cal$mean.a.homo,pch=p.list$pch,cex=p.list$cex, - col=p.list$col,xlab="Mean coverage of \nalt. allele in heterozygotes", - ylab="Mean coverage of \n alt. allele in homozygotes",cex.lab=0.8) - abline(0,1,col=p.list$lcol) - plot(p.cal$mean.r.het,p.cal$mean.r.homo,pch=p.list$pch,cex=p.list$cex, - col=p.list$col,xlab="Mean coverage of \nref. allele in heterozygotes", - ylab="Mean coverage of \n ref. allele in homozygotes",cex.lab=0.8) - abline(0,1,col=p.list$lcol) - par(mfrow=c(1,1)) - } - if(verbose){ - message("calculating chi-square significance") - pvals<-lapply_pb(1:nrow(X),get.pvals,df=X,p.cal=p.cal) } else { - pvals<-lapply(1:nrow(X),get.pvals,df=X,p.cal=p.cal) + if(verbose){ + message("calculating probability values of alleles") + p.cal<-apply_pb(x.norm[,-c(1:4)],1,function(snp1){ + if(is.character(unname(unlist(snp1[1])))){ + y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) + y[,1]<-as.integer(y[,1]) + y[,2]<-as.integer(y[,2])} else {y<-snp1} + rs<-rowSums(y) + rs[rs==0L]<-NA + cv<-sd(unlist(rs),na.rm = TRUE)/mean(unlist(rs),na.rm = TRUE) + rr1<-y[,2]/rs + snp1het<-y[-which(rr1 == 0 | rr1 == 1 | is.na(rr1)==TRUE),] + homalt<-sum(rr1==1,na.rm=TRUE) + homref<-sum(rr1==0,na.rm=TRUE) + covrefhomo<-sum(y[c(rr1 == 0,na.rm = TRUE),],na.rm = TRUE) + covalthomo<-sum(y[c(rr1 == 1,na.rm = TRUE),],na.rm = TRUE) + covalt<-sum(y[,2],na.rm = TRUE) + covref<-sum(y[,1],na.rm = TRUE) + NHet<-nrow(snp1het) + if(NHet>3){ + p.all<-(covalt/(NHet+(2*homalt)))/((covalt/(NHet+(2*homalt)))+(covref/(NHet+(2*homref)))) + p.sum<-sum(snp1het[,2])/sum(snp1het) + ll <-data.frame(p.all,p.sum,mean.a.homo=covalthomo/(2*homalt),mean.r.homo=covrefhomo/(2*homref),mean.a.het=sum(snp1het[,2])/NHet,mean.r.het=sum(snp1het[,1])/NHet,cv=cv) + } else { + ll<-NA + } + return(ll) + }) + } else { + p.cal<-apply(x.norm[,-c(1:4)],1,function(snp1){ + if(is.character(unname(unlist(snp1[1])))){ + y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) + y[,1]<-as.integer(y[,1]) + y[,2]<-as.integer(y[,2])} else {y<-snp1} + rs<-rowSums(y) + rs[rs==0]<-NA + cv<-sd(unlist(rs),na.rm = TRUE)/mean(unlist(rs),na.rm = TRUE) + rr1<-y[,2]/rs + snp1het<-y[-which(rr1 == 0 | rr1 == 1 | is.na(rr1)==TRUE),] + homalt<-sum(rr1==1,na.rm=TRUE) + homref<-sum(rr1==0,na.rm=TRUE) + covrefhomo<-sum(y[c(rr1 == 0,na.rm = TRUE),],na.rm = TRUE) + covalthomo<-sum(y[c(rr1 == 1,na.rm = TRUE),],na.rm = TRUE) + covalt<-sum(y[,2],na.rm = TRUE) + covref<-sum(y[,1],na.rm = TRUE) + NHet<-nrow(snp1het) + if(NHet>3){ + p.all<-(covalt/(NHet+(2*homalt)))/((covalt/(NHet+(2*homalt)))+(covref/(NHet+(2*homref)))) + p.sum<-sum(snp1het[,2])/sum(snp1het) + ll <-data.frame(p.all,p.sum,mean.a.homo=covalthomo/(2*homalt),mean.r.homo=covrefhomo/(2*homref),mean.a.het=sum(snp1het[,2])/NHet,mean.r.het=sum(snp1het[,1])/NHet,cv=cv) + } else { + ll<-NA + } + return(ll) + }) + } } - pvals<-do.call(rbind,pvals) - pvals<-cbind(X[,1:3],pvals) - pvals<-na.omit(pvals) - ht<-sig.hets(pvals,plot = FALSE, verbose = verbose) - pvals<-data.frame(pvals,eH.pval=ht[,"eH.pval"],eH.delta=ht[,"eH.delta"],cv=na.omit(p.cal[,"cv"])) - - return(pvals) -} - - - -allele.info_old<-function(X,x.norm=NULL,method=c("MedR","QN","pca","TMM","TMMex"),logratioTrim = 0.3,sumTrim = 0.05,Weighting = TRUE,Acutoff = -1e+10,plot.allele.cov=TRUE,verbose = TRUE,...){ - method=match.arg(method) - if(is.null(x.norm)){ - x.norm<-cpm.normal(X,method=method,logratioTrim=logratioTrim,sumTrim = sumTrim,Weighting = Weighting,Acutoff = Acutoff,verbose = verbose) - } - if(!inherits(x.norm,"list")){x.norm<-list(AD=x.norm)} - if(inherits(x.norm,"list")){x.norm<-x.norm$AD} - if(verbose){ - message("calculating probability values of alleles") - p.cal<-apply_pb(x.norm[,-c(1:4)],1,function(snp1){ - if(is.character(unname(unlist(snp1[1])))){ - y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) - y[,1]<-as.integer(y[,1]) - y[,2]<-as.integer(y[,2])} else {y<-snp1} - rs<-rowSums(y) - rs[rs==0L]<-NA - cv<-sd(unlist(y),na.rm = TRUE)/mean(unlist(y),na.rm = TRUE) - rr1<-y[,2]/rs - snp1het<-y[-which(rr1 == 0 | rr1 == 1 | is.na(rr1)==TRUE),] - homalt<-sum(rr1==1,na.rm=TRUE) - homref<-sum(rr1==0,na.rm=TRUE) - covrefhomo<-sum(y[c(rr1 == 0,na.rm = TRUE),],na.rm = TRUE) - covalthomo<-sum(y[c(rr1 == 1,na.rm = TRUE),],na.rm = TRUE) - covalt<-sum(y[,2],na.rm = TRUE) - covref<-sum(y[,1],na.rm = TRUE) - NHet<-nrow(snp1het) - if(NHet>3){ - p.all<-(covalt/(NHet+(2*homalt)))/((covalt/(NHet+(2*homalt)))+(covref/(NHet+(2*homref)))) - p.sum<-sum(snp1het[,2])/sum(snp1het) - ll <-data.frame(p.all,p.sum,mean.a.homo=covalthomo/(2*homalt),mean.r.homo=covrefhomo/(2*homref),mean.a.het=sum(snp1het[,2])/NHet,mean.r.het=sum(snp1het[,1])/NHet,cv=cv) - } else { - ll<-NA - } - return(ll) - }) - } else { - p.cal<-apply(x.norm[,-c(1:4)],1,function(snp1){ - if(is.character(unname(unlist(snp1[1])))){ - y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) - y[,1]<-as.integer(y[,1]) - y[,2]<-as.integer(y[,2])} else {y<-snp1} - rs<-rowSums(y) - rs[rs==0]<-NA - cv<-sd(unlist(y),na.rm = TRUE)/mean(unlist(y),na.rm = TRUE) - rr1<-y[,2]/rs - snp1het<-y[-which(rr1 == 0 | rr1 == 1 | is.na(rr1)==TRUE),] - homalt<-sum(rr1==1,na.rm=TRUE) - homref<-sum(rr1==0,na.rm=TRUE) - covrefhomo<-sum(y[c(rr1 == 0,na.rm = TRUE),],na.rm = TRUE) - covalthomo<-sum(y[c(rr1 == 1,na.rm = TRUE),],na.rm = TRUE) - covalt<-sum(y[,2],na.rm = TRUE) - covref<-sum(y[,1],na.rm = TRUE) - NHet<-nrow(snp1het) - if(NHet>3){ - p.all<-(covalt/(NHet+(2*homalt)))/((covalt/(NHet+(2*homalt)))+(covref/(NHet+(2*homref)))) - p.sum<-sum(snp1het[,2])/sum(snp1het) - ll <-data.frame(p.all,p.sum,mean.a.homo=covalthomo/(2*homalt),mean.r.homo=covrefhomo/(2*homref),mean.a.het=sum(snp1het[,2])/NHet,mean.r.het=sum(snp1het[,1])/NHet,cv=cv) - } else { - ll<-NA - } - return(ll) - }) - } if(is.list(p.cal)){ p.cal<-do.call(rbind,p.cal) } else { @@ -315,16 +239,21 @@ allele.info_old<-function(X,x.norm=NULL,method=c("MedR","QN","pca","TMM","TMMex" par(mfrow=c(1,1)) } - if(verbose){ - message("calculating chi-square significance") - pvals<-lapply_pb(1:nrow(X),get.pvals,df=X,p.cal=p.cal) + if(parallel){ + pvals<-parLapply(1:nrow(X),get.pvals,df=X,p.cal=p.cal,cl=cl) + stopCluster(cl) } else { - pvals<-lapply(1:nrow(X),get.pvals,df=X,p.cal=p.cal) + if(verbose){ + message("calculating chi-square significance") + pvals<-lapply_pb(1:nrow(X),get.pvals,df=X,p.cal=p.cal) + } else { + pvals<-lapply(1:nrow(X),get.pvals,df=X,p.cal=p.cal) + } } pvals<-do.call(rbind,pvals) pvals<-cbind(X[,1:3],pvals) pvals<-na.omit(pvals) - ht<-sig.hets(pvals,plot = FALSE, verbose = verbose) + ht<-sig.hets(pvals,Fis,plot = FALSE, verbose = verbose) pvals<-data.frame(pvals,eH.pval=ht[,"eH.pval"],eH.delta=ht[,"eH.delta"],cv=na.omit(p.cal[,"cv"])) return(pvals) diff --git a/R/detect.R b/R/detect.R index 3b6dd7f..4dbbb69 100644 --- a/R/detect.R +++ b/R/detect.R @@ -6,19 +6,19 @@ chisq_test<-function(ob, ex){ } #1 calculate expected proportions of heterozygotes from real data -ex.prop<-function(rs,method=c("chi.sq","fisher")){ +ex.prop<-function(rs,Fis,method=c("chi.sq","fisher")){ n<-unlist(c(rs[4])) p<-(rs[1]+rs[2]/2)/n ob<-unlist(c(rs[1:3])) - eX<-unlist(c((p^2) * n,2*p*(1-p) * n,((1-p)^2) * n)) - delta <- rs[2]-(2*p*(1-p) * n) + eX<-unlist(c((p^2*(1-Fis)+p*Fis) * n,2*p*(1-p)*(1-Fis) * n,((1-p)^2*(1-Fis)+(1-p)*Fis) * n)) + delta <- rs[2]-(2*p*(1-p) * n *(1-Fis)) stat<-match.arg(method) pval<-switch(stat,fisher=suppressWarnings(fisher.test(cbind(ob,eX),workspace = 2e8))$p.value,chi.sq=suppressWarnings(chisq_test(ob,eX))) return(c(eX/n,pval,delta)) } #1 pvals for sig.hets when AD table is provided -get.eHpvals<-function(df,method=c("chi.sq","fisher")){ +get.eHpvals<-function(df,Fis,method=c("chi.sq","fisher")){ snp1<-df[-c(1:4)] y<-data.frame(stringr::str_split_fixed(snp1,",",n=2L)) y[,1]<-as.integer(y[,1]);y[,2]<-as.integer(y[,2]) @@ -36,8 +36,8 @@ get.eHpvals<-function(df,method=c("chi.sq","fisher")){ n<-Nsamp p<-(rs[1]+rs[2]/2)/n ob<-c(homref,NHet,homalt) - eX<-unlist(c((p^2) * n,2*p*(1-p) * n,((1-p)^2) * n)) - delta <- rs[2]-(2*p*(1-p) * n) + eX<-unlist(c((p^2*(1-Fis)+p*Fis) * n,2*p*(1-p)*(1-Fis) * n,((1-p)^2*(1-Fis)+(1-p)*Fis) * n)) + delta <- rs[2]-(2*p*(1-p) * n *(1-Fis)) stat<-match.arg(method) pval<-switch(stat,fisher=suppressWarnings(fisher.test(cbind(ob,eX),workspace = 2e8))$p.value,chi.sq=suppressWarnings(chisq_test(ob,eX))) ll<-c(medRatio,propHomAlt,propHet,eX/n,pval,delta) @@ -55,6 +55,7 @@ get.eHpvals<-function(df,method=c("chi.sq","fisher")){ #' #' @param a.info allele info table generated from filtered vcfs using the #' function \code{allele.info} or allele depth table generated from \code{hetTgen} +#' @param Fis numeric. Inbreeding coefficient calculated using \code{h.zygosity()} function #' @param method character. Method for testing significance. Fisher exact test #' (\code{fisher}) or Chi-square test (\code{chi.sq}) #' @param plot logical. Whether to plot the identified duplicated snps with @@ -65,24 +66,24 @@ get.eHpvals<-function(df,method=c("chi.sq","fisher")){ #' @return A matrix of expected heterozygote proportions from the observed #' data with p-value indicating significance of deviation. #' -#' @author Piyal Karunarathne, Pascal Milesi, Klaus Schliep +#' @author Piyal Karunarathne, Pascal Milesi, Klaus Schliep, Qiujie Zhou #' #' @examples #' \dontrun{data(alleleINF) #' AI <- alleleINF -#' duplicates<-sig.hets(AI,plot=TRUE)} +#' duplicates<-sig.hets(AI,plot=TRUE,Fis=0.1)} #' #' @importFrom grDevices col2rgb rgb #' @importFrom stats fisher.test median quantile rbinom sd smooth.spline #' chisq.test #' @importFrom utils setTxtProgressBar txtProgressBar -#' @importFrom colorspace rainbow_hcl +#' #' @export -sig.hets<-function(a.info,method=c("chi.sq","fisher"),plot=TRUE,verbose=TRUE,...){ +sig.hets<-function(a.info,Fis,method=c("chi.sq","fisher"),plot=TRUE,verbose=TRUE,...){ if(!any(colnames(a.info)=="NHomRef")){ if(verbose){message("assessing excess of heterozygotes") - df<-apply_pb(a.info,1,get.eHpvals,method=method) - } else {df<-apply(a.info,1,get.eHpvals,method=method)} + df<-apply(a.info,1,get.eHpvals,method=method,Fis=Fis) + } else {df<-apply(a.info,1,get.eHpvals,method=method,Fis=Fis)} df<-data.frame(do.call(rbind,df)) colnames(df)<-c("medRatio","propHomAlt","propHet","p2","het","q2","eH.pval","eH.delta") #df<-na.omit(df) @@ -92,9 +93,9 @@ sig.hets<-function(a.info,method=c("chi.sq","fisher"),plot=TRUE,verbose=TRUE,... method<-match.arg(method) if(verbose){ message("assessing excess of heterozygotes") - df<-data.frame(t(apply_pb(d,1,ex.prop,method=method))) + df<-data.frame(t(apply(d,1,ex.prop,method=method,Fis=Fis))) } else { - df<-data.frame(t(apply(d,1,ex.prop,method=method))) + df<-data.frame(t(apply(d,1,ex.prop,method=method,Fis=Fis))) } colnames(df)<-c("p2","het","q2","eH.pval","eH.delta") df$propHomAlt <- a.info$propHomAlt @@ -112,7 +113,7 @@ sig.hets<-function(a.info,method=c("chi.sq","fisher"),plot=TRUE,verbose=TRUE,... if(is.null(l$pch)) l$pch=19 if(is.null(l$xlim)) l$xlim=c(0,1) if(is.null(l$ylim)) l$ylim=c(0,1) - if(is.null(l$col)) cols<-makeTransparent(rainbow_hcl(2),alpha=0.3) else cols<-makeTransparent(l$col,alpha=0.3) + if(is.null(l$col)) cols<-makeTransparent(colorspace::rainbow_hcl(2),alpha=0.3) else cols<-makeTransparent(l$col,alpha=0.3) d$Color <- cols[1] d$Color [which(df$dup.stat=="deviant")]<- cols[2]#& df$delta > 0 @@ -137,7 +138,6 @@ sig.hets<-function(a.info,method=c("chi.sq","fisher"),plot=TRUE,verbose=TRUE,... #' @param \dots other graphical parameters to be passed to the function #' \code{plot} #' -#' @importFrom colorspace rainbow_hcl #' #' @return Returns no value, only plots proportion of heterozygotes vs allele #' median ratio separated by duplication status @@ -178,6 +178,7 @@ dup.plot<-function(ds,...){ #' See details. #' #' @param data data frame of the output of \code{allele.info} +#' @param Fis numeric. Inbreeding coefficient calculated using \code{h.zygosity()} function #' @param test character. type of test to be used for significance. See details #' @param intersection logical, whether to use the intersection of the methods #' specified in \code{test} (if more than one) @@ -208,14 +209,73 @@ dup.plot<-function(ds,...){ #' all allele combinations (\code{z.all, chi.all}) and the assumption of no #' probe bias p=0.5 (\code{z.05, chi.05}) #' -#' @author Piyal Karunarathne +#' @author Piyal Karunarathne Qiujie Zhou #' #' @examples #' \dontrun{data(alleleINF) -#' DD<-dupGet(alleleINF)} +#' DD<-dupGet(alleleINF,Fis=0.1,test=c("z.05","chi.05"))} #' #' @export -dupGet<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all"),intersection=FALSE,method=c("fisher","chi.sq"),plot=TRUE,verbose=TRUE,...){ +dupGet<-function(data,Fis,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all"),intersection=FALSE,method=c("fisher","chi.sq"),plot=TRUE,verbose=TRUE,...){ + #data check + data<-as.data.frame(data) + if(!any(colnames(data)=="propHet")){ + stop("please provide the data with the output of allele.info()") + } else { + if(!(any(colnames(data)=="eH.pval"))) { + ht<-sig.hets(data,Fis=Fis,plot=F,verbose=verbose) + } else { + ht <-data[,c("eH.pval","eH.delta")] + ht$dup.stat<-"non-deviant" + ht$dup.stat[which(ht$eH.pval < 0.05/nrow(ht) & ht$eH.delta > 0 )]<-"deviant" + } + + test<-match.arg(test,several.ok = TRUE) + if(length(test)==6){ + if(verbose){cat(paste0("categorizing deviant SNPs with \n excess of heterozygotes ","z.all & chi.all"))} + pp<-data[,c("z.all","chi.all")] + } else { + pp<-data.frame(data[,test]) + if(verbose){cat(paste0("categorizing deviant SNPs with \n", " excess of heterozygotes \n ",paste0(unlist(test),collapse = "\n ")))} + } + if(intersection){ + df<-matrix(NA,nrow = nrow(pp),ncol = ncol(pp)) + for(i in 1:ncol(pp)){ + df[which(pp[,i]<0.05/nrow(pp)),i]<-1 + } + pp$dup.stat<-"non-deviant" + pp$dup.stat[which(rowSums(df)==(ncol(pp)-1))]<-"deviant" + } else { + pp$dup.stat<-"non-deviant" + for(i in 1:ncol(pp)){ + pp$dup.stat[pp[,i]<0.05/nrow(pp)]<-"deviant" + } + } + pp$dup.stat[which(ht$dup.stat=="deviant")]<-"deviant" + pp<-data.frame(data[,1:10],eH.pval=ht[,"eH.pval"],eH.delta=ht[,"eH.delta"],dup.stat=pp$dup.stat) + + if(plot){ + l<-list(...) + if(is.null(l$cex)) l$cex=0.2 + if(is.null(l$pch)) l$pch=19 + if(is.null(l$xlim)) l$xlim=c(0,1) + if(is.null(l$ylim)) l$ylim=c(0,1) + if(is.null(l$alpha)) l$alpha=0.3 + if(is.null(l$col)) l$col<-makeTransparent(c("tomato","#2297E6FF"))#colorspace::terrain_hcl(12,c=c(65,0),l=c(45,90),power=c(1/2,1.5))[2] + Color <- rep(l$col[2],nrow(pp)) + Color[pp$dup.stat=="deviant"]<- l$col[1] + plot(pp$medRatio~pp$propHet, pch=l$pch, cex=l$cex,col=Color,xlim=l$xlim,ylim=l$ylim,frame=F, + ylab="Allele Median Ratio",xlab="Proportion of Heterozygotes") + legend("bottomright", c("deviants","non-deviants"), col = makeTransparent(l$col,alpha=1), pch=l$pch, + cex = 0.8,inset=c(0,1), xpd=TRUE, horiz=TRUE, bty="n") + } + } + return(pp) +} + + +#' @noRd +dupGet_0<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all"),intersection=FALSE,method=c("fisher","chi.sq"),plot=TRUE,verbose=TRUE,...){ #data check data<-as.data.frame(data) if(!any(colnames(data)=="propHet")){ @@ -272,6 +332,7 @@ dupGet<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all" return(pp) } + #' Find CNVs from deviants #' #' Categorize deviant and non-deviant into "singlets" and "duplicates" based on the statistical approaches specified by the user. @@ -317,12 +378,14 @@ dupGet<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all" #' about the threshold values. #' #' @param WGS logical. test parameter. See details +#' +#' @details #' \code{WGS} is a test parameter to include or exclude coefficient of variance #' (cv) in kmeans. For data sets with more homogeneous depth distribution, #' excluding cv improves CNV detection. If you're not certain about this, use #' \code{TRUE} which is the default. -#' @author Piyal Karunarathne +#' @author Piyal Karunarathne Qiujie Zhou #' #' @examples #' \dontrun{data(alleleINF) @@ -400,7 +463,7 @@ cnv<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all"),f } #' @noRd -cnv_old<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all"),filter=c("intersection","kmeans"),ft.threshold=0.05,plot=TRUE,verbose=TRUE,...){ +cnv_0<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all"),filter=c("intersection","kmeans"),ft.threshold=0.05,plot=TRUE,verbose=TRUE,...){ #data check data<-as.data.frame(data) data$z.het.sum<-abs(data$z.het.sum) diff --git a/R/raw_process.R b/R/raw_process.R index 0db9a49..4f45963 100644 --- a/R/raw_process.R +++ b/R/raw_process.R @@ -41,6 +41,7 @@ lapply_pb <- function(X, FUN, ...) res } + sapply_pb <- function(X, FUN, ...) { env <- environment() @@ -133,63 +134,52 @@ gg<-function(x){ #' a bi-allelic matrix when loci are multi-allelic #' #' @param h.table allele depth table generated from the function \code{hetTgen} -#' @param drop.multi logical. If TRUE, all the sites with more than two allele will be -#' dropped. If FALSE \code{default}, the the two alleles with the highest allele -#' frequencies will be retained. See details for more information. -#' @param AD logical. If TRUE, an allele depth table similar to \code{hetTgen} -#' output will be returned; If \code{FALSE}, individual AD values per SNP will be +#' @param AD logical. If TRUE a allele depth table similar to \code{hetTgen} +#' output will be returns; If \code{FALSE}, individual AD values per SNP will be #' returned in a list. #' @param verbose logical. Show progress +#' @param parallel logical. whether to parallelize the process +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' #' @return A data frame or a list of minimum allele frequency removed allele depth #' -#' @details -#' This function either drops all sites with more than two alleles or keep alleles \code{drop.multi=TRUE} -#' with the highest allele frequencies in the case of multi-allelic sites \code{drop.multi=FALSE}. -#' Note that, in the latter case, all sites will be retained, essentially assuming that multi-allelic sites -#' are also bi-allelic by dropping the alleles with minimum frequencies. -#' This function can only be used on the output of the \code{hetTgen()}. If you need to remove the multi-allelic sites -#' from the vcf, use \code{rCNV:::non_bi_rm()} function. We recommend using other programs (e.g., vcftools, GATK, etc.) for removing multiallelic sites faster. -#' -#' #' @author Piyal Karunarathne #' #' @examples #' \dontrun{mf<-maf(ADtable)} #' #' @export -maf<-function(h.table,drop.multi=FALSE,AD=TRUE,verbose=TRUE){ - h.table<-data.frame(h.table) - warning(paste0("There are ",sum(nchar(h.table$ALT)>1),"multi-allelic sites")) - if(drop.multi){ - h.table<-h.table[-which(nchar(h.table$ALT)>1),] # drop multi-allelic sites - message(paste0("\n",sum(nchar(h.table$ALT)>1),"multi-allelic sites were removed")) - return(h.table) - } else { # drop alleles with maf when multi-allelic - htab<-h.table[,-c(1:4)] +maf<-function(h.table,AD=TRUE,verbose=TRUE,parallel=FALSE){ + htab<-h.table[,-c(1:4)] + if(parallel){ + numCores<-detectCores()-1 + cl<-makeCluster(numCores) + glt<-parApply(cl=cl,htab,1,function(X){ + gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) + while(ncol(gg)>2){gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))]} + if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} + }) + stopCluster(cl) + } else { if(verbose){ - glt<-apply_pb(htab,1,function(X){suppressWarnings({ + glt<-apply_pb(htab,1,function(X){ gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) while(ncol(gg)>2){gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))]} if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} }) - }) }else{ - glt<-apply(htab,1,function(X){ + glt<-apply_pb(htab,1,function(X){ gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) if(ncol(gg)>2)gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))] if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} }) } - glt<-data.frame(h.table[,1:4],t(glt)) - colnames(glt)<-colnames(h.table) - message("Minimum frequency alleles were removed in multi-allelic sites") - return(glt) } + glt<-data.frame(h.table[,1:4],t(glt)) + colnames(glt)<-colnames(h.table) + return(glt) } - - #' Import VCF file #' #' Function to import raw single and multi-sample VCF files. @@ -226,6 +216,8 @@ readVCF <- function(vcf.file.path,verbose=FALSE){ #' \code{GT-012}:genotype in 012 format, \code{GT-AB}:genotype in AB format. #' Default \code{AD}, See details. #' @param verbose logical. whether to show the progress of the analysis +#' @param parallel logical. whether to parallelize the process +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' #' @details If you generate the depth values for allele by sample using GatK #' VariantsToTable option, use only -F CHROM -F POS -GF AD flags to generate @@ -237,78 +229,97 @@ readVCF <- function(vcf.file.path,verbose=FALSE){ #' #' @author Piyal Karunarathne, Klaus Schliep #' @importFrom utils setTxtProgressBar txtProgressBar +#' @importFrom parallel parSapply makeCluster clusterExport parLapply #' @examples #' vcf.file.path <- paste0(path.package("rCNV"), "/example.raw.vcf.gz") #' vcf <- readVCF(vcf.file.path=vcf.file.path) #' het.table<-hetTgen(vcf) #' #' @export -hetTgen <- function(vcf,info.type=c("AD","AD-tot","GT","GT-012","GT-AB","DP"),verbose=TRUE){ - if(inherits(vcf,"list")){vcf<-vcf$vcf} - if(inherits(vcf,"data.frame")){vcf<-data.table::data.table(vcf)} - if(any(nchar(vcf$ALT)>1)){ - warning("vcf file contains multi-allelic variants: \nonly bi-allelic SNPs allowed\nUse maf() to remove non-bi-allilic snps or drop minumum frequency alleles") +hetTgen <- function(vcf, info.type = c("AD", "AD-tot", "GT", "GT-012", "GT-AB", "DP"), verbose = TRUE, parallel = FALSE) { + if (inherits(vcf, "list")) { vcf <- vcf$vcf } + if (inherits(vcf, "data.frame")) { vcf <- data.table::data.table(vcf) } + if (any(nchar(vcf$ALT) > 1)) { + warning("vcf file contains multi-allelic variants: \nonly bi-allelic SNPs allowed\nUse maf() to remove non-bi-allilic snps or drop minimum frequency alleles") } - if(inherits(vcf,"list")){vcf<-vcf$vcf} - info.type<-match.arg(info.type) - itype<-substr(info.type,1,2) + info.type <- match.arg(info.type) + itype <- substr(info.type, 1, 2) - adn <- sapply(strsplit(unlist(vcf[,"FORMAT"], use.names = FALSE),":"), function(x)match(itype, x)) + adn <- sapply(strsplit(unlist(vcf[,"FORMAT"], use.names = FALSE), ":"), function(x) match(itype, x)) max_adn <- max(adn) + 1L ind <- cbind(seq_along(adn), adn) - xx<-data.frame(vcf[,-c(1:9)]) + xx <- data.frame(vcf[,-c(1:9)]) - if(info.type=="AD-tot"){ - h.table <- matrix(NA_integer_, nrow(xx), ncol(xx)) - if(verbose) message("generating total depth values") - for(i in seq_len(ncol(xx))){ - if(verbose){ - pb <- txtProgressBar(min = 0, max = ncol(xx), style = 3, width = 50, char = "=") - setTxtProgressBar(pb, i) - } + h.table <- matrix(NA, nrow(xx), ncol(xx)) + + process_column <- function(i) { + if (info.type == "AD-tot") { tmp <- stringr::str_split_fixed(xx[,i], ":", max_adn)[ind] tmp <- stringr::str_split_fixed(tmp, ",", 2L) - h.table[, i] <- as.numeric(tmp[,1]) + as.numeric(tmp[,2]) ## as.integer??? + as.numeric(tmp[,1]) + as.numeric(tmp[,2]) + } else { + tmp <- stringr::str_split_fixed(xx[,i], ":", max_adn)[ind] + if(info.type!="DP"){tmp[is.na(tmp) | tmp==".,."] <- "./."} + tmp } } - else { - h.table <- matrix(NA_character_, nrow(xx), ncol(xx)) - if(verbose) message("generating table") - for(i in seq_len(ncol(xx))){ - if(verbose){ - pb <- txtProgressBar(min = 0, max = ncol(xx), style = 3, width = 50, char = "=") + + if (verbose & parallel==FALSE) { + message("generating table") + pb <- txtProgressBar(min = 0, max = ncol(xx), style = 3, width = 50, char = "=") + } + + if (parallel) { + numCores <- detectCores() - 1 + cl <- makeCluster(numCores) + clusterExport(cl, varlist = c("xx", "max_adn", "ind", "info.type", "process_column"), envir = environment()) + + results <- parLapply(cl, seq_len(ncol(xx)), function(i) { + res <- process_column(i) + res + }) + stopCluster(cl) + + h.table <- do.call(cbind, results) + } else { + for (i in seq_len(ncol(xx))) { + h.table[, i] <- process_column(i) + if (verbose) { setTxtProgressBar(pb, i) } - h.table[, i] <- stringr::str_split_fixed(xx[,i], ":", max_adn)[ind] } - if(info.type!="DP"){h.table[is.na(h.table) | h.table==".,."]<-"./."} } -if(verbose) {close(pb)} - if(info.type=="GT-012"){ - h.table[h.table=="0/0"]<-0 - h.table[h.table=="1/1"]<-1 - h.table[h.table=="1/0" | h.table=="0/1"] <- 2 - h.table[h.table=="./."| h.table=="."]<-NA + + if (verbose) { + close(pb) + } + if (info.type == "GT-012") { + h.table[h.table == "0/0"] <- 0 + h.table[h.table == "1/1"] <- 1 + h.table[h.table == "1/0" | h.table == "0/1"] <- 2 + h.table[h.table == "./." | h.table == "."] <- NA } - if(info.type=="GT-AB"){ - h.table[h.table=="0/0"]<-"AA" - h.table[h.table=="1/1"]<-"BB" - h.table[h.table=="1/0" | h.table=="0/1"] <- "AB" - h.table[h.table=="./."| h.table=="."]<- -9 + if (info.type == "GT-AB") { + h.table[h.table == "0/0"] <- "AA" + h.table[h.table == "1/1"] <- "BB" + h.table[h.table == "1/0" | h.table == "0/1"] <- "AB" + h.table[h.table == "./." | h.table == "."] <- -9 } - if(info.type=="AD" ){ - h.table[h.table=="./." | h.table=="." | is.na(h.table)]<-"0,0" + if (info.type == "AD") { + h.table[h.table == "./." | h.table == "." | is.na(h.table)] <- "0,0" } - if(info.type=="DP"){ - h.table[is.character(h.table)]<-0 - h.table[is.na(h.table)]<-0 + if (info.type == "DP") { + h.table[is.character(h.table)] <- 0 + h.table[is.na(h.table)] <- 0 } - het.table<-as.data.frame(cbind(vcf[,c(1:3,5)],h.table)) - colnames(het.table)<-c("CHROM",colnames(vcf)[c(2,3,5,10:ncol(vcf))]) + + het.table <- as.data.frame(cbind(vcf[, c(1:3, 5)], h.table)) + colnames(het.table) <- c("CHROM", colnames(vcf)[c(2, 3, 5, 10:ncol(vcf))]) return(het.table) } + #' Get missingness of individuals in raw vcf #' #' A function to get the percentage of missing data of snps per SNP and per @@ -321,11 +332,13 @@ if(verbose) {close(pb)} #' @param plot logical. Whether to plot the missingness density with ninety #' five percent quantile #' @param verbose logical. Whether to show progress +#' @param parallel logical. whether to parallelize the process #' #' #' @author Piyal Karunarathne #' @importFrom graphics abline polygon #' @importFrom stats density +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' #' @returns #' Returns a data frame of allele depth or genotypes @@ -336,23 +349,54 @@ if(verbose) {close(pb)} #' missing<-get.miss(vcf,plot=TRUE) #' #' @export -get.miss<-function(data,type=c("samples","snps"),plot=TRUE,verbose=TRUE){ +get.miss<-function(data,type=c("samples","snps"),plot=TRUE,verbose=TRUE,parallel=FALSE){ + if(parallel){numCores<-detectCores()-1 + cl<-makeCluster(numCores)} + if(inherits(data,"list")){ vcf<-data$vcf - ndat<-hetTgen(vcf,"GT",verbose=verbose) + ndat<-hetTgen(vcf,"GT",verbose=verbose,parallel = parallel) } else {ndat<-data} type<-match.arg(type,several.ok = T) if(any(type=="samples")){ - ll<-t(apply(ndat[,-c(1:4)],2,function(x){ - cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) - })) + if(parallel){ + ll<-t(parApply(cl=cl,ndat[,-c(1:4)],2,function(x){ + cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) + })) + } else { + if(verbose){ + message("assessing % missing samples") + ll<-t(apply_pb(ndat[,-c(1:4)],2,function(x){ + cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) + })) + } else { + ll<-t(apply(ndat[,-c(1:4)],2,function(x){ + cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) + })) + } + } + ll<-data.frame(indiv=colnames(ndat)[-c(1:4)],n_miss=ll[,1],f_miss=ll[,2]) rownames(ll)<-NULL } if(any(type=="snps")){ - L<-apply(ndat[,-c(1:4)],1,function(x){ - cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) - }) + if(parallel){ + L<-parApply(cl=cl,ndat[,-c(1:4)],1,function(x){ + cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) + }) + } else { + if(verbose){ + message("assessing % missing sites") + L<-apply_pb(ndat[,-c(1:4)],1,function(x){ + cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) + }) + } else { + L<-apply(ndat[,-c(1:4)],1,function(x){ + cbind(sum(x=="./." | is.na(x) | x=="0,0" | x==".,."),sum(x=="./." | is.na(x) | x=="0,0" | x==".,.")/length(x)) + }) + } + + } if(is.list(L)){ L<-do.call(rbind,L) } else { L<-t(L)} @@ -386,10 +430,9 @@ get.miss<-function(data,type=c("samples","snps"),plot=TRUE,verbose=TRUE){ if(!exists("ll")){ll<-NULL} if(!exists("L")){L<-NULL} return(list(perSample=ll,perSNP=L)) + if(parallel){stopCluster(cl)} } - - #' Format genotype for BayEnv and BayPass #' #' This function generates necessary genotype count formats for BayEnv and @@ -403,7 +446,8 @@ get.miss<-function(data,type=c("samples","snps"),plot=TRUE,verbose=TRUE){ #' @param format character. output format i.e., for BayPass or BayEnv #' @param snp.subset numerical. number of randomly selected subsets of SNPs. #' \code{default = NULL} -#' subset +#' @param parallel logical. whether to parallelize the process +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' #' @return Returns a list with formatted genotype data: \code{$bayenv} - snps #' in horizontal format - for BayEnv (two lines per snp); \code{$baypass} - vertical format - for BayPass @@ -419,7 +463,10 @@ get.miss<-function(data,type=c("samples","snps"),plot=TRUE,verbose=TRUE){ #' GT<-gt.format(het.table,info)} #' #' @export -gt.format <- function(gt,info,format=c("benv","bpass"),snp.subset=NULL) { +gt.format <- function(gt,info,format=c("benv","bpass"),snp.subset=NULL,parallel=FALSE) { + if(parallel){numCores<-detectCores()-1 + cl<-makeCluster(numCores)} + if(is.character(gt)){ gt <-as.data.frame(fread(gt)) gts <-gt[,-c(1,2)] @@ -446,30 +493,57 @@ gt.format <- function(gt,info,format=c("benv","bpass"),snp.subset=NULL) { if(any(format=="benv")){ message("Formating BayEnv") - ppe<-lapply_pb(lgt,function(X){ - out<-apply(X,2,function(x){zero<-sum((unlist(stringr::str_split(x,"/||")))==0) - one<-sum((unlist(stringr::str_split(x,"/||")))==1) - ot<-as.data.frame(c(zero,one),col.names=F) - return(ot)},simplify = F) - out<-do.call(rbind,out) - colnames(out)<-NULL - return(out) - }) + + if(parallel){ + ppe<-parLapply(cl=cl,lgt,function(X){ + out<-apply(X,2,function(x){zero<-sum((unlist(stringr::str_split(x,"/||")))==0) + one<-sum((unlist(stringr::str_split(x,"/||")))==1) + ot<-as.data.frame(c(zero,one),col.names=F) + return(ot)},simplify = F) + out<-do.call(rbind,out) + colnames(out)<-NULL + return(out) + }) + } else { + ppe<-lapply_pb(lgt,function(X){ + out<-apply(X,2,function(x){zero<-sum((unlist(stringr::str_split(x,"/||")))==0) + one<-sum((unlist(stringr::str_split(x,"/||")))==1) + ot<-as.data.frame(c(zero,one),col.names=F) + return(ot)},simplify = F) + out<-do.call(rbind,out) + colnames(out)<-NULL + return(out) + }) + } + ppe<-do.call(cbind,ppe) - rownames(ppe)<-paste0(paste0(gt[,1],".",gt[,2]),"~",rep(c(1,2),nrow(gt))) + #rownames(ppe)<-paste0(paste0(gt[,1],".",gt[,2]),"~",rep(c(1,2),nrow(gt))) } else {ppe<-NULL} if(any(format=="bpass")){ message("Formating BayPass") - ppp<-lapply_pb(lgt,function(X){ - out<-apply(X,2,function(x){zero<-sum((unlist(stringr::str_split(x,"/||")))==0) - one<-sum((unlist(stringr::str_split(x,"/||")))==1) - ot<-c(zero,one) - return(ot)},simplify = F) - out<-do.call(rbind,out) - colnames(out)<-NULL - return(out) - }) + + if(parallel){ + ppp<-parLapply(cl=cl,lgt,function(X){ + out<-apply(X,2,function(x){zero<-sum((unlist(stringr::str_split(x,"/||")))==0) + one<-sum((unlist(stringr::str_split(x,"/||")))==1) + ot<-c(zero,one) + return(ot)},simplify = F) + out<-do.call(rbind,out) + colnames(out)<-NULL + return(out) + }) + } else { + ppp<-lapply_pb(lgt,function(X){ + out<-apply(X,2,function(x){zero<-sum((unlist(stringr::str_split(x,"/||")))==0) + one<-sum((unlist(stringr::str_split(x,"/||")))==1) + ot<-c(zero,one) + return(ot)},simplify = F) + out<-do.call(rbind,out) + colnames(out)<-NULL + return(out) + }) + } ppp<-do.call(cbind,ppp) colnames(ppp)<-paste0(rep(names(lgt),each=2),"~",rep(c(1,2),ncol(ppp)/2)) rownames(ppp)<-paste0(gt[,1],".",gt[,2]) @@ -481,6 +555,7 @@ gt.format <- function(gt,info,format=c("benv","bpass"),snp.subset=NULL) { } else { chu.p <- NULL} } else {ppp<-NULL} return(list(baypass=ppp,bayenv=ppe,sub.bp=chu.p,pop=as.character(pp))) + if(parallel){stopCluster(cl)} } @@ -500,6 +575,9 @@ gt.format <- function(gt,info,format=c("benv","bpass"),snp.subset=NULL) { #' @param odd.correct logical, to correct for odd number anomalies in AD values. #' default \code{TRUE} #' @param verbose logical. show progress. Default \code{TRUE} +#' @param parallel logical. whether to parallelize the process +#' +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' #' @return Returns the coverage corrected allele depth table similar to the #' output of \code{hetTgen} @@ -510,9 +588,12 @@ gt.format <- function(gt,info,format=c("benv","bpass"),snp.subset=NULL) { #' \dontrun{adc<-ad.correct(ADtable)} #' #' @export -ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE){ +ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE,parallel=FALSE){ + if(parallel){numCores <- detectCores() - 1 + cl <- makeCluster(numCores)} + if(!is.null(gt.table)){ - if(verbose){ + if(verbose & parallel==FALSE){ message("correcting genotype mis-classification") Nw.ad<-lapply_pb(5:ncol(het.table),function(n){ X<-het.table[,n] @@ -528,18 +609,18 @@ ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE){ Nw.ad<-do.call(cbind,Nw.ad) Nw.ad<-data.frame(het.table[,1:4],Nw.ad) colnames(Nw.ad)<-colnames(het.table) - } else { - Nw.ad<-lapply(5:ncol(het.table),function(n){ + } else if (parallel) { + Nw.ad<-parLapply(cl=cl,5:ncol(het.table),function(n,het.table,gt.table){ X<-het.table[,n] x<-gt.table[,n] Y<-data.frame(do.call(cbind,data.table::tstrsplit(X,","))) y<-which(x=="0/0" & Y$X2>0) - rr<-range(Y$X2[y])#range of depth in mis classified snps + rr<-range(Y$X2[y])#range of depth in miss classified snps Y[y,]<-0 - ll<-length(y)#number of mis classifications + ll<-length(y)#number of miss classifications out<-paste0(Y$X1,",",Y$X2) return(out) - }) + },het.table=het.table,gt.table=gt.table) Nw.ad<-do.call(cbind,Nw.ad) Nw.ad<-data.frame(het.table[,1:4],Nw.ad) colnames(Nw.ad)<-colnames(het.table) @@ -549,7 +630,7 @@ ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE){ } X<-data.frame(het.table[,-c(1:4)]) if(odd.correct){ - if(verbose){ + if(verbose & parallel==FALSE){ message("correcting odd number anomalies") vv<-apply_pb(X,2,function(sam){ sam<-unname(unlist(sam)) @@ -566,8 +647,8 @@ ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE){ if(inherits(vv,"list")){ vv<-do.call(cbind,vv) } - } else { - vv<-apply(X,2,function(sam){ + } else if (parallel) { + vv<-parApply(cl,X,2,function(sam){ dl<-lapply(sam,function(y){ l<-as.numeric(unlist(strsplit(y,","))) if((sum(l,na.rm=TRUE)%%2)!=0){ @@ -583,6 +664,6 @@ ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE){ } return(cbind(het.table[,1:4],vv)) } else { return(het.table)} + if(parallel){stopCluster(cl)} } - diff --git a/R/statistics.R b/R/statistics.R index a789df7..823c321 100644 --- a/R/statistics.R +++ b/R/statistics.R @@ -1,79 +1,4 @@ ## VCF statistics ### -# old gt2 function -gt2_old<-function(x,gg,freq){two<-gg[,x[1]];one<-gg[,x[2]] -vp<-NULL -if(x[1]==x[2]){ - for(i in seq_along(one)){ - vp[i]<-((one[i]*one[i])-((1+(2*freq[i]))*one[i])+(2*freq[i]*freq[i]))/(2*freq[i]*(1-freq[i])) - } - vp<-na.omit(vp) - Ajk<-((sum(vp))/length(vp))+1 -} else { - for(i in seq_along(one)){ - vp[i]<-((one[i]-(2*freq[i]))*(two[i]-(2*freq[i])))/(2*freq[i]*(1-freq[i])) - } - vp<-na.omit(vp) - Ajk<-sum(vp)/length(vp) -} -V<-c(colnames(gg)[x[2]],colnames(gg)[x[1]],Ajk) -return(V)} - -# old relatedness function -relatedness_old<-function(vcf,plot=TRUE,threshold=0.5,verbose=TRUE){ - if(!inherits(vcf,"list")) { - if(any(colnames(vcf)=="REF")){gtt<-hetTgen(vcf,"GT",verbose=verbose)} - else {gtt <-vcf} - } else { - vcf<-vcf$vcf - gtt<-hetTgen(vcf,"GT",verbose=verbose) - } - gt<-gtt[,-c(1:4)] - freq<-apply(gt,1,function(xx){aal<-unlist(strsplit(as.character(xx),"/")) - return(length(which(aal=="1"))/(length(which(aal=="0"))+length(which(aal=="1"))))}) - - gg<-apply(gt,2,function(x){XX<-rep(NA,length(x)) - XX[which(x=="0/0")]<-0 - XX[which(x=="1/1")]<-2 - XX[which(x=="1/0" | x=="0/1")]<-1 - XX}) - - comb<-expand.grid(1:ncol(gg),1:ncol(gg)) - if(verbose){ - message("assessing pairwise relatedness") - T2<-apply_pb(comb,1,gt2_old,gg=gg,freq=freq) - } else { - T2<-apply(comb,1,gt2_old,gg=gg,freq=freq) - } - T2<-data.frame(t(T2)) - T2[,3]<-as.numeric(T2[,3]) - colnames(T2)<-c("indv1","indv2","relatedness_Ajk") - if(plot){ - same = T2[T2[,1] == T2[,2], ] - diff = T2[T2[,1] != T2[,2], ] - outliers = diff[diff[,3] > threshold, ] - - opars<-par(no.readonly = TRUE) - on.exit(par(opars)) - - par(mfrow=c(3, 1)) - hist(same[,3], - main="Samples against themselves", - col="grey", - breaks=seq(-1000, 1000, by=0.05), - xlim=c(-1, 2)) - hist(diff[,3], - main="Samples among themselves", - col="grey", - breaks=seq(-100, 100, by=0.05), - xlim=c(-1, 2)) - hist(outliers[,3], - main="Outlier samples", - col="grey", - breaks=seq(-100, 100, by=0.05), - xlim=c(-1, 2)) - } - return(T2) -} #1. get heterozygosity per individual ## P(Homo) = F + (1-F)P(Homo by chance) @@ -85,17 +10,23 @@ relatedness_old<-function(vcf,plot=TRUE,threshold=0.5,verbose=TRUE){ ## O = NF + (1-F)E ## Which rearranges to give ## F = (O-E)/(N-E) -het.sity <- function(ind){ +het.sity1 <- function(ind){ ### calculate expected number of hets tab <- table(factor(ind, levels=c("0/0", "1/1", "1/0", "0/1", "./.", "."))) - O <- tab["0/0"] + tab["1/1"] N <- tab["0/0"] + tab["1/1"] + tab["0/1"] + tab["1/0"] p <- (2 * tab["0/0"] + tab["0/1"] + tab["1/0"])/ (2*N) q <- 1 - p - E <-(p^2+q^2)*N + E <-(p^2+q^2) + return(E) +} + +het.sity2 <- function(ind,eh){ + tab <- table(factor(ind, levels=c("0/0", "1/1", "1/0", "0/1", "./.", "."))) + O <- tab["0/0"] + tab["1/1"] + N <- tab["0/0"] + tab["1/1"] + tab["0/1"] + tab["1/0"] + E <- sum(eh[which(ind == "0/0" | ind == "1/1" | ind == "1/0" | ind == "0/1" )]) FF <-(O-E)/(N-E) return(c(O,E,N,FF)) } - #2 relatedness (according to Yang et al. 2010 equation no. 6 in the paper) #1. get alt allele freq for all snps #2. get genotype at each snp per individual gt=genotype1+genotype2 [1/0] >> aa=0,Aa=1, AA=2 @@ -150,7 +81,9 @@ gt2 <- function(x,gg,freq){ #' @param pops character. A list of population names with the same length and #' order as the number of samples in the vcf #' @param verbose logical. Show progress +#' @param parallel logical. Parallelize the process #' @importFrom graphics boxplot +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' @return Returns a data frame of expected \dQuote{E(Hom)} and observed #' \dQuote{O(Hom)} homozygotes with their inbreeding coefficients. #' @@ -163,7 +96,7 @@ gt2 <- function(x,gg,freq){ #' hzygots<-h.zygosity(vcf,plot=TRUE,pops=pp)} #' #' @export -h.zygosity<-function(vcf,plot=FALSE,pops=NA,verbose=TRUE){ +h.zygosity<-function(vcf,plot=FALSE,pops=NA,verbose=TRUE,parallel=FALSE){ if(inherits(vcf,"list")) { vcf<-vcf$vcf gtt<-hetTgen(vcf,"GT",verbose=verbose) @@ -171,11 +104,22 @@ h.zygosity<-function(vcf,plot=FALSE,pops=NA,verbose=TRUE){ if(any(colnames(vcf)=="REF")){gtt<-hetTgen(vcf,"GT",verbose=verbose)} else {gtt <-vcf} } - if(verbose){ - message("assessing per sample homozygosity") - hh<-t(apply_pb(gtt[,-c(1:4)],2,het.sity)) + if(parallel){ + numCores<-detectCores()-1 + cl<-makeCluster(numCores) + clusterExport(cl, c("het.sity1","het.sity2")) + eh<-unlist(t(parApply(cl=cl,gtt[,-c(1:4)],1,het.sity1))) + hh<-t(parApply(cl=cl,gtt[,-c(1:4)],2,het.sity2,eh=eh)) + stopCluster(cl) } else { - hh<-t(apply(gtt[,-c(1:4)],2,het.sity)) + if(verbose){ + message("assessing per sample homozygosity") + eh<-unlist(apply_pb(gtt[,-c(1:4)],1,het.sity1)) + hh<-t(apply_pb(gtt[,-c(1:4)],2,het.sity2,eh=eh)) + } else { + eh<-unlist(t(apply(gtt[,-c(1:4)],1,het.sity1))) + hh<-t(apply(gtt[,-c(1:4)],2,het.sity2,eh=eh)) + } } hh<-data.frame(rownames(hh),hh) colnames(hh)<-c("ind","O(Hom)","E(Hom)","total","Fis") @@ -208,7 +152,9 @@ h.zygosity<-function(vcf,plot=FALSE,pops=NA,verbose=TRUE){ #' @param threshold numerical. A value indicating to filter the individuals of #' relatedness among themselves. Default \code{0.5} (siblings) #' @param verbose logical. Show progress. +#' @param parallel logical. Parallelize the process #' @importFrom graphics hist +#' @importFrom parallel parApply detectCores parLapply stopCluster makeCluster #' #' @return #' A data frame of individuals and relatedness score \eqn{A_{jk}} @@ -230,7 +176,7 @@ h.zygosity<-function(vcf,plot=FALSE,pops=NA,verbose=TRUE){ #' relate<-relatedness(vcf) #' #' @export -relatedness<-function(vcf,plot=TRUE,threshold=0.5,verbose=TRUE){ +relatedness<-function(vcf,plot=TRUE,threshold=0.5,verbose=TRUE,parallel=FALSE){ if(inherits(vcf,"data.frame") & any(colnames(vcf)=="INDV1")){ T2<-vcf } else { @@ -239,7 +185,7 @@ relatedness<-function(vcf,plot=TRUE,threshold=0.5,verbose=TRUE){ else {gtt <-vcf} } else { vcf<-vcf$vcf - gtt<-hetTgen(vcf,"GT",verbose=verbose) + gtt<-hetTgen(vcf,"GT",verbose=verbose,parallel=parallel) } gt<-gtt[,-c(1:4)] freq<-apply(gt,1,function(xx){aal<-stringr::str_split(xx,"/",simplify = T) @@ -252,12 +198,19 @@ relatedness<-function(vcf,plot=TRUE,threshold=0.5,verbose=TRUE){ XX}) comb<-expand.grid(1:ncol(gg),1:ncol(gg)) - if(verbose){ - message("assessing pairwise relatedness") - T2<-apply_pb(comb,1,gt2,gg=gg,freq=freq) - } else { - T2<-apply(comb,1,gt2,gg=gg,freq=freq) - } + if(parallel){ + numCores<-detectCores()-1 + cl<-makeCluster(numCores) + T2<-parApply(cl=cl,comb,1,gt2,gg=gg,freq=freq) + stopCluster(cl) + } else { + if(verbose){ + message("assessing pairwise relatedness") + T2<-apply_pb(comb,1,gt2,gg=gg,freq=freq) + } else { + T2<-apply(comb,1,gt2,gg=gg,freq=freq) + } + } T2<-data.frame(t(T2)) T2[,3]<-as.numeric(T2[,3]) colnames(T2)<-c("indv1","indv2","relatedness_Ajk") diff --git a/cran-comments.md b/cran-comments.md index 4217bf8..3b6f49e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,12 +1,14 @@ ## R CMD check results -This is the second update of the package. Several changes have been made to two major functions and new updates have been added. They're as follows: +This is the third update of the package. Several changes have been made to two major functions and new updates have been added. They're as follows: + +1. parallelization enabled with parallel package +2. dupValidate function revised +3. per site Fis added to deviant detection +4. vstPermutation function added +5. maf modified to remove multi-allelic sites +5. FIT correction added -1. relatedness function optimized -2. bugs related to TMM normalization fixed in cpm.normal function -3. paper on the rCNV package was published and the citation is added to the package -4. web page updated -5. small bugs fixed in the sig.het function On check: diff --git a/man/ad.correct.Rd b/man/ad.correct.Rd index e046508..67f6375 100644 --- a/man/ad.correct.Rd +++ b/man/ad.correct.Rd @@ -4,7 +4,13 @@ \alias{ad.correct} \title{Correct allele depth values} \usage{ -ad.correct(het.table, gt.table = NULL, odd.correct = TRUE, verbose = TRUE) +ad.correct( + het.table, + gt.table = NULL, + odd.correct = TRUE, + verbose = TRUE, + parallel = FALSE +) } \arguments{ \item{het.table}{allele depth table generated from the function @@ -16,6 +22,8 @@ ad.correct(het.table, gt.table = NULL, odd.correct = TRUE, verbose = TRUE) default \code{TRUE}} \item{verbose}{logical. show progress. Default \code{TRUE}} + +\item{parallel}{logical. whether to parallelize the process} } \value{ Returns the coverage corrected allele depth table similar to the diff --git a/man/allele.info.Rd b/man/allele.info.Rd index 02036a4..e54c59a 100644 --- a/man/allele.info.Rd +++ b/man/allele.info.Rd @@ -7,6 +7,7 @@ allele.info( X, x.norm = NULL, + Fis, method = c("MedR", "QN", "pca", "TMM", "TMMex"), logratioTrim = 0.3, sumTrim = 0.05, @@ -14,6 +15,7 @@ allele.info( Acutoff = -1e+10, plot.allele.cov = TRUE, verbose = TRUE, + parallel = FALSE, ... ) } @@ -24,6 +26,8 @@ allele.info( \item{x.norm}{a data frame of normalized allele coverage, output of \code{cpm.normal}. If not provided, calculated using \code{X}.} +\item{Fis}{numeric. Inbreeding coefficient calculated using \code{h.zygosity()} function} + \item{method}{character. method to be used for normalization (see \code{cpm.normal} details). Default \code{TMM}} @@ -43,6 +47,8 @@ coverage in homozygotes and heterozygotes} \item{verbose}{logical, whether to print progress} +\item{parallel}{logical. whether to parallelize the process} + \item{\dots}{further arguments to be passed to \code{plot}} } \value{ diff --git a/man/cnv.Rd b/man/cnv.Rd index dacacc1..36e6f27 100644 --- a/man/cnv.Rd +++ b/man/cnv.Rd @@ -25,11 +25,7 @@ See details} \item{filter}{character. Type of filter to be used for filtering CNVs. default \code{kmeans}. See details.} -\item{WGS}{logical. test parameter. See details -\code{WGS} is a test parameter to include or exclude coefficient of variance -(cv) in kmeans. For data sets with more homogeneous depth distribution, -excluding cv improves CNV detection. If you're not certain about this, use -\code{TRUE} which is the default.} +\item{WGS}{logical. test parameter. See details} \item{ft.threshold}{confidence interval for filtering \code{default = 0.05}} @@ -68,6 +64,11 @@ clustering of the provided \code{test}s should be used in filtering CNVs. The intersection uses threshold values for filtering and kmeans use unsupervised clustering. Kmeans clustering is recommended if one is uncertain about the threshold values. + +\code{WGS} is a test parameter to include or exclude coefficient of variance +(cv) in kmeans. For data sets with more homogeneous depth distribution, +excluding cv improves CNV detection. If you're not certain about this, use +\code{TRUE} which is the default. } \examples{ \dontrun{data(alleleINF) @@ -75,5 +76,5 @@ DD<-cnv(alleleINF)} } \author{ -Piyal Karunarathne +Piyal Karunarathne Qiujie Zhou } diff --git a/man/dupGet.Rd b/man/dupGet.Rd index 836be68..4dfd466 100644 --- a/man/dupGet.Rd +++ b/man/dupGet.Rd @@ -6,6 +6,7 @@ \usage{ dupGet( data, + Fis, test = c("z.het", "z.05", "z.all", "chi.het", "chi.05", "chi.all"), intersection = FALSE, method = c("fisher", "chi.sq"), @@ -17,6 +18,8 @@ dupGet( \arguments{ \item{data}{data frame of the output of \code{allele.info}} +\item{Fis}{numeric. Inbreeding coefficient calculated using \code{h.zygosity()} function} + \item{test}{character. type of test to be used for significance. See details} \item{intersection}{logical, whether to use the intersection of the methods @@ -61,9 +64,9 @@ probe bias p=0.5 (\code{z.05, chi.05}) } \examples{ \dontrun{data(alleleINF) -DD<-dupGet(alleleINF)} +DD<-dupGet(alleleINF,Fis=0.1,test=c("z.05","chi.05"))} } \author{ -Piyal Karunarathne +Piyal Karunarathne Qiujie Zhou } diff --git a/man/get.miss.Rd b/man/get.miss.Rd index 11e29e1..cc53b82 100644 --- a/man/get.miss.Rd +++ b/man/get.miss.Rd @@ -4,7 +4,13 @@ \alias{get.miss} \title{Get missingness of individuals in raw vcf} \usage{ -get.miss(data, type = c("samples", "snps"), plot = TRUE, verbose = TRUE) +get.miss( + data, + type = c("samples", "snps"), + plot = TRUE, + verbose = TRUE, + parallel = FALSE +) } \arguments{ \item{data}{a list containing imported vcf file using \code{readVCF} or @@ -17,6 +23,8 @@ genotype table generated using \code{hetTgen}} five percent quantile} \item{verbose}{logical. Whether to show progress} + +\item{parallel}{logical. whether to parallelize the process} } \value{ Returns a data frame of allele depth or genotypes diff --git a/man/gt.format.Rd b/man/gt.format.Rd index 6a4a92d..70ba1ba 100644 --- a/man/gt.format.Rd +++ b/man/gt.format.Rd @@ -4,7 +4,13 @@ \alias{gt.format} \title{Format genotype for BayEnv and BayPass} \usage{ -gt.format(gt, info, format = c("benv", "bpass"), snp.subset = NULL) +gt.format( + gt, + info, + format = c("benv", "bpass"), + snp.subset = NULL, + parallel = FALSE +) } \arguments{ \item{gt}{multi-vector. an imported data.frame of genotypes or genotype @@ -17,8 +23,9 @@ It must have \dQuote{sample} and \dQuote{population} columns} \item{format}{character. output format i.e., for BayPass or BayEnv} \item{snp.subset}{numerical. number of randomly selected subsets of SNPs. -\code{default = NULL} -subset} +\code{default = NULL}} + +\item{parallel}{logical. whether to parallelize the process} } \value{ Returns a list with formatted genotype data: \code{$bayenv} - snps diff --git a/man/h.zygosity.Rd b/man/h.zygosity.Rd index 6477f08..2dea7e6 100644 --- a/man/h.zygosity.Rd +++ b/man/h.zygosity.Rd @@ -4,7 +4,7 @@ \alias{h.zygosity} \title{Determine per sample heterozygosity and inbreeding coefficient} \usage{ -h.zygosity(vcf, plot = FALSE, pops = NA, verbose = TRUE) +h.zygosity(vcf, plot = FALSE, pops = NA, verbose = TRUE, parallel = FALSE) } \arguments{ \item{vcf}{an imported vcf file in in a list using @@ -18,6 +18,8 @@ for populations. A list of populations must be provided} order as the number of samples in the vcf} \item{verbose}{logical. Show progress} + +\item{parallel}{logical. Parallelize the process} } \value{ Returns a data frame of expected \dQuote{E(Hom)} and observed diff --git a/man/hetTgen.Rd b/man/hetTgen.Rd index 401c86c..79d975b 100644 --- a/man/hetTgen.Rd +++ b/man/hetTgen.Rd @@ -7,7 +7,8 @@ hetTgen( vcf, info.type = c("AD", "AD-tot", "GT", "GT-012", "GT-AB", "DP"), - verbose = TRUE + verbose = TRUE, + parallel = FALSE ) } \arguments{ @@ -19,6 +20,8 @@ allele depth, \code{DP}=unfiltered depth (sum), \code{GT}: genotype, Default \code{AD}, See details.} \item{verbose}{logical. whether to show the progress of the analysis} + +\item{parallel}{logical. whether to parallelize the process} } \value{ Returns a data frame of allele depth, genotype of SNPs for all the diff --git a/man/maf.Rd b/man/maf.Rd index 119d87c..16779da 100644 --- a/man/maf.Rd +++ b/man/maf.Rd @@ -4,20 +4,18 @@ \alias{maf} \title{Remove MAF allele} \usage{ -maf(h.table, drop.multi = FALSE, AD = TRUE, verbose = TRUE) +maf(h.table, AD = TRUE, verbose = TRUE, parallel = FALSE) } \arguments{ \item{h.table}{allele depth table generated from the function \code{hetTgen}} -\item{drop.multi}{logical. If TRUE, all the sites with more than two allele will be -dropped. If FALSE \code{default}, the the two alleles with the highest allele -frequencies will be retained. See details for more information.} - -\item{AD}{logical. If TRUE, an allele depth table similar to \code{hetTgen} -output will be returned; If \code{FALSE}, individual AD values per SNP will be +\item{AD}{logical. If TRUE a allele depth table similar to \code{hetTgen} +output will be returns; If \code{FALSE}, individual AD values per SNP will be returned in a list.} \item{verbose}{logical. Show progress} + +\item{parallel}{logical. whether to parallelize the process} } \value{ A data frame or a list of minimum allele frequency removed allele depth @@ -26,14 +24,6 @@ A data frame or a list of minimum allele frequency removed allele depth A function to remove the alleles with minimum allele frequency and keep only a bi-allelic matrix when loci are multi-allelic } -\details{ -This function either drops all sites with more than two alleles or keep alleles \code{drop.multi=TRUE} -with the highest allele frequencies in the case of multi-allelic sites \code{drop.multi=FALSE}. -Note that, in the latter case, all sites will be retained, essentially assuming that multi-allelic sites -are also bi-allelic by dropping the alleles with minimum frequencies. -This function can only be used on the output of the \code{hetTgen()}. If you need to remove the multi-allelic sites -from the vcf, use \code{rCNV:::non_bi_rm()} function. We recommend using other programs (e.g., vcftools, GATK, etc.) for removing multiallelic sites faster. -} \examples{ \dontrun{mf<-maf(ADtable)} diff --git a/man/relatedness.Rd b/man/relatedness.Rd index fbe1b27..031ccee 100644 --- a/man/relatedness.Rd +++ b/man/relatedness.Rd @@ -4,7 +4,13 @@ \alias{relatedness} \title{Determine pairwise relatedness} \usage{ -relatedness(vcf, plot = TRUE, threshold = 0.5, verbose = TRUE) +relatedness( + vcf, + plot = TRUE, + threshold = 0.5, + verbose = TRUE, + parallel = FALSE +) } \arguments{ \item{vcf}{an imported vcf file in a list using \code{readVCF} @@ -17,6 +23,8 @@ themselves, among themselves and outliers} relatedness among themselves. Default \code{0.5} (siblings)} \item{verbose}{logical. Show progress.} + +\item{parallel}{logical. Parallelize the process} } \value{ A data frame of individuals and relatedness score \eqn{A_{jk}} diff --git a/man/sig.hets.Rd b/man/sig.hets.Rd index 43eb52f..35de7f4 100644 --- a/man/sig.hets.Rd +++ b/man/sig.hets.Rd @@ -6,6 +6,7 @@ \usage{ sig.hets( a.info, + Fis, method = c("chi.sq", "fisher"), plot = TRUE, verbose = TRUE, @@ -16,6 +17,8 @@ sig.hets( \item{a.info}{allele info table generated from filtered vcfs using the function \code{allele.info} or allele depth table generated from \code{hetTgen}} +\item{Fis}{numeric. Inbreeding coefficient calculated using \code{h.zygosity()} function} + \item{method}{character. Method for testing significance. Fisher exact test (\code{fisher}) or Chi-square test (\code{chi.sq})} @@ -38,9 +41,9 @@ only on the excess of heterozygotes. \examples{ \dontrun{data(alleleINF) AI <- alleleINF -duplicates<-sig.hets(AI,plot=TRUE)} +duplicates<-sig.hets(AI,plot=TRUE,Fis=0.1)} } \author{ -Piyal Karunarathne, Pascal Milesi, Klaus Schliep +Piyal Karunarathne, Pascal Milesi, Klaus Schliep, Qiujie Zhou } diff --git a/vignettes/rCNV.Rmd b/vignettes/rCNV.Rmd index 5f78c24..ec8bf5a 100644 --- a/vignettes/rCNV.Rmd +++ b/vignettes/rCNV.Rmd @@ -34,7 +34,8 @@ This tutorial provides a complete guide to detecting CNVs from SNPs data using r All the data used in this tutorial is available either withing the package or from the GitHub repository at . -We provide here a complete workflow of detecting CNVs from SNPs, starting with raw (unfiltered) VCF as in the number 1 of the workflow chart. However, if one believes that they have filtered VCFs for the parameters we highlight (see Fig.1 and sub-sections 1.2 and 1.3), they can start from the number 2 in the workflow shown above. +We provide here a complete workflow of detecting CNVs from SNPs, starting with raw (unfiltered) VCF as in the number 1 of the workflow chart. However, if one believes that they have filtered VCFs for the parameters we highlight (see Fig.1 and sub-sections 1.2 and 1.3), they can start from the number 2 in the workflow shown above. +*NOTE*: In the latest iteration of the rCNV package, the functions `sig.hets()` and `dupGet()` use use a sample based inbreeding coefficient *(Fis)* to improve the accuracy of the deviant detection. This *Fis* is calculated using the `h.zygosity()` function; see below for more information. ```{r, eval=FALSE} # Start by installing the package if you haven't already done so. @@ -211,7 +212,7 @@ The rCNV function `dupGet()` delivers easy to pick model selection to detect dev ```{r, eval=FALSE} # Run this code for a demonstration of the detection -dvs<-dupGet(alleleINF,test = c("z.05","chi.05")) +dvs<-dupGet(alleleINF,test = c("z.05","chi.05"),Fis=0.11) # z score and chi-square values given p=0.05 is used here because the data is RADseq generated and probe-biase is neglegible head(dvs) @@ -228,11 +229,15 @@ As such, `dupGet()` function implements three methods to flag the deviant allele | **NOTE: users can select the expected allele frequency (p: \code{test}) of the alternative allele based on the sequencing method and the probe bias therein. See our publication for more details on probe bias. ```{r,eval=FALSE} -deviants<-dupGet(alleleINF,test = c("z.all","chi.all"),plot=TRUE,verbose = TRUE) +# in the new version of the package dupGet function requires Fis value for a more accurate detection of the deviants +# this is obtained by using the h.zygosity() function as below +hz<-h.zygosity(parrot,verbose = FALSE) #parrot is the filtered vcf file +Fis<-mean(hz$Fis,na.rm = TRUE) +deviants<-dupGet(alleleINF,Fis=Fis,test = c("z.all","chi.all"),plot=TRUE,verbose = TRUE) head(deviants) ``` ```{r,echo=FALSE} -deviants<-dupGet(alleleINF,plot=FALSE,verbose = FALSE) +deviants<-dupGet(alleleINF,Fis=Fis,plot=FALSE,verbose = FALSE) head(deviants) ```