tutorial updated

piyalkarum · Sep 16, 2024 · 8483bd7 · 8483bd7
1 parent 1515224
commit 8483bd7
Show file tree

Hide file tree

Showing 106 changed files with 11,727 additions and 1,730 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,18 +8,19 @@ Authors@R: c(person(given="Piyal",family="Karunarathne", email="piyalkarumail@ya
 Maintainer: Piyal Karunarathne <piyalkarumail@yahoo.com>
 Description: Functions in this package will import filtered variant call format (VCF) files of SNPs data and generate data sets to detect copy number variants, visualize them and do downstream analyses with copy number variants(e.g. Environmental association analyses).
 License: AGPL (>= 3)
-Imports: data.table, graphics, colorspace, R.utils, qgraph, stringr
+Imports: data.table, graphics, colorspace, R.utils, qgraph, stringr, Rcpp
+LinkingTo: Rcpp
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Depends: 
     R (>= 3.6.0)
 Suggests: 
     rmarkdown,
     knitr,
     testthat (>= 3.0.0),
     covr
-VignetteBuilder: knitr
+##VignetteBuilder: knitr
 Config/testthat/edition: 3
 Roxygen: list(markdown = TRUE)
 URL: https://piyalkarum.github.io/rCNV/, https://cran.r-project.org/package=rCNV

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,11 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+process_snp <- function(X, AD) {
+    .Call('_rCNV_process_snp', PACKAGE = 'rCNV', X, AD)
+}
+
+process_column1 <- function(column, info_type, max_adn, ind_adn) {
+    .Call('_rCNV_process_column1', PACKAGE = 'rCNV', column, info_type, max_adn, ind_adn)
+}
+
diff --git a/R/bias.det.R b/R/bias.det.R
@@ -104,8 +104,12 @@ get.pvals<-function(x,df,p.cal){
 #' }
 #'
 #' @examples
-#' \dontrun{data(ADtable)
-#' AI<-allele.info(ADtable,x.norm=ADnorm)}
+#' \dontrun{
+#' hz<-h.zygosity(vcf,verbose=FALSE)
+#' Fis<-mean(hz$Fis,na.rm = TRUE)
+#' data(ADtable)
+#' AI<-allele.info(ADtable,x.norm=ADnorm,Fis=0.11)
+#' }
 #'
 #' @export
 allele.info<-function(X,x.norm=NULL,Fis,method=c("MedR","QN","pca","TMM","TMMex"),logratioTrim = 0.3,sumTrim = 0.05,Weighting = TRUE,Acutoff = -1e+10,plot.allele.cov=TRUE,verbose = TRUE,parallel=FALSE,...){

diff --git a/R/cpp_calls.R b/R/cpp_calls.R
@@ -0,0 +1,13 @@
+# isquare<-function(x){
+#   .Call('_rCNV_square_cpp',x)
+# }
+#
+#
+# mply<-function(x,multiplier){
+#   .Call('_rCNV_multiply_cpp',x,multiplier)
+# }
+
+# process_maf<-function(X,AD=TRUE){
+#   .Call('process_snp',X,AD)
+# }
+
diff --git a/R/raw_process.R b/R/raw_process.R
@@ -149,7 +149,27 @@ gg<-function(x){
 #' \dontrun{mf<-maf(ADtable)}
 #'
 #' @export
-maf<-function(h.table,AD=TRUE,verbose=TRUE,parallel=FALSE){
+maf <- function(h.table, AD = TRUE, verbose = TRUE, parallel = FALSE) {
+  htab <- h.table[, -c(1:4)]
+  if (parallel) {
+    numCores <- parallel::detectCores() - 1
+    cl <- parallel::makeCluster(numCores)
+    glt <- parallel::parApply(cl = cl, htab, 1, function(X) { process_snp(X, AD) })
+    parallel::stopCluster(cl)
+  } else {
+    if (verbose) {
+      glt <- apply_pb(htab, 1, function(X) { process_snp(X, AD) })
+    } else {
+      glt <- apply(htab, 1, function(X) { process_snp(X, AD) })
+    }
+  }
+  glt <- data.frame(h.table[, 1:4], t(glt))
+  colnames(glt) <- colnames(h.table)
+  return(glt)
+}
+
+
+maf_0<-function(h.table,AD=TRUE,verbose=TRUE,parallel=FALSE){
   htab<-h.table[,-c(1:4)]
   if(parallel){
     numCores<-detectCores()-1
@@ -168,7 +188,7 @@ maf<-function(h.table,AD=TRUE,verbose=TRUE,parallel=FALSE){
         if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)}
       })
     }else{
-      glt<-apply_pb(htab,1,function(X){
+      glt<-apply(htab,1,function(X){
         gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])}))
         if(ncol(gg)>2)gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))]
         if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)}
@@ -253,7 +273,7 @@ hetTgen <- function(vcf, info.type = c("AD", "AD-tot", "GT", "GT-012", "GT-AB",
 
   h.table <- matrix(NA, nrow(xx), ncol(xx))
 
-  process_column <- function(i) {
+  process_column_0 <- function(i) {
     if (info.type == "AD-tot") {
       tmp <- stringr::str_split_fixed(xx[,i], ":", max_adn)[ind]
       tmp <- stringr::str_split_fixed(tmp, ",", 2L)
@@ -276,15 +296,15 @@ hetTgen <- function(vcf, info.type = c("AD", "AD-tot", "GT", "GT-012", "GT-AB",
     clusterExport(cl, varlist = c("xx", "max_adn", "ind", "info.type", "process_column"), envir = environment())
 
     results <- parLapply(cl, seq_len(ncol(xx)), function(i) {
-      res <- process_column(i)
+      res <- process_column_0(i)
       res
     })
     stopCluster(cl)
 
     h.table <- do.call(cbind, results)
   } else {
     for (i in seq_len(ncol(xx))) {
-      h.table[, i] <- process_column(i)
+      h.table[, i] <- process_column_0(i)
       if (verbose) {
         setTxtProgressBar(pb, i)
       }
@@ -320,6 +340,71 @@ hetTgen <- function(vcf, info.type = c("AD", "AD-tot", "GT", "GT-012", "GT-AB",
 }
 
 
+
+hetTgen_cpp <- function(vcf, info.type = c("AD", "AD-tot", "GT", "GT-012", "GT-AB", "DP"), verbose = TRUE, parallel = FALSE) {
+  if (inherits(vcf, "list")) { vcf <- vcf$vcf }
+  if (inherits(vcf, "data.frame")) { vcf <- data.table::data.table(vcf) }
+  if (any(nchar(vcf$ALT) > 1)) {
+    warning("vcf file contains multi-allelic variants: \nonly bi-allelic SNPs allowed\nUse maf() to remove non-bi-allilic snps or drop minimum frequency alleles")
+  }
+
+  info.type <- match.arg(info.type)
+  itype <- substr(info.type, 1, 2)
+
+  adn <- sapply(strsplit(unlist(vcf[,"FORMAT"], use.names = FALSE), ":"), function(x) match(itype, x))
+  max_adn <- max(adn) + 1L
+  ind <- data.frame(cbind(seq_along(adn), adn))
+  xx <- data.frame(vcf[,-c(1:9)])
+
+  if (verbose & parallel==FALSE) {
+    message("generating table")
+  }
+
+  if (parallel) {
+    numCores <- detectCores() - 1
+    cl <- makeCluster(numCores)
+    clusterExport(cl, varlist = c("xx", "max_adn", "ind", "info.type", "process_column"), envir = environment())
+
+    h.table <- parAapply(cl, xx,2,process_column1,info_type = info.type,max_adn = max_adn,ind_adn = ind$adn)
+    stopCluster(cl)
+  } else {
+    if(verbose){
+      h.table <- rCNV:::apply_pb(xx,2,process_column1,info_type = info.type,max_adn = max_adn,ind_adn = ind$adn)
+    } else {
+      h.table <- apply(xx,2,process_column1,info_type = info.type,max_adn = max_adn,ind_adn = ind$adn)
+    }
+  }
+
+  if (info.type == "GT-012") {
+    h.table[h.table == "0/0"] <- 0
+    h.table[h.table == "1/1"] <- 1
+    h.table[h.table == "1/0" | h.table == "0/1"] <- 2
+    h.table[h.table == "./." | h.table == "."] <- NA
+  }
+  if (info.type == "GT-AB") {
+    h.table[h.table == "0/0"] <- "AA"
+    h.table[h.table == "1/1"] <- "BB"
+    h.table[h.table == "1/0" | h.table == "0/1"] <- "AB"
+    h.table[h.table == "./." | h.table == "."] <- -9
+  }
+  if (info.type == "AD") {
+    h.table[h.table == "./." | h.table == "." | is.na(h.table)] <- "0,0"
+  }
+  if (info.type == "DP") {
+    h.table[is.character(h.table)] <- 0
+    h.table[is.na(h.table)] <- 0
+  }
+
+  het.table <- as.data.frame(cbind(vcf[, c(1:3, 5)], h.table))
+  colnames(het.table) <- c("CHROM", colnames(vcf)[c(2, 3, 5, 10:ncol(vcf))])
+  return(het.table)
+}
+
+
+
+
+
+
 #' Get missingness of individuals in raw vcf
 #'
 #' A function to get the percentage of missing data of snps per SNP and per

diff --git a/docs/404.html b/docs/404.html