first commit

veseshan · Jan 25, 2017 · deb0d88 · deb0d88
commit deb0d88
Show file tree

Hide file tree

Showing 11 changed files with 371 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,14 @@
+Package: seqDNAcopy
+Type: Package
+Title: Copy number data analysis using DNA sequencing data
+Version: 0.0.1
+Date: 2017-01-20
+Author: Venkatraman E. Seshan
+Maintainer: Venkatraman E. Seshan <seshanv@mskcc.org>
+Description: Implements the CBS algorithm for sequencing data.
+License: GPL (>= 2)
+biocViews: Microarray, CopyNumberVariation
+Depends: R (>= 2.10), pctGCdata (>= 0.2.0)
+Imports: Rcpp (>= 0.12.0), IRanges, DNAcopy, Rsamtools
+LinkingTo: Rcpp
+NeedsCompilation: yes
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,10 @@
+useDynLib(seqDNAcopy)
+import("stats")
+importFrom("utils", "data")
+importFrom("Rcpp", "evalCpp")
+importFrom("IRanges", "IRanges", "RangesList")
+importFrom("DNAcopy", "CNA", "segment")
+importFrom("pctGCdata", "getGCpct")
+importFrom("Rsamtools", "ScanBamParam", "scanBam", "scanBamFlag")
+#exportPattern("^[[:alpha:]]+")
+export("bams2counts", "seqsegment")
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,6 @@
+# This file was generated by Rcpp::compileAttributes
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+frags2counts <- function(nfmid, tfmid, rstart, rend) {
+    .Call('seqDNAcopy_frags2counts', PACKAGE = 'seqDNAcopy', nfmid, tfmid, rstart, rend)
+}
diff --git a/R/bams2counts.R b/R/bams2counts.R
@@ -0,0 +1,108 @@
+chromRange <- function(i, chr="") {
+  eval(parse(text=paste('RangesList("', chr, i,'"=IRanges(start=0, end=268435456L))',sep="")))
+}
+
+bam2fragments <- function(bamFile, X=FALSE, mapq=20) {
+    # get position, mate position and insert size (fragment length)
+    what <- c("pos","mpos","isize")
+    # get first mate from properly paired reads which pass QC
+    flag=scanBamFlag(isNotPassingQualityControls=FALSE, isPaired=TRUE, isFirstMateRead=TRUE, hasUnmappedMate=FALSE, isDuplicate=FALSE, isSecondaryAlignment=FALSE)
+    bam <- list()
+    # autosomes
+    for(i in 1:22) {
+        which <- chromRange(i)
+        param <- ScanBamParam(flag=flag, which = which, what = what, mapqFilter=mapq)
+        # scan the data
+        bam[[i]] <- scanBam(bamFile, param=param)[[1]]
+    }
+    if (X) {
+        # chromosome X
+        which <- chromRange("X")
+        param <- ScanBamParam(flag=flag, which = which, what = what)
+        # scan the data
+        bam[[23]] <- scanBam(bamFile, param=param)[[1]]
+    }
+    bam
+}
+
+fragments2dataframe <- function(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19") {
+    # normal fragment midpoints
+    nfmid <- lapply(nbam, function(x, ll, ul) {
+        x$isize <- abs(x$isize)
+        sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul])
+    }, iSizeLim[1], iSizeLim[2])
+    # tumor fragment midpoints
+    tfmid <- lapply(tbam, function(x, ll, ul) {
+        x$isize <- abs(x$isize)
+        sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul])
+    }, iSizeLim[1], iSizeLim[2])
+    # bin the mid points into interval of size 100
+    binnedcounts <- list()
+    nchr <- length(nbam)
+    # load blacklisted regions data
+    data(hgBlacklist, package="seqDNAcopy",envir=environment())
+    blgbuild <- paste(gbuild, "bl", sep="")
+    gbl <- get(blgbuild, envir=environment())
+    for(i in 1:nchr) binnedcounts[[i]] <- frags2counts(nfmid[[i]], tfmid[[i]], gbl[[i]][,1], gbl[[i]][,2])
+    # total number of bins with nonzero count
+    nbins <- unlist(lapply(binnedcounts, function(x) {length(x$pos)}))
+    # convert to matrix
+    fcounts <- matrix(0, sum(nbins), 4)
+    colnames(fcounts) <- c("chrom","pos","normal","tumor")
+    fcounts[,1] <- rep(1:nchr, nbins)
+    for(i in 1:nchr) {
+        ii <- fcounts[,1]==i
+        fcounts[ii,2] <- binnedcounts[[i]]$pos
+        fcounts[ii,3] <- binnedcounts[[i]]$normal
+        fcounts[ii,4] <- binnedcounts[[i]]$tumor
+    }
+    # return data
+    as.data.frame(fcounts)
+}
+
+bams2counts <- function(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20, iSizeLim=c(75,750), X=FALSE) {
+    # normal bam read data ("pos","mpos","isize")
+    nbam <- bam2fragments(nBamFile, X, mapq)
+    # tumor bam read data ("pos","mpos","isize")
+    tbam <- bam2fragments(tBamFile, X, mapq)
+    # get the fragment count in bins centered every 100 bases
+    out <- fragments2dataframe(nbam, tbam, iSizeLim, gbuild)
+    if (GCcorrect) {
+        # gc percentage
+        gcpct <- rep(NA_real_, nrow(out))
+        # get GC percentages from pctGCdata package
+        # loop thru chromosomes
+        nchr <- max(out$chrom) # IMPACT doesn't have X so only 22
+        for (i in 1:nchr) {
+            ii <- which(out$chrom==i)
+            # allow for chromosomes with no SNPs i.e. not targeted
+            if (length(ii) > 0) {
+                gcpct[ii] <- getGCpct(i, out$pos[ii], gbuild)
+            }
+        }
+        # GC correction
+        tscl <- sum(out$normal)/sum(out$tumor)
+        # normal and tumor fragment counts by GC percent level
+        ncount <- tapply(out$normal, gcpct, sum)
+        tcount <- tapply(out$tumor, gcpct, sum)
+        # log-ratio of tumor to normal counts as function of GC percentages
+        # standardized by the ratio of library sizes
+        logtn <- log2(tcount*tscl) - log2(ncount)
+        # percent GC
+        pctgc <- as.numeric(names(logtn))
+        # weights for each GC percent level
+        wts <- log2(tcount+ncount)
+        # bins that have non-zero count for both tumor and normal
+        ii <- which(is.finite(logtn))
+        # gc bias estimated using loess
+        gcb <- loess(logtn[ii] ~ pctgc[ii], weights=wts[ii], span=0.25)
+        # GC corrected library scaled tumor counts
+        jj <- match(gcpct, gcb$x)
+        gcscl <- 2^{-gcb$fitted[jj]}
+        # gcscl is NA if a bin has NA for gcpct; make it 1
+        gcscl[is.na(gcscl)] <- 1
+        # scale tumor counts by gcscl
+        out$tumor <- gcscl*out$tumor
+    }
+    out
+}
diff --git a/R/seqsegment.R b/R/seqsegment.R
@@ -0,0 +1,27 @@
+seqsegment <-
+function(fcounts, sampleid="SeqSample", minBinCount=15, binSize=1000) {
+    # collapse the 100 base bins according to binsize
+    fcounts$bins <- fcounts$chrom + floor(fcounts$pos/binSize)*binSize/2^28
+    # data corresponding to the new bins
+    zz0 <- list()
+    zz0$chrom <- tapply(fcounts$chrom, fcounts$bins, function(x){x[1]})
+    zz0$pos <- tapply(fcounts$pos, fcounts$bins, function(x){x[1]})
+    zz0$pos <- binSize*(floor(zz0$pos/binSize) + 0.5)/1e6
+    zz0$normal <- tapply(fcounts$normal, fcounts$bins, sum)
+    zz0$tumor <- tapply(fcounts$tumor, fcounts$bins, sum)
+    zz0 <- as.data.frame(zz0)
+    # counts to segments
+    zz <- zz0[zz0[,3]>minBinCount,]
+    # scale the read counts
+    tscl <- sum(zz[,"normal"])/sum(zz[,"tumor"])
+    zchr <- zz[,"chrom"]
+    zpos <- zz[,"pos"]
+    # log-ratio
+    zlr <- log2(zz[,"tumor"]*tscl+1) - log2(zz[,"normal"]+1)
+    # weights
+    zwts <- log2(zz[,"normal"]+1-minBinCount)
+
+    zcna <- CNA(as.matrix(zlr), zchr, zpos, sampleid=sampleid)
+    zout <- segment(zcna, weights=zwts)
+    zout
+}
diff --git a/data/hgBlacklist.rda b/data/hgBlacklist.rda
diff --git a/man/hgBlacklist.Rd b/man/hgBlacklist.Rd
@@ -0,0 +1,31 @@
+\name{hgBlacklist}
+\alias{hgBlacklist}
+\alias{hg18bl}
+\alias{hg19bl}
+\alias{hg38bl}
+\title{Blacklisted regions for human genome}
+\description{
+  Regions in the genome that give sequencing artifacts as listed in
+  UCSC Genome browser track wgEncodeDacMapabilityConsensusExcludable
+}
+\usage{
+  data(hgBlacklist)
+}
+\format{
+  A list containing 25 matrices corresponding to chromosomes 1-22, X, Y &
+  MT in that order. Each row is an interval that is to be discarded.
+}
+\details{
+  The intervals for hg19 were obtained from UCSC Genome browser track
+  \url{https://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeMapability}
+  wgEncodeDacMapabilityConsensusExcludable. The intervals were padded
+  by 100 bases on both ends and an extra row (2^28, 2^28) added to each
+  chromosome for programming convenience. The hg18 and hg38 data were
+  obtained using liftover \url{https://genome.ucsc.edu/cgi-bin/hgLiftOver}.
+
+  A fragment whose mid-point lies in an interval is discarded from analysis.
+}
+\references{
+  \url{https://sites.google.com/site/anshulkundaje/projects/blacklists}
+}
+\keyword{datasets}
diff --git a/man/seqDNAcopy-internal.Rd b/man/seqDNAcopy-internal.Rd
@@ -0,0 +1,22 @@
+\name{seqDNAcopy-internal}
+\alias{bam2fragments}
+\alias{bams2counts}
+\alias{chromRange}
+\alias{fragments2counts}
+\alias{fragments2dataframe}
+\alias{seqsegment}
+\title{Internal seqDNAcopy functions}
+\description{
+  Internal functions of package seqDNAcopy.
+}
+\usage{
+chromRange(i, chr="")
+bam2fragments(bamFile, X=FALSE, mapq=20)
+frags2counts(nfmid, tfmid, rstart, rend)
+fragments2dataframe(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19")
+bams2counts(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20,
+              iSizeLim=c(75,750), X=FALSE)
+seqsegment(fcounts, sampleid="SeqSample", minBinCount=15, binSize=1000)
+}
+\details{These are not to be called directly by the user}
+\keyword{internal}
diff --git a/man/seqDNAcopy-package.Rd b/man/seqDNAcopy-package.Rd
@@ -0,0 +1,25 @@
+\name{seqDNAcopy-package}
+\alias{seqDNAcopy-package}
+\alias{seqDNAcopy}
+\docType{package}
+\title{
+  \packageTitle{seqDNAcopy}
+}
+\description{
+  \packageDescription{seqDNAcopy}
+}
+\details{
+  seqDNAcopy implements the CBS algorithm for sequencing data. It uses
+  coverage depth for tumor and normal data to compute log-ratio which is
+  first normalized for GC content and then segmented using segment
+  function from DNAcopy.
+}
+\author{
+\packageAuthor{seqDNAcopy}
+
+Maintainer: \packageMaintainer{seqDNAcopy}
+}
+\references{
+  Need to be added
+}
+\keyword{ package }
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -0,0 +1,21 @@
+// This file was generated by Rcpp::compileAttributes
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// frags2counts
+List frags2counts(NumericVector nfmid, NumericVector tfmid, NumericVector rstart, NumericVector rend);
+RcppExport SEXP seqDNAcopy_frags2counts(SEXP nfmidSEXP, SEXP tfmidSEXP, SEXP rstartSEXP, SEXP rendSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject __result;
+    Rcpp::RNGScope __rngScope;
+    Rcpp::traits::input_parameter< NumericVector >::type nfmid(nfmidSEXP);
+    Rcpp::traits::input_parameter< NumericVector >::type tfmid(tfmidSEXP);
+    Rcpp::traits::input_parameter< NumericVector >::type rstart(rstartSEXP);
+    Rcpp::traits::input_parameter< NumericVector >::type rend(rendSEXP);
+    __result = Rcpp::wrap(frags2counts(nfmid, tfmid, rstart, rend));
+    return __result;
+END_RCPP
+}
diff --git a/src/frags2counts.cpp b/src/frags2counts.cpp
@@ -0,0 +1,107 @@
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// [[Rcpp::export]]
+List frags2counts(NumericVector nfmid, NumericVector tfmid, NumericVector rstart, NumericVector rend) {
+
+  // normal fragments that are discarded
+  int n = nfmid.size();
+  NumericVector ndiscard(n);
+
+  int i = 0;                           // first blacklisted region
+  for(int j=0; j < n; ++j) {           // loop through fragments
+    if (nfmid[j] < rstart[i]) {        // if fragmid before region start
+      ndiscard[j] = 0;                 // keep fragment
+    } else {                           // else check fragmid
+      if (nfmid[j] <= rend[i]) {       // if fragmid within region
+	ndiscard[j] = 1;               // discard fragment
+      } else {                         // find the next region at or past it
+	while (nfmid[j] > rend[i]) {   // while fragmid exceeds region end
+	  i = i+1;                     // go to next region
+	}                              // region change complete
+	if (nfmid[j] < rstart[i]) {    // if fragmid before region start
+	  ndiscard[j] = 0;             // keep fragment
+	} else {                       // else discard it
+	  ndiscard[j] = 1;             // since we know nfmid[j] <= rend[i]
+	}
+      }
+    }
+  }
+
+  // tumor fragments that are discarded
+  n = tfmid.size();
+  NumericVector tdiscard(n);
+
+  i = 0;                               // first blacklisted region
+  for(int j=0; j < n; ++j) {           // loop through fragments
+    if (tfmid[j] < rstart[i]) {        // if fragmid before region start
+      tdiscard[j] = 0;                 // keep fragment
+    } else {                           // else check fragmid
+      if (tfmid[j] <= rend[i]) {       // if fragmid within region
+	tdiscard[j] = 1;               // discard fragment
+      } else {                         // find the next region at or past it
+	while (tfmid[j] > rend[i]) {   // while fragmid exceeds region end
+	  i = i+1;                     // go to next region
+	}                              // region change complete
+	if (tfmid[j] < rstart[i]) {    // if fragmid before region start
+	  tdiscard[j] = 0;             // keep fragment
+	} else {                       // else discard it
+	  tdiscard[j] = 1;             // since we know tfmid[j] <= rend[i]
+	}
+      }
+    }
+  }
+
+  // chromosome < 2^28 bases; so at most 2^28/100 = 2684355 bins
+  // use the valid fragments to create a count matrix
+
+  int ll;
+  NumericVector ncount(2684355);
+  NumericVector tcount(2684355);
+  NumericVector keep(2684355);      // bins with non-zero counts
+
+  for(int j=0; j < 2684355; ++j) {
+    ncount[j] = 0;                  // initialize normal count to 0
+    tcount[j] = 0;                  // initialize tumor count to 0
+    keep[j] = 0;                    // initialize keep to 0
+  }
+
+  n = nfmid.size();
+  for(int j=0; j < n; ++j) {        // loop through normal fragments
+    if (ndiscard[j] == 0) {
+      ll = floor(nfmid[j]/100);
+      ncount[ll] += 1;
+      keep[ll] = 1;
+    }
+  }
+
+  n = tfmid.size();
+  for(int j=0; j < n; ++j) {        // loop through tumor fragments
+    if (tdiscard[j] == 0) {
+      ll = floor(tfmid[j]/100);
+      tcount[ll] += 1;
+      keep[ll] = 1;
+    }
+  }
+
+  int nnzbins = 0;                      // non-zero bin counter
+  for(int j=0; j < 2684355; ++j) {
+    nnzbins += keep[j];
+  }
+
+  NumericVector pos(nnzbins);
+  NumericVector normal(nnzbins);
+  NumericVector tumor(nnzbins);
+  i = 0;
+  for(int j=0; j < 2684355; ++j) {
+    if (keep[j] == 1) {
+      pos[i] = 50 + 100*j;
+      normal[i] = ncount[j];
+      tumor[i] = tcount[j];
+      i += 1;
+    }
+  }
+
+  return List::create(_["pos"] = pos, _["normal"] = normal, _["tumor"] = tumor);
+}