Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
veseshan committed Jan 25, 2017
0 parents commit deb0d88
Show file tree
Hide file tree
Showing 11 changed files with 371 additions and 0 deletions.
14 changes: 14 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Package: seqDNAcopy
Type: Package
Title: Copy number data analysis using DNA sequencing data
Version: 0.0.1
Date: 2017-01-20
Author: Venkatraman E. Seshan
Maintainer: Venkatraman E. Seshan <seshanv@mskcc.org>
Description: Implements the CBS algorithm for sequencing data.
License: GPL (>= 2)
biocViews: Microarray, CopyNumberVariation
Depends: R (>= 2.10), pctGCdata (>= 0.2.0)
Imports: Rcpp (>= 0.12.0), IRanges, DNAcopy, Rsamtools
LinkingTo: Rcpp
NeedsCompilation: yes
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
useDynLib(seqDNAcopy)
import("stats")
importFrom("utils", "data")
importFrom("Rcpp", "evalCpp")
importFrom("IRanges", "IRanges", "RangesList")
importFrom("DNAcopy", "CNA", "segment")
importFrom("pctGCdata", "getGCpct")
importFrom("Rsamtools", "ScanBamParam", "scanBam", "scanBamFlag")
#exportPattern("^[[:alpha:]]+")
export("bams2counts", "seqsegment")
6 changes: 6 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

frags2counts <- function(nfmid, tfmid, rstart, rend) {
.Call('seqDNAcopy_frags2counts', PACKAGE = 'seqDNAcopy', nfmid, tfmid, rstart, rend)
}
108 changes: 108 additions & 0 deletions R/bams2counts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
chromRange <- function(i, chr="") {
eval(parse(text=paste('RangesList("', chr, i,'"=IRanges(start=0, end=268435456L))',sep="")))
}

bam2fragments <- function(bamFile, X=FALSE, mapq=20) {
# get position, mate position and insert size (fragment length)
what <- c("pos","mpos","isize")
# get first mate from properly paired reads which pass QC
flag=scanBamFlag(isNotPassingQualityControls=FALSE, isPaired=TRUE, isFirstMateRead=TRUE, hasUnmappedMate=FALSE, isDuplicate=FALSE, isSecondaryAlignment=FALSE)
bam <- list()
# autosomes
for(i in 1:22) {
which <- chromRange(i)
param <- ScanBamParam(flag=flag, which = which, what = what, mapqFilter=mapq)
# scan the data
bam[[i]] <- scanBam(bamFile, param=param)[[1]]
}
if (X) {
# chromosome X
which <- chromRange("X")
param <- ScanBamParam(flag=flag, which = which, what = what)
# scan the data
bam[[23]] <- scanBam(bamFile, param=param)[[1]]
}
bam
}

fragments2dataframe <- function(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19") {
# normal fragment midpoints
nfmid <- lapply(nbam, function(x, ll, ul) {
x$isize <- abs(x$isize)
sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul])
}, iSizeLim[1], iSizeLim[2])
# tumor fragment midpoints
tfmid <- lapply(tbam, function(x, ll, ul) {
x$isize <- abs(x$isize)
sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul])
}, iSizeLim[1], iSizeLim[2])
# bin the mid points into interval of size 100
binnedcounts <- list()
nchr <- length(nbam)
# load blacklisted regions data
data(hgBlacklist, package="seqDNAcopy",envir=environment())
blgbuild <- paste(gbuild, "bl", sep="")
gbl <- get(blgbuild, envir=environment())
for(i in 1:nchr) binnedcounts[[i]] <- frags2counts(nfmid[[i]], tfmid[[i]], gbl[[i]][,1], gbl[[i]][,2])
# total number of bins with nonzero count
nbins <- unlist(lapply(binnedcounts, function(x) {length(x$pos)}))
# convert to matrix
fcounts <- matrix(0, sum(nbins), 4)
colnames(fcounts) <- c("chrom","pos","normal","tumor")
fcounts[,1] <- rep(1:nchr, nbins)
for(i in 1:nchr) {
ii <- fcounts[,1]==i
fcounts[ii,2] <- binnedcounts[[i]]$pos
fcounts[ii,3] <- binnedcounts[[i]]$normal
fcounts[ii,4] <- binnedcounts[[i]]$tumor
}
# return data
as.data.frame(fcounts)
}

bams2counts <- function(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20, iSizeLim=c(75,750), X=FALSE) {
# normal bam read data ("pos","mpos","isize")
nbam <- bam2fragments(nBamFile, X, mapq)
# tumor bam read data ("pos","mpos","isize")
tbam <- bam2fragments(tBamFile, X, mapq)
# get the fragment count in bins centered every 100 bases
out <- fragments2dataframe(nbam, tbam, iSizeLim, gbuild)
if (GCcorrect) {
# gc percentage
gcpct <- rep(NA_real_, nrow(out))
# get GC percentages from pctGCdata package
# loop thru chromosomes
nchr <- max(out$chrom) # IMPACT doesn't have X so only 22
for (i in 1:nchr) {
ii <- which(out$chrom==i)
# allow for chromosomes with no SNPs i.e. not targeted
if (length(ii) > 0) {
gcpct[ii] <- getGCpct(i, out$pos[ii], gbuild)
}
}
# GC correction
tscl <- sum(out$normal)/sum(out$tumor)
# normal and tumor fragment counts by GC percent level
ncount <- tapply(out$normal, gcpct, sum)
tcount <- tapply(out$tumor, gcpct, sum)
# log-ratio of tumor to normal counts as function of GC percentages
# standardized by the ratio of library sizes
logtn <- log2(tcount*tscl) - log2(ncount)
# percent GC
pctgc <- as.numeric(names(logtn))
# weights for each GC percent level
wts <- log2(tcount+ncount)
# bins that have non-zero count for both tumor and normal
ii <- which(is.finite(logtn))
# gc bias estimated using loess
gcb <- loess(logtn[ii] ~ pctgc[ii], weights=wts[ii], span=0.25)
# GC corrected library scaled tumor counts
jj <- match(gcpct, gcb$x)
gcscl <- 2^{-gcb$fitted[jj]}
# gcscl is NA if a bin has NA for gcpct; make it 1
gcscl[is.na(gcscl)] <- 1
# scale tumor counts by gcscl
out$tumor <- gcscl*out$tumor
}
out
}
27 changes: 27 additions & 0 deletions R/seqsegment.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
seqsegment <-
function(fcounts, sampleid="SeqSample", minBinCount=15, binSize=1000) {
# collapse the 100 base bins according to binsize
fcounts$bins <- fcounts$chrom + floor(fcounts$pos/binSize)*binSize/2^28
# data corresponding to the new bins
zz0 <- list()
zz0$chrom <- tapply(fcounts$chrom, fcounts$bins, function(x){x[1]})
zz0$pos <- tapply(fcounts$pos, fcounts$bins, function(x){x[1]})
zz0$pos <- binSize*(floor(zz0$pos/binSize) + 0.5)/1e6
zz0$normal <- tapply(fcounts$normal, fcounts$bins, sum)
zz0$tumor <- tapply(fcounts$tumor, fcounts$bins, sum)
zz0 <- as.data.frame(zz0)
# counts to segments
zz <- zz0[zz0[,3]>minBinCount,]
# scale the read counts
tscl <- sum(zz[,"normal"])/sum(zz[,"tumor"])
zchr <- zz[,"chrom"]
zpos <- zz[,"pos"]
# log-ratio
zlr <- log2(zz[,"tumor"]*tscl+1) - log2(zz[,"normal"]+1)
# weights
zwts <- log2(zz[,"normal"]+1-minBinCount)

zcna <- CNA(as.matrix(zlr), zchr, zpos, sampleid=sampleid)
zout <- segment(zcna, weights=zwts)
zout
}
Binary file added data/hgBlacklist.rda
Binary file not shown.
31 changes: 31 additions & 0 deletions man/hgBlacklist.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
\name{hgBlacklist}
\alias{hgBlacklist}
\alias{hg18bl}
\alias{hg19bl}
\alias{hg38bl}
\title{Blacklisted regions for human genome}
\description{
Regions in the genome that give sequencing artifacts as listed in
UCSC Genome browser track wgEncodeDacMapabilityConsensusExcludable
}
\usage{
data(hgBlacklist)
}
\format{
A list containing 25 matrices corresponding to chromosomes 1-22, X, Y &
MT in that order. Each row is an interval that is to be discarded.
}
\details{
The intervals for hg19 were obtained from UCSC Genome browser track
\url{https://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeMapability}
wgEncodeDacMapabilityConsensusExcludable. The intervals were padded
by 100 bases on both ends and an extra row (2^28, 2^28) added to each
chromosome for programming convenience. The hg18 and hg38 data were
obtained using liftover \url{https://genome.ucsc.edu/cgi-bin/hgLiftOver}.

A fragment whose mid-point lies in an interval is discarded from analysis.
}
\references{
\url{https://sites.google.com/site/anshulkundaje/projects/blacklists}
}
\keyword{datasets}
22 changes: 22 additions & 0 deletions man/seqDNAcopy-internal.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
\name{seqDNAcopy-internal}
\alias{bam2fragments}
\alias{bams2counts}
\alias{chromRange}
\alias{fragments2counts}
\alias{fragments2dataframe}
\alias{seqsegment}
\title{Internal seqDNAcopy functions}
\description{
Internal functions of package seqDNAcopy.
}
\usage{
chromRange(i, chr="")
bam2fragments(bamFile, X=FALSE, mapq=20)
frags2counts(nfmid, tfmid, rstart, rend)
fragments2dataframe(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19")
bams2counts(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20,
iSizeLim=c(75,750), X=FALSE)
seqsegment(fcounts, sampleid="SeqSample", minBinCount=15, binSize=1000)
}
\details{These are not to be called directly by the user}
\keyword{internal}
25 changes: 25 additions & 0 deletions man/seqDNAcopy-package.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
\name{seqDNAcopy-package}
\alias{seqDNAcopy-package}
\alias{seqDNAcopy}
\docType{package}
\title{
\packageTitle{seqDNAcopy}
}
\description{
\packageDescription{seqDNAcopy}
}
\details{
seqDNAcopy implements the CBS algorithm for sequencing data. It uses
coverage depth for tumor and normal data to compute log-ratio which is
first normalized for GC content and then segmented using segment
function from DNAcopy.
}
\author{
\packageAuthor{seqDNAcopy}

Maintainer: \packageMaintainer{seqDNAcopy}
}
\references{
Need to be added
}
\keyword{ package }
21 changes: 21 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// This file was generated by Rcpp::compileAttributes
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

// frags2counts
List frags2counts(NumericVector nfmid, NumericVector tfmid, NumericVector rstart, NumericVector rend);
RcppExport SEXP seqDNAcopy_frags2counts(SEXP nfmidSEXP, SEXP tfmidSEXP, SEXP rstartSEXP, SEXP rendSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< NumericVector >::type nfmid(nfmidSEXP);
Rcpp::traits::input_parameter< NumericVector >::type tfmid(tfmidSEXP);
Rcpp::traits::input_parameter< NumericVector >::type rstart(rstartSEXP);
Rcpp::traits::input_parameter< NumericVector >::type rend(rendSEXP);
__result = Rcpp::wrap(frags2counts(nfmid, tfmid, rstart, rend));
return __result;
END_RCPP
}
107 changes: 107 additions & 0 deletions src/frags2counts.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#include <Rcpp.h>

using namespace Rcpp;

// [[Rcpp::export]]
List frags2counts(NumericVector nfmid, NumericVector tfmid, NumericVector rstart, NumericVector rend) {

// normal fragments that are discarded
int n = nfmid.size();
NumericVector ndiscard(n);

int i = 0; // first blacklisted region
for(int j=0; j < n; ++j) { // loop through fragments
if (nfmid[j] < rstart[i]) { // if fragmid before region start
ndiscard[j] = 0; // keep fragment
} else { // else check fragmid
if (nfmid[j] <= rend[i]) { // if fragmid within region
ndiscard[j] = 1; // discard fragment
} else { // find the next region at or past it
while (nfmid[j] > rend[i]) { // while fragmid exceeds region end
i = i+1; // go to next region
} // region change complete
if (nfmid[j] < rstart[i]) { // if fragmid before region start
ndiscard[j] = 0; // keep fragment
} else { // else discard it
ndiscard[j] = 1; // since we know nfmid[j] <= rend[i]
}
}
}
}

// tumor fragments that are discarded
n = tfmid.size();
NumericVector tdiscard(n);

i = 0; // first blacklisted region
for(int j=0; j < n; ++j) { // loop through fragments
if (tfmid[j] < rstart[i]) { // if fragmid before region start
tdiscard[j] = 0; // keep fragment
} else { // else check fragmid
if (tfmid[j] <= rend[i]) { // if fragmid within region
tdiscard[j] = 1; // discard fragment
} else { // find the next region at or past it
while (tfmid[j] > rend[i]) { // while fragmid exceeds region end
i = i+1; // go to next region
} // region change complete
if (tfmid[j] < rstart[i]) { // if fragmid before region start
tdiscard[j] = 0; // keep fragment
} else { // else discard it
tdiscard[j] = 1; // since we know tfmid[j] <= rend[i]
}
}
}
}

// chromosome < 2^28 bases; so at most 2^28/100 = 2684355 bins
// use the valid fragments to create a count matrix

int ll;
NumericVector ncount(2684355);
NumericVector tcount(2684355);
NumericVector keep(2684355); // bins with non-zero counts

for(int j=0; j < 2684355; ++j) {
ncount[j] = 0; // initialize normal count to 0
tcount[j] = 0; // initialize tumor count to 0
keep[j] = 0; // initialize keep to 0
}

n = nfmid.size();
for(int j=0; j < n; ++j) { // loop through normal fragments
if (ndiscard[j] == 0) {
ll = floor(nfmid[j]/100);
ncount[ll] += 1;
keep[ll] = 1;
}
}

n = tfmid.size();
for(int j=0; j < n; ++j) { // loop through tumor fragments
if (tdiscard[j] == 0) {
ll = floor(tfmid[j]/100);
tcount[ll] += 1;
keep[ll] = 1;
}
}

int nnzbins = 0; // non-zero bin counter
for(int j=0; j < 2684355; ++j) {
nnzbins += keep[j];
}

NumericVector pos(nnzbins);
NumericVector normal(nnzbins);
NumericVector tumor(nnzbins);
i = 0;
for(int j=0; j < 2684355; ++j) {
if (keep[j] == 1) {
pos[i] = 50 + 100*j;
normal[i] = ncount[j];
tumor[i] = tcount[j];
i += 1;
}
}

return List::create(_["pos"] = pos, _["normal"] = normal, _["tumor"] = tumor);
}

0 comments on commit deb0d88

Please sign in to comment.