-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit deb0d88
Showing
11 changed files
with
371 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
Package: seqDNAcopy | ||
Type: Package | ||
Title: Copy number data analysis using DNA sequencing data | ||
Version: 0.0.1 | ||
Date: 2017-01-20 | ||
Author: Venkatraman E. Seshan | ||
Maintainer: Venkatraman E. Seshan <seshanv@mskcc.org> | ||
Description: Implements the CBS algorithm for sequencing data. | ||
License: GPL (>= 2) | ||
biocViews: Microarray, CopyNumberVariation | ||
Depends: R (>= 2.10), pctGCdata (>= 0.2.0) | ||
Imports: Rcpp (>= 0.12.0), IRanges, DNAcopy, Rsamtools | ||
LinkingTo: Rcpp | ||
NeedsCompilation: yes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
useDynLib(seqDNAcopy) | ||
import("stats") | ||
importFrom("utils", "data") | ||
importFrom("Rcpp", "evalCpp") | ||
importFrom("IRanges", "IRanges", "RangesList") | ||
importFrom("DNAcopy", "CNA", "segment") | ||
importFrom("pctGCdata", "getGCpct") | ||
importFrom("Rsamtools", "ScanBamParam", "scanBam", "scanBamFlag") | ||
#exportPattern("^[[:alpha:]]+") | ||
export("bams2counts", "seqsegment") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# This file was generated by Rcpp::compileAttributes | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
frags2counts <- function(nfmid, tfmid, rstart, rend) { | ||
.Call('seqDNAcopy_frags2counts', PACKAGE = 'seqDNAcopy', nfmid, tfmid, rstart, rend) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
chromRange <- function(i, chr="") { | ||
eval(parse(text=paste('RangesList("', chr, i,'"=IRanges(start=0, end=268435456L))',sep=""))) | ||
} | ||
|
||
bam2fragments <- function(bamFile, X=FALSE, mapq=20) { | ||
# get position, mate position and insert size (fragment length) | ||
what <- c("pos","mpos","isize") | ||
# get first mate from properly paired reads which pass QC | ||
flag=scanBamFlag(isNotPassingQualityControls=FALSE, isPaired=TRUE, isFirstMateRead=TRUE, hasUnmappedMate=FALSE, isDuplicate=FALSE, isSecondaryAlignment=FALSE) | ||
bam <- list() | ||
# autosomes | ||
for(i in 1:22) { | ||
which <- chromRange(i) | ||
param <- ScanBamParam(flag=flag, which = which, what = what, mapqFilter=mapq) | ||
# scan the data | ||
bam[[i]] <- scanBam(bamFile, param=param)[[1]] | ||
} | ||
if (X) { | ||
# chromosome X | ||
which <- chromRange("X") | ||
param <- ScanBamParam(flag=flag, which = which, what = what) | ||
# scan the data | ||
bam[[23]] <- scanBam(bamFile, param=param)[[1]] | ||
} | ||
bam | ||
} | ||
|
||
fragments2dataframe <- function(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19") { | ||
# normal fragment midpoints | ||
nfmid <- lapply(nbam, function(x, ll, ul) { | ||
x$isize <- abs(x$isize) | ||
sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul]) | ||
}, iSizeLim[1], iSizeLim[2]) | ||
# tumor fragment midpoints | ||
tfmid <- lapply(tbam, function(x, ll, ul) { | ||
x$isize <- abs(x$isize) | ||
sort((pmin(x$pos, x$mpos) + x$isize/2)[x$isize >= ll & x$isize <= ul]) | ||
}, iSizeLim[1], iSizeLim[2]) | ||
# bin the mid points into interval of size 100 | ||
binnedcounts <- list() | ||
nchr <- length(nbam) | ||
# load blacklisted regions data | ||
data(hgBlacklist, package="seqDNAcopy",envir=environment()) | ||
blgbuild <- paste(gbuild, "bl", sep="") | ||
gbl <- get(blgbuild, envir=environment()) | ||
for(i in 1:nchr) binnedcounts[[i]] <- frags2counts(nfmid[[i]], tfmid[[i]], gbl[[i]][,1], gbl[[i]][,2]) | ||
# total number of bins with nonzero count | ||
nbins <- unlist(lapply(binnedcounts, function(x) {length(x$pos)})) | ||
# convert to matrix | ||
fcounts <- matrix(0, sum(nbins), 4) | ||
colnames(fcounts) <- c("chrom","pos","normal","tumor") | ||
fcounts[,1] <- rep(1:nchr, nbins) | ||
for(i in 1:nchr) { | ||
ii <- fcounts[,1]==i | ||
fcounts[ii,2] <- binnedcounts[[i]]$pos | ||
fcounts[ii,3] <- binnedcounts[[i]]$normal | ||
fcounts[ii,4] <- binnedcounts[[i]]$tumor | ||
} | ||
# return data | ||
as.data.frame(fcounts) | ||
} | ||
|
||
bams2counts <- function(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20, iSizeLim=c(75,750), X=FALSE) { | ||
# normal bam read data ("pos","mpos","isize") | ||
nbam <- bam2fragments(nBamFile, X, mapq) | ||
# tumor bam read data ("pos","mpos","isize") | ||
tbam <- bam2fragments(tBamFile, X, mapq) | ||
# get the fragment count in bins centered every 100 bases | ||
out <- fragments2dataframe(nbam, tbam, iSizeLim, gbuild) | ||
if (GCcorrect) { | ||
# gc percentage | ||
gcpct <- rep(NA_real_, nrow(out)) | ||
# get GC percentages from pctGCdata package | ||
# loop thru chromosomes | ||
nchr <- max(out$chrom) # IMPACT doesn't have X so only 22 | ||
for (i in 1:nchr) { | ||
ii <- which(out$chrom==i) | ||
# allow for chromosomes with no SNPs i.e. not targeted | ||
if (length(ii) > 0) { | ||
gcpct[ii] <- getGCpct(i, out$pos[ii], gbuild) | ||
} | ||
} | ||
# GC correction | ||
tscl <- sum(out$normal)/sum(out$tumor) | ||
# normal and tumor fragment counts by GC percent level | ||
ncount <- tapply(out$normal, gcpct, sum) | ||
tcount <- tapply(out$tumor, gcpct, sum) | ||
# log-ratio of tumor to normal counts as function of GC percentages | ||
# standardized by the ratio of library sizes | ||
logtn <- log2(tcount*tscl) - log2(ncount) | ||
# percent GC | ||
pctgc <- as.numeric(names(logtn)) | ||
# weights for each GC percent level | ||
wts <- log2(tcount+ncount) | ||
# bins that have non-zero count for both tumor and normal | ||
ii <- which(is.finite(logtn)) | ||
# gc bias estimated using loess | ||
gcb <- loess(logtn[ii] ~ pctgc[ii], weights=wts[ii], span=0.25) | ||
# GC corrected library scaled tumor counts | ||
jj <- match(gcpct, gcb$x) | ||
gcscl <- 2^{-gcb$fitted[jj]} | ||
# gcscl is NA if a bin has NA for gcpct; make it 1 | ||
gcscl[is.na(gcscl)] <- 1 | ||
# scale tumor counts by gcscl | ||
out$tumor <- gcscl*out$tumor | ||
} | ||
out | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
seqsegment <- | ||
function(fcounts, sampleid="SeqSample", minBinCount=15, binSize=1000) { | ||
# collapse the 100 base bins according to binsize | ||
fcounts$bins <- fcounts$chrom + floor(fcounts$pos/binSize)*binSize/2^28 | ||
# data corresponding to the new bins | ||
zz0 <- list() | ||
zz0$chrom <- tapply(fcounts$chrom, fcounts$bins, function(x){x[1]}) | ||
zz0$pos <- tapply(fcounts$pos, fcounts$bins, function(x){x[1]}) | ||
zz0$pos <- binSize*(floor(zz0$pos/binSize) + 0.5)/1e6 | ||
zz0$normal <- tapply(fcounts$normal, fcounts$bins, sum) | ||
zz0$tumor <- tapply(fcounts$tumor, fcounts$bins, sum) | ||
zz0 <- as.data.frame(zz0) | ||
# counts to segments | ||
zz <- zz0[zz0[,3]>minBinCount,] | ||
# scale the read counts | ||
tscl <- sum(zz[,"normal"])/sum(zz[,"tumor"]) | ||
zchr <- zz[,"chrom"] | ||
zpos <- zz[,"pos"] | ||
# log-ratio | ||
zlr <- log2(zz[,"tumor"]*tscl+1) - log2(zz[,"normal"]+1) | ||
# weights | ||
zwts <- log2(zz[,"normal"]+1-minBinCount) | ||
|
||
zcna <- CNA(as.matrix(zlr), zchr, zpos, sampleid=sampleid) | ||
zout <- segment(zcna, weights=zwts) | ||
zout | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
\name{hgBlacklist} | ||
\alias{hgBlacklist} | ||
\alias{hg18bl} | ||
\alias{hg19bl} | ||
\alias{hg38bl} | ||
\title{Blacklisted regions for human genome} | ||
\description{ | ||
Regions in the genome that give sequencing artifacts as listed in | ||
UCSC Genome browser track wgEncodeDacMapabilityConsensusExcludable | ||
} | ||
\usage{ | ||
data(hgBlacklist) | ||
} | ||
\format{ | ||
A list containing 25 matrices corresponding to chromosomes 1-22, X, Y & | ||
MT in that order. Each row is an interval that is to be discarded. | ||
} | ||
\details{ | ||
The intervals for hg19 were obtained from UCSC Genome browser track | ||
\url{https://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeMapability} | ||
wgEncodeDacMapabilityConsensusExcludable. The intervals were padded | ||
by 100 bases on both ends and an extra row (2^28, 2^28) added to each | ||
chromosome for programming convenience. The hg18 and hg38 data were | ||
obtained using liftover \url{https://genome.ucsc.edu/cgi-bin/hgLiftOver}. | ||
|
||
A fragment whose mid-point lies in an interval is discarded from analysis. | ||
} | ||
\references{ | ||
\url{https://sites.google.com/site/anshulkundaje/projects/blacklists} | ||
} | ||
\keyword{datasets} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
\name{seqDNAcopy-internal} | ||
\alias{bam2fragments} | ||
\alias{bams2counts} | ||
\alias{chromRange} | ||
\alias{fragments2counts} | ||
\alias{fragments2dataframe} | ||
\alias{seqsegment} | ||
\title{Internal seqDNAcopy functions} | ||
\description{ | ||
Internal functions of package seqDNAcopy. | ||
} | ||
\usage{ | ||
chromRange(i, chr="") | ||
bam2fragments(bamFile, X=FALSE, mapq=20) | ||
frags2counts(nfmid, tfmid, rstart, rend) | ||
fragments2dataframe(nbam, tbam, iSizeLim=c(75,750), gbuild="hg19") | ||
bams2counts(nBamFile, tBamFile, GCcorrect=TRUE, gbuild="hg19", mapq=20, | ||
iSizeLim=c(75,750), X=FALSE) | ||
seqsegment(fcounts, sampleid="SeqSample", minBinCount=15, binSize=1000) | ||
} | ||
\details{These are not to be called directly by the user} | ||
\keyword{internal} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
\name{seqDNAcopy-package} | ||
\alias{seqDNAcopy-package} | ||
\alias{seqDNAcopy} | ||
\docType{package} | ||
\title{ | ||
\packageTitle{seqDNAcopy} | ||
} | ||
\description{ | ||
\packageDescription{seqDNAcopy} | ||
} | ||
\details{ | ||
seqDNAcopy implements the CBS algorithm for sequencing data. It uses | ||
coverage depth for tumor and normal data to compute log-ratio which is | ||
first normalized for GC content and then segmented using segment | ||
function from DNAcopy. | ||
} | ||
\author{ | ||
\packageAuthor{seqDNAcopy} | ||
|
||
Maintainer: \packageMaintainer{seqDNAcopy} | ||
} | ||
\references{ | ||
Need to be added | ||
} | ||
\keyword{ package } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
// This file was generated by Rcpp::compileAttributes | ||
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
#include <Rcpp.h> | ||
|
||
using namespace Rcpp; | ||
|
||
// frags2counts | ||
List frags2counts(NumericVector nfmid, NumericVector tfmid, NumericVector rstart, NumericVector rend); | ||
RcppExport SEXP seqDNAcopy_frags2counts(SEXP nfmidSEXP, SEXP tfmidSEXP, SEXP rstartSEXP, SEXP rendSEXP) { | ||
BEGIN_RCPP | ||
Rcpp::RObject __result; | ||
Rcpp::RNGScope __rngScope; | ||
Rcpp::traits::input_parameter< NumericVector >::type nfmid(nfmidSEXP); | ||
Rcpp::traits::input_parameter< NumericVector >::type tfmid(tfmidSEXP); | ||
Rcpp::traits::input_parameter< NumericVector >::type rstart(rstartSEXP); | ||
Rcpp::traits::input_parameter< NumericVector >::type rend(rendSEXP); | ||
__result = Rcpp::wrap(frags2counts(nfmid, tfmid, rstart, rend)); | ||
return __result; | ||
END_RCPP | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#include <Rcpp.h> | ||
|
||
using namespace Rcpp; | ||
|
||
// [[Rcpp::export]] | ||
List frags2counts(NumericVector nfmid, NumericVector tfmid, NumericVector rstart, NumericVector rend) { | ||
|
||
// normal fragments that are discarded | ||
int n = nfmid.size(); | ||
NumericVector ndiscard(n); | ||
|
||
int i = 0; // first blacklisted region | ||
for(int j=0; j < n; ++j) { // loop through fragments | ||
if (nfmid[j] < rstart[i]) { // if fragmid before region start | ||
ndiscard[j] = 0; // keep fragment | ||
} else { // else check fragmid | ||
if (nfmid[j] <= rend[i]) { // if fragmid within region | ||
ndiscard[j] = 1; // discard fragment | ||
} else { // find the next region at or past it | ||
while (nfmid[j] > rend[i]) { // while fragmid exceeds region end | ||
i = i+1; // go to next region | ||
} // region change complete | ||
if (nfmid[j] < rstart[i]) { // if fragmid before region start | ||
ndiscard[j] = 0; // keep fragment | ||
} else { // else discard it | ||
ndiscard[j] = 1; // since we know nfmid[j] <= rend[i] | ||
} | ||
} | ||
} | ||
} | ||
|
||
// tumor fragments that are discarded | ||
n = tfmid.size(); | ||
NumericVector tdiscard(n); | ||
|
||
i = 0; // first blacklisted region | ||
for(int j=0; j < n; ++j) { // loop through fragments | ||
if (tfmid[j] < rstart[i]) { // if fragmid before region start | ||
tdiscard[j] = 0; // keep fragment | ||
} else { // else check fragmid | ||
if (tfmid[j] <= rend[i]) { // if fragmid within region | ||
tdiscard[j] = 1; // discard fragment | ||
} else { // find the next region at or past it | ||
while (tfmid[j] > rend[i]) { // while fragmid exceeds region end | ||
i = i+1; // go to next region | ||
} // region change complete | ||
if (tfmid[j] < rstart[i]) { // if fragmid before region start | ||
tdiscard[j] = 0; // keep fragment | ||
} else { // else discard it | ||
tdiscard[j] = 1; // since we know tfmid[j] <= rend[i] | ||
} | ||
} | ||
} | ||
} | ||
|
||
// chromosome < 2^28 bases; so at most 2^28/100 = 2684355 bins | ||
// use the valid fragments to create a count matrix | ||
|
||
int ll; | ||
NumericVector ncount(2684355); | ||
NumericVector tcount(2684355); | ||
NumericVector keep(2684355); // bins with non-zero counts | ||
|
||
for(int j=0; j < 2684355; ++j) { | ||
ncount[j] = 0; // initialize normal count to 0 | ||
tcount[j] = 0; // initialize tumor count to 0 | ||
keep[j] = 0; // initialize keep to 0 | ||
} | ||
|
||
n = nfmid.size(); | ||
for(int j=0; j < n; ++j) { // loop through normal fragments | ||
if (ndiscard[j] == 0) { | ||
ll = floor(nfmid[j]/100); | ||
ncount[ll] += 1; | ||
keep[ll] = 1; | ||
} | ||
} | ||
|
||
n = tfmid.size(); | ||
for(int j=0; j < n; ++j) { // loop through tumor fragments | ||
if (tdiscard[j] == 0) { | ||
ll = floor(tfmid[j]/100); | ||
tcount[ll] += 1; | ||
keep[ll] = 1; | ||
} | ||
} | ||
|
||
int nnzbins = 0; // non-zero bin counter | ||
for(int j=0; j < 2684355; ++j) { | ||
nnzbins += keep[j]; | ||
} | ||
|
||
NumericVector pos(nnzbins); | ||
NumericVector normal(nnzbins); | ||
NumericVector tumor(nnzbins); | ||
i = 0; | ||
for(int j=0; j < 2684355; ++j) { | ||
if (keep[j] == 1) { | ||
pos[i] = 50 + 100*j; | ||
normal[i] = ncount[j]; | ||
tumor[i] = tcount[j]; | ||
i += 1; | ||
} | ||
} | ||
|
||
return List::create(_["pos"] = pos, _["normal"] = normal, _["tumor"] = tumor); | ||
} |