Skip to content

Commit

Permalink
add functionality for parsing BISCUIT generated BAM files. add user p…
Browse files Browse the repository at this point in the history
…arameter to define which aligner was used
  • Loading branch information
EvgOz committed May 9, 2024
1 parent 4767f58 commit 1f8c27b
Show file tree
Hide file tree
Showing 10 changed files with 246 additions and 19 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: fetchNOMe
Type: Package
Title: Package for fast and convenient retrieval of NOMe-seq data from BAM files
Version: 0.1
Date: 2023-10-02
Version: 0.2
Date: 2024-05-09
Authors@R: person(given = "Evgeniy A.", family="Ozonov", email = "evgeniy.ozonov@fmi.ch", role = c("aut", "cre"))
Description: This R package is tailored for fast and convenient retrieval of NOMe-seq data from BAM files aligned with QuasR, Bismark and BISCUIT.
License: MIT + file LICENSE
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax) {
.Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax)
fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) {
.Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)
}

11 changes: 9 additions & 2 deletions R/get_data_matrix_from_bams.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#' \item{"otherC"}{methylation states of all other cytosines which do not fall into above categories.}
#' \item{"allC"}{methylation states of all cytosines combined.}
#' }
#'
#' @param alignerUsed \code{character} that defines which aligner was used to generate BAM files. Currently supported aligners are QuasR, Bismark and BISQUIT.
#' @param collapseBySample \code{logical} indicating whether to collapse counts for bam files with same \code{samplenames}. If \code{FALSE} prefix \code{s} followed by index is added to \code{samplenames}.
#' @param remove_nonunique \code{logical} if \code{TRUE} only unique fragments are analyzed. Uniqueness is defined by fragments start, end and methylation states of all cytosines.
#' @param clip_until_nbg \code{integer} controlling clipping partial footprints at both ends of fragments. Namely, protected states are erased from each end until \code{clip_until_nbg} unprotected states are met.
Expand Down Expand Up @@ -53,6 +53,7 @@ get_data_matrix_from_bams <- function(bamfiles,
regions,
genome,
whichContext = c("GCH","WCG","bisC","otherC", "allC"),
alignerUsed = c("QuasR","Bismark","BISQUIT"),
collapseBySample = TRUE,
remove_nonunique = TRUE,
clip_until_nbg = 1L,
Expand Down Expand Up @@ -85,6 +86,11 @@ get_data_matrix_from_bams <- function(bamfiles,

whichContext <- match.arg(whichContext)

alignerUsed <- match.arg(alignerUsed)

message(paste0("Fetching data from BAM files generated by ",alignerUsed,"."))


## remove metadata from regions
GenomicRanges::mcols(regions) <- NULL

Expand Down Expand Up @@ -132,7 +138,8 @@ get_data_matrix_from_bams <- function(bamfiles,
max_bisC_meth = max_bisC_meth,
min_bisC_size = min_bisC_size,
mapqMin = mapqMin,
mapqMax = mapqMax)
mapqMax = mapqMax,
alignerUsed = alignerUsed)

data_mat <- outlist[["DataMatrix"]]
# invert rows if strand is -
Expand Down
3 changes: 3 additions & 0 deletions man/get_data_matrix_from_bams.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// fetch_data_matrix_from_bams_cpp
Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax);
RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP) {
Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed);
RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -32,13 +32,14 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP);
Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP);
Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP);
rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax));
Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP);
rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 15},
{"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 16},
{NULL, NULL, 0}
};

Expand Down
198 changes: 193 additions & 5 deletions src/fetch_data_from_bam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ bool fetch_data_from_bam(const string& bamfile,
const string& regionChr,
const int& regionStart,
const int& regionEnd,
const alignerType& aligner,
obj_pnts* pnts){


Expand All @@ -26,8 +27,27 @@ bool fetch_data_from_bam(const string& bamfile,
if(regionChr.compare(fin->header->target_name[tid]) != 0)
Rcpp::stop("could not find target '" + regionChr + "' in bam header of " + bamfile + ".\n");

// call collectRegionData on store alignments in region
bam_fetch(fin->x.bam, idx, tid, regionStart, regionEnd, pnts, &collectRegionData);

// define a pointer to a callback retrival function
int (*collectRegionData)(const bam1_t *hit, void *data);

// choose callback function for data parsing based on aligner used
switch(aligner) {
case QuasR:
collectRegionData = &collectRegionData_QsBism;
break;
case Bismark:
collectRegionData = &collectRegionData_QsBism;
break;
case BISQUIT:
collectRegionData = &collectRegionData_Bisq;
break;
default:
Rcpp::stop("Only alignments generated by QuasR, Bismark or BISQUIT can be analyzed\n");
}

// call corresponding retrieval function to store alignments in region
bam_fetch(fin->x.bam, idx, tid, regionStart, regionEnd, pnts, collectRegionData);

// clean bam file objects
bam_index_destroy(idx);
Expand All @@ -36,8 +56,8 @@ bool fetch_data_from_bam(const string& bamfile,
return(1);
}


static int collectRegionData(const bam1_t *hit, void *data) { // bam_fetch callback function that parses XM tag in bismark bam and adds cooccurrence counts
// bam_fetch callback function for methyl calling and data storing for BAMs produced by QuasR and Bismark
static int collectRegionData_QsBism(const bam1_t *hit, void *data) {

static obj_pnts *pdata = NULL;

Expand Down Expand Up @@ -67,7 +87,9 @@ static int collectRegionData(const bam1_t *hit, void *data) { // bam_fetch callb
} else{ // parsing first mate or SE data, strand determined by read itself
msmType = is_read_rev ? GtoA : CtoT;
}
// Rcpp::Rcout<<"msmType="<<msmType<<endl;

// for testing only
//Rcpp::Rcout<<"qname="<<qname<<"\nis_second="<<is_second<<"\nis_read_rev="<<is_read_rev<<"\nis_mate_rev="<<is_mate_rev<<"\nmsmType="<<msmType<<endl;
//
// // for testing only
// fragData frg;
Expand Down Expand Up @@ -175,6 +197,172 @@ static int collectRegionData(const bam1_t *hit, void *data) { // bam_fetch callb
}



// bam_fetch callback function for methyl calling and data storing for BAMs produced by BISQUIT
static int collectRegionData_Bisq(const bam1_t *hit, void *data) {

static obj_pnts *pdata = NULL;

pdata = (obj_pnts*) data;


// check if alignment is valid
if(!isValidAln(hit,pdata))
return(0);
// for testing only
//print_bs_alignment(hit,pdata);

// parse alignment and create vectors for regData
string qname = string(bam1_qname(hit)); // frag name
uint32_t *cigar = bam_get_cigar(hit); // cigar array
uint8_t *hitseq = bam1_seq(hit); // query sequence
int ref_start = hit -> core.pos; // leftmost position in reference
int ref_end = bam_calend(&(hit->core),cigar); // rightmost position in reference
bool is_read_rev = bam_is_rev(hit); // is read on "+" strand
bool is_mate_rev = bam_is_mrev(hit);
bool is_second = bam_is_second(hit); // is it a second mate

// extract YD tag
uint8_t* ydTag = bam_aux_get(hit, "YD");
char ydValue;
if (ydTag) {
// The first byte is the type identifier, followed by the value
char type = bam_aux_type(ydTag);
if (type == 'A') {
// Get value
ydValue = bam_aux2A(ydTag);

} else {
Rcpp::Rcout << "Unexpected type for YD tag" << std::endl;
}

} else {
Rcpp::Rcout << "No YD tag found. Either BAM file is corrupted or generated not by BISQUIT." << std::endl;
}


// choose which mismatches to analyze
seekMismType msmType;
if(ydValue == 'f'){
msmType = CtoT;
} else if(ydValue == 'r'){
msmType = GtoA;
} else {
Rcpp::Rcout << "Unexpected value for YD tag: " <<ydValue<< std::endl;
}

// for testing only
// Rcpp::Rcout<<"qname="<<qname<<"\nis_second="<<is_second<<"\nis_read_rev="<<is_read_rev<<"\nis_mate_rev="<<is_mate_rev<<"\nmsmType="<<msmType<<"\nYD tag="<<ydValue<<endl;
//
// // for testing only
// fragData frg;

// go across contexts and add data
vector<fragMapType > context_vec = {GCH, WCG, bisC, otherC};
for(int i = 0; i < context_vec.size(); ++i){
// get positions for the context in reference
vector<int> cntx_rpos_vec = pdata->refseq_info->getPositionsBetween(ref_start,ref_end,context_vec[i], msmType == CtoT);

vector<int > rpos_vec;
vector<bool > dat_vec;
vector<bool > isplusstrand_vec;

for(int cind = 0; cind < cntx_rpos_vec.size(); ++cind){

// get query position
int qpos = _fromRefToQueryPos(ref_start,
cntx_rpos_vec[cind],
hit->core.n_cigar,
cigar);
// get nucleotide in query
string qseq = nt16_table[bam1_seqi(hitseq, qpos)];

// check if C->T mismatch if msmType == CtoT or GtoA otherwise. query on - in BAM are reverse complement
switch(msmType) {
case CtoT:
{
if(qseq == "C"){ // methylated or unprotected
// add position on reference
rpos_vec.push_back(cntx_rpos_vec[cind]);
// add data
dat_vec.push_back(context_vec[i] == GCH ? 0 : 1);
// add strand
isplusstrand_vec.push_back(!is_read_rev);

} else if(qseq == "T"){ // unmethylated or protected
// add position on reference
rpos_vec.push_back(cntx_rpos_vec[cind]);
// add data
dat_vec.push_back(context_vec[i] == GCH ? 1 : 0);
// add strand
isplusstrand_vec.push_back(!is_read_rev);
}
break;
}
case GtoA:
{
if(qseq == "G"){ // methylated or unprotected
// add position on reference
rpos_vec.push_back(cntx_rpos_vec[cind]);
// add data
dat_vec.push_back(context_vec[i] == GCH ? 0 : 1);
// add strand
isplusstrand_vec.push_back(!is_read_rev);

} else if(qseq == "A"){ // unmethylated or protected
// add position on reference
rpos_vec.push_back(cntx_rpos_vec[cind]);
// add data
dat_vec.push_back(context_vec[i] == GCH ? 1 : 0);
// add strand
isplusstrand_vec.push_back(!is_read_rev);
}

break;
}
}


}




// add data for the current context into regionData
pdata->reg_data->addFragContextData(pdata->prefix,
pdata->prefix + qname,
ref_start,
ref_end,
rpos_vec,
dat_vec,
isplusstrand_vec,
is_second,
context_vec[i]);

// for testing only
// frg.addData(ref_start,
// ref_end,
// rpos_vec,
// dat_vec,
// isplusstrand_vec,
// is_second,
// context_vec[i]);

}






return(0);

}





// checks validity of an alignment
bool isValidAln(const bam1_t *hit,const obj_pnts* pdata){

Expand Down
10 changes: 8 additions & 2 deletions src/fetch_data_from_bam.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,16 @@ bool fetch_data_from_bam(const string& bamfile,
const string& regionChr,
const int& regionStart,
const int& regionEnd,
const alignerType& aligner,
obj_pnts* pnts);

// bam_fetch callback function for bam_fetch
static int collectRegionData(const bam1_t *hit, void *data);
// bam_fetch callback function for bam_fetch for data retrieval from BAM files produced by QuasR and Bismark
static int collectRegionData_QsBism(const bam1_t *hit, void *data);

// bam_fetch callback function for bam_fetch for data retrieval from BAM files produced by BISQUIT
static int collectRegionData_Bisq(const bam1_t *hit, void *data);



// checks validity of an alignment
bool isValidAln(const bam1_t *hit,const obj_pnts* pdata);
Expand Down
16 changes: 15 additions & 1 deletion src/functions_export_rcpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon
const Rcpp::NumericVector& max_bisC_meth,
const Rcpp::IntegerVector& min_bisC_size,
const Rcpp::IntegerVector& mapqMin,
const Rcpp::IntegerVector& mapqMax){
const Rcpp::IntegerVector& mapqMax,
const Rcpp::CharacterVector& alignerUsed){

// convert input parameters
string whichContext_ = Rcpp::as<string>(whichContext);
Expand All @@ -34,6 +35,18 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon
else
Rcpp::stop("whichContext must be GCH, WCG, bisC, otherC or allC");

string alignerUsed_ = Rcpp::as<string>(alignerUsed);
alignerType aligner;
if(alignerUsed_ == "QuasR")
aligner = QuasR;
else if(alignerUsed_ == "Bismark")
aligner = Bismark;
else if(alignerUsed_ == "BISQUIT")
aligner = BISQUIT;
else
Rcpp::stop("Only alignments generated by QuasR, Bismark or BISQUIT can be analyzed\n");


vector<string > infiles_ = Rcpp::as<vector<string > >(infiles);
string regionChr_ = Rcpp::as<string >(regionChr);
int regionStart_ = Rcpp::as<int >(regionStart);
Expand Down Expand Up @@ -74,6 +87,7 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon
regionChr_,
regionStart_,
regionEnd_,
aligner,
&pnts);
}

Expand Down
Loading

0 comments on commit 1f8c27b

Please sign in to comment.