From 21f96f2862ce0e3f23fbcb7306d9e8fe494fd37d Mon Sep 17 00:00:00 2001 From: Tobias Rausch Date: Fri, 7 Jun 2024 11:39:39 +0200 Subject: [PATCH] optimize parsing --- src/gtf.h | 16 ++++++++-------- src/padlock.h | 51 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/src/gtf.h b/src/gtf.h index 7a1b1eb..ff53663 100644 --- a/src/gtf.h +++ b/src/gtf.h @@ -89,12 +89,6 @@ namespace dicey { boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "GTF feature parsing" << std::endl; - // Check gzip - if (!is_gz(c.gtfFile)) { - std::cerr << "GTF file is not gzipped!" << std::endl; - return 0; - } - // Map IDs to integer typedef std::map TIdMap; TIdMap idMap; @@ -103,9 +97,12 @@ namespace dicey { int32_t eid = 0; // Parse GTF - std::ifstream file(c.gtfFile.string().c_str(), std::ios_base::in | std::ios_base::binary); + std::ifstream file; boost::iostreams::filtering_streambuf dataIn; - dataIn.push(boost::iostreams::gzip_decompressor()); + if (is_gz(c.gtfFile)) { + file.open(c.gtfFile.string().c_str(), std::ios_base::in | std::ios_base::binary); + dataIn.push(boost::iostreams::gzip_decompressor(), 16*1024); + } else file.open(c.gtfFile.string().c_str(), std::ios_base::in); dataIn.push(file); std::istream instream(&dataIn); std::string gline; @@ -215,6 +212,9 @@ namespace dicey { } } } + dataIn.pop(); + if (is_gz(c.gtfFile)) dataIn.pop(); + file.close(); return geneInfo.size(); } diff --git a/src/padlock.h b/src/padlock.h index 53062ec..68c7fb3 100644 --- a/src/padlock.h +++ b/src/padlock.h @@ -182,29 +182,48 @@ namespace dicey typedef std::vector TGeneInfo; TGeneInfo geneInfo; parseGTF(c, gRegions, geneInfo); - + // Load barcodes std::cout << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "Load barcodes" << std::endl; uint32_t numBarcodes = 0; if (!numBarcodes) { - faidx_t* fai = fai_load(c.barcodes.string().c_str()); - for(int32_t refIndex = 0; refIndex < faidx_nseq(fai); ++refIndex) { - int32_t seqlen; - char* seq = faidx_fetch_seq(fai, faidx_iseq(fai, refIndex), 0, 20, &seqlen); - if (seqlen != 20) { - std::cerr << "Invalid barcode length! Should be 20bp." << std::endl; - return 1; - } - if (numBarcodes < geneInfo.size()) { - geneInfo[numBarcodes].barcode = boost::to_upper_copy(std::string(seq)); - geneInfo[numBarcodes].code = std::string(faidx_iseq(fai, refIndex)); + // Open barcode file + std::ifstream barFile; + boost::iostreams::filtering_streambuf dataIn; + if (is_gz(c.barcodes)) { + barFile.open(c.barcodes.string().c_str(), std::ios_base::in | std::ios_base::binary); + dataIn.push(boost::iostreams::gzip_decompressor(), 16*1024); + } else barFile.open(c.barcodes.string().c_str(), std::ios_base::in); + dataIn.push(barFile); + std::istream instream(&dataIn); + std::string line; + uint64_t lcount = 0; + std::string colcode; + std::string barcode; + while(std::getline(instream, line)) { + if (!line.empty()) { + if (lcount % 2 == 0) { + if (line[0] == '>') { + if (line.at(line.length() - 1) == '\r' ) colcode = line.substr(1, line.length() - 2); + else colcode = line.substr(1); + } + } else { + if (line.at(line.length() - 1) == '\r' ) barcode = boost::to_upper_copy(line.substr(0, line.length() - 1)); + else barcode = boost::to_upper_copy(line); + if (numBarcodes < geneInfo.size()) { + geneInfo[numBarcodes].barcode = barcode; + geneInfo[numBarcodes].code = colcode; + ++numBarcodes; + } else break; + } + ++lcount; } - ++numBarcodes; - free(seq); } - fai_destroy(fai); + dataIn.pop(); + if (is_gz(c.barcodes)) dataIn.pop(); + barFile.close(); } - + // Reference index csa_wt<> fm_index; boost::filesystem::path op = c.genome.parent_path() / c.genome.stem();