Skip to content

Commit

Permalink
optimize parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiasrausch committed Jun 7, 2024
1 parent b73803f commit 21f96f2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 24 deletions.
16 changes: 8 additions & 8 deletions src/gtf.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,6 @@ namespace dicey {
boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "GTF feature parsing" << std::endl;

// Check gzip
if (!is_gz(c.gtfFile)) {
std::cerr << "GTF file is not gzipped!" << std::endl;
return 0;
}

// Map IDs to integer
typedef std::map<std::string, int32_t> TIdMap;
TIdMap idMap;
Expand All @@ -103,9 +97,12 @@ namespace dicey {
int32_t eid = 0;

// Parse GTF
std::ifstream file(c.gtfFile.string().c_str(), std::ios_base::in | std::ios_base::binary);
std::ifstream file;
boost::iostreams::filtering_streambuf<boost::iostreams::input> dataIn;
dataIn.push(boost::iostreams::gzip_decompressor());
if (is_gz(c.gtfFile)) {
file.open(c.gtfFile.string().c_str(), std::ios_base::in | std::ios_base::binary);
dataIn.push(boost::iostreams::gzip_decompressor(), 16*1024);
} else file.open(c.gtfFile.string().c_str(), std::ios_base::in);
dataIn.push(file);
std::istream instream(&dataIn);
std::string gline;
Expand Down Expand Up @@ -215,6 +212,9 @@ namespace dicey {
}
}
}
dataIn.pop();
if (is_gz(c.gtfFile)) dataIn.pop();
file.close();
return geneInfo.size();
}

Expand Down
51 changes: 35 additions & 16 deletions src/padlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,29 +182,48 @@ namespace dicey
typedef std::vector<GeneInfo> TGeneInfo;
TGeneInfo geneInfo;
parseGTF(c, gRegions, geneInfo);

// Load barcodes
std::cout << '[' << boost::posix_time::to_simple_string(boost::posix_time::second_clock::local_time()) << "] " << "Load barcodes" << std::endl;
uint32_t numBarcodes = 0;
if (!numBarcodes) {
faidx_t* fai = fai_load(c.barcodes.string().c_str());
for(int32_t refIndex = 0; refIndex < faidx_nseq(fai); ++refIndex) {
int32_t seqlen;
char* seq = faidx_fetch_seq(fai, faidx_iseq(fai, refIndex), 0, 20, &seqlen);
if (seqlen != 20) {
std::cerr << "Invalid barcode length! Should be 20bp." << std::endl;
return 1;
}
if (numBarcodes < geneInfo.size()) {
geneInfo[numBarcodes].barcode = boost::to_upper_copy(std::string(seq));
geneInfo[numBarcodes].code = std::string(faidx_iseq(fai, refIndex));
// Open barcode file
std::ifstream barFile;
boost::iostreams::filtering_streambuf<boost::iostreams::input> dataIn;
if (is_gz(c.barcodes)) {
barFile.open(c.barcodes.string().c_str(), std::ios_base::in | std::ios_base::binary);
dataIn.push(boost::iostreams::gzip_decompressor(), 16*1024);
} else barFile.open(c.barcodes.string().c_str(), std::ios_base::in);
dataIn.push(barFile);
std::istream instream(&dataIn);
std::string line;
uint64_t lcount = 0;
std::string colcode;
std::string barcode;
while(std::getline(instream, line)) {
if (!line.empty()) {
if (lcount % 2 == 0) {
if (line[0] == '>') {
if (line.at(line.length() - 1) == '\r' ) colcode = line.substr(1, line.length() - 2);
else colcode = line.substr(1);
}
} else {
if (line.at(line.length() - 1) == '\r' ) barcode = boost::to_upper_copy(line.substr(0, line.length() - 1));
else barcode = boost::to_upper_copy(line);
if (numBarcodes < geneInfo.size()) {
geneInfo[numBarcodes].barcode = barcode;
geneInfo[numBarcodes].code = colcode;
++numBarcodes;
} else break;
}
++lcount;
}
++numBarcodes;
free(seq);
}
fai_destroy(fai);
dataIn.pop();
if (is_gz(c.barcodes)) dataIn.pop();
barFile.close();
}

// Reference index
csa_wt<> fm_index;
boost::filesystem::path op = c.genome.parent_path() / c.genome.stem();
Expand Down

0 comments on commit 21f96f2

Please sign in to comment.