Skip to content

Commit

Permalink
Merge pull request #4188 from vgteam/deconstruct
Browse files Browse the repository at this point in the history
add chains stats (-C)
  • Loading branch information
glennhickey authored Jan 15, 2024
2 parents 3566725 + 38f7824 commit 40cc426
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 56 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Please cite:
* [The GBZ Paper](https://doi.org/10.1093/bioinformatics/btad097) when using GBZ
* [The HPRC Paper](https://doi.org/10.1038/s41586-023-05896-x) when using `vg deconstruct`
* [The Snarls Paper](https://doi.org/10.1089/cmb.2017.0251) when using `vg snarls`
* [The Personalized Pangenome Paper](https://doi.org/10.1101/2023.12.13.571553) when using `vg haplotypes` and/or `vg giraffe --haplotype-name`

## Support

Expand Down
6 changes: 3 additions & 3 deletions scripts/mcmc_Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ clean:
rm -r $(TOIL_OS)

CHR21.fa:
wget https://courtyard.gi.ucsc.edu/~anovak/vg-data/bakeoff/CHR21.fa
wget https://public.gi.ucsc.edu/~anovak/vg-data/bakeoff/CHR21.fa

1kg_hg19-CHR21.vcf.gz:
wget https://courtyard.gi.ucsc.edu/~anovak/vg-data/bakeoff/1kg_hg19-CHR21.vcf.gz
wget https://public.gi.ucsc.edu/~anovak/vg-data/bakeoff/1kg_hg19-CHR21.vcf.gz

1kg_hg19-CHR21.vcf.gz.tbi:
wget https://courtyard.gi.ucsc.edu/~anovak/vg-data/bakeoff/1kg_hg19-CHR21.vcf.gz.tbi
wget https://public.gi.ucsc.edu/~anovak/vg-data/bakeoff/1kg_hg19-CHR21.vcf.gz.tbi
156 changes: 105 additions & 51 deletions src/subcommand/stats_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ void help_stats(char** argv) {
<< " multiple allowed; limit comparison to those provided" << endl
<< " -O, --overlap-all print overlap table for the cartesian product of paths" << endl
<< " -R, --snarls print statistics for each snarl" << endl
<< " -C, --chains print statistics for each chain" << endl
<< " -F, --format graph format from {VG-Protobuf, PackedGraph, HashGraph, XG}. " <<
"Can't detect Protobuf if graph read from stdin" << endl
<< " -D, --degree-dist print degree distribution of the graph." << endl
Expand Down Expand Up @@ -102,6 +103,7 @@ int main_stats(int argc, char** argv) {
vector<string> paths_to_overlap;
bool overlap_all_paths = false;
bool snarl_stats = false;
bool chain_stats = false;
bool format = false;
bool degree_dist = false;
string distance_index_filename;
Expand Down Expand Up @@ -132,6 +134,7 @@ int main_stats(int argc, char** argv) {
{"overlap", no_argument, 0, 'o'},
{"overlap-all", no_argument, 0, 'O'},
{"snarls", no_argument, 0, 'R'},
{"chains", no_argument, 0, 'C'},
{"format", no_argument, 0, 'F'},
{"degree-dist", no_argument, 0, 'D'},
{"dist-snarls", required_argument, 0, 'b'},
Expand All @@ -140,7 +143,7 @@ int main_stats(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORFDb:p:",
c = getopt_long (argc, argv, "hzlLsHTecdtn:NEa:vAro:ORCFDb:p:",
long_options, &option_index);

// Detect the end of the options.
Expand Down Expand Up @@ -229,6 +232,10 @@ int main_stats(int argc, char** argv) {
snarl_stats = true;
break;

case 'C':
chain_stats = true;
break;

case 'v':
verbose = true;
break;
Expand Down Expand Up @@ -1088,49 +1095,55 @@ int main_stats(int argc, char** argv) {


}

SnarlManager manager; // todo: option to read snarls
// We will track depth for each snarl (used for both snarl and chains stats)
unordered_map<const Snarl*, size_t> depth;

if (snarl_stats) {
if (snarl_stats || chain_stats) {
// We will go through all the snarls and compute stats.

require_graph();

// First compute the snarls
auto manager = IntegratedSnarlFinder(*graph).find_snarls_parallel();
manager = IntegratedSnarlFinder(*graph).find_snarls_parallel();

// We will track depth for each snarl
unordered_map<const Snarl*, size_t> depth;

// TSV header
cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n";
if (snarl_stats) {
// TSV header
cout << "Start\tStart-Reversed\tEnd\tEnd-Reversed\tUltrabubble\tUnary\tShallow-Nodes\tShallow-Edges\tShallow-bases\tDeep-Nodes\tDeep-Edges\tDeep-Bases\tDepth\tChildren\tChains\tChains-Children\tNet-Graph-Size\n";
}

manager.for_each_snarl_preorder([&](const Snarl* snarl) {
// Loop over all the snarls and print stats.

// snarl
cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t";
cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t";
if (snarl_stats) {
// snarl
cout << snarl->start().node_id() << "\t" << snarl->start().backward() << "\t";
cout << snarl->end().node_id() << "\t" << snarl->end().backward() << "\t";

// Snarl metadata
cout << (snarl->type() == ULTRABUBBLE) << "\t";
cout << (snarl->type() == UNARY) << "\t";

// Snarl size not including boundary nodes
pair<unordered_set<vg::id_t>, unordered_set<vg::edge_t> > contents = manager.shallow_contents(snarl, *graph, false);
size_t num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";
contents = manager.deep_contents(snarl, *graph, false);
num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
// Snarl metadata
cout << (snarl->type() == ULTRABUBBLE) << "\t";
cout << (snarl->type() == UNARY) << "\t";

// Snarl size not including boundary nodes
pair<unordered_set<vg::id_t>, unordered_set<vg::edge_t> > contents = manager.shallow_contents(snarl, *graph, false);
size_t num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";
contents = manager.deep_contents(snarl, *graph, false);
num_bases = 0;
for (vg::id_t node_id : contents.first) {
num_bases += graph->get_length(graph->get_handle(node_id));
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";
}
cout << contents.first.size() << "\t";
cout << contents.second.size() << "\t";
cout << num_bases << "\t";

// Compute depth
auto parent = manager.parent_of(snarl);
Expand All @@ -1140,35 +1153,76 @@ int main_stats(int argc, char** argv) {
} else {
depth[snarl] = depth[parent] + 1;
}
cout << depth[snarl] << "\t";

if (snarl_stats) {
cout << depth[snarl] << "\t";

// Number of children (looking inside chains)
cout << manager.children_of(snarl).size() << "\t";
// Number of children (looking inside chains)
cout << manager.children_of(snarl).size() << "\t";

// Number of chains (including unary child snarls)
// Will be 0 for leaves
auto chains = manager.chains_of(snarl);
cout << chains.size() << "\t";

for (size_t i = 0; i < chains.size(); ++i) {
// Number of children in each chain
cout << chains[i].size();
if (i < chains.size() - 1) {
cout << ",";
// Number of chains (including unary child snarls)
// Will be 0 for leaves
auto chains = manager.chains_of(snarl);
cout << chains.size() << "\t";

for (size_t i = 0; i < chains.size(); ++i) {
// Number of children in each chain
cout << chains[i].size();
if (i < chains.size() - 1) {
cout << ",";
}
}
if (chains.empty()) {
cout << "0";
}
cout << "\t";

// Net graph info
// Internal connectivity not important, we just want the size.
auto netGraph = manager.net_graph_of(snarl, graph, false);
cout << netGraph.get_node_count() << endl;
}
if (chains.empty()) {
cout << "0";
});

}

if (chain_stats) {
// We will go through all the chains and compute stats.


// TSV header
cout << "Snarl1-Start\tSSnarl1-Start-Reversed\tSnarl1-End\tSnarl1-End-Reversed\tSnarl1-Reversed\tSnarl2-Start\tSSnarl2-Start-Reversed\tSnarl2-End\tSnarl2-End-Reversed\tSnarl2-Reversed\tSnarl-Count\tMax-Depth\tChild-Snarls\tChild-Chains\n";

manager.for_each_chain([&](const Chain* chain) {
// Loop over all the snarls and print stats.

// snarl endpoints
cout << chain->front().first->start().node_id() << "\t" << chain->front().first->start().backward() << "\t"
<< chain->front().first->end().node_id() << "\t" << chain->front().first->end().backward() << "\t"
<< chain->front().second << "\t";
cout << chain->back().first->start().node_id() << "\t" << chain->back().first->start().backward() << "\t"
<< chain->back().first->end().node_id() << "\t" << chain->back().first->end().backward() << "\t"
<< chain->back().second << "\t";

// snarl count
cout << chain->size() << "\t";

int64_t max_depth = 0;
int64_t child_snarls = 0;
int64_t child_chains = 0;
for (const auto& sr : *chain) {
const Snarl* snarl = sr.first;
max_depth = max(max_depth, (int64_t)depth.at(snarl));
child_snarls += manager.children_of(snarl).size();
child_chains += manager.chains_of(snarl).size();
}
cout << "\t";

// Net graph info
// Internal connectivity not important, we just want the size.
auto netGraph = manager.net_graph_of(snarl, graph, false);
cout << netGraph.get_node_count() << endl;
cout << max_depth << "\t" << child_snarls << "\t" << child_chains;

cout << endl;
});

}


if (!distance_index_filename.empty()) {
//Print snarl stats from a distance index
Expand Down
4 changes: 2 additions & 2 deletions vgci/vgci.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def setUp(self):
# when moving to a more inclusive reference?
self.worse_threshold = 0.005
# /public/groups/vg/vg-data on Courtyard is served as
# https://courtyard.gi.ucsc.edu/~anovak/vg-data/
self.vg_data = 'https://courtyard.gi.ucsc.edu/~anovak/vg-data'
# https://public.gi.ucsc.edu/~anovak/vg-data/
self.vg_data = 'https://public.gi.ucsc.edu/~anovak/vg-data'
self.input_store = self.vg_data + '/bakeoff'
self.vg_docker = None
self.container = None # Use default in toil-vg, which is Docker
Expand Down

2 comments on commit 40cc426

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17341 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.54.0. View the full report here.

10 tests passed, 6 tests failed and 0 tests skipped in 16811 seconds

Failed tests:

  • test_sim_chr21_snp1kg (3862 seconds)
  • test_sim_mhc_cactus (781 seconds)
  • test_sim_mhc_snp1kg (986 seconds)
  • test_sim_mhc_snp1kg_mpmap (1023 seconds)
  • test_sim_chr21_snp1kg_trained (2692 seconds)
  • test_sim_yeast_cactus (1444 seconds)

Please sign in to comment.