diff --git a/.gitignore b/.gitignore index 4ecb3b22ce..8fa097362e 100755 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ anvio/data/misc/SCG_TAXONOMY/GTDB/SCG_SEARCH_DATABASES/*.dmnd anvio/tests/sandbox/update-contigs-and-bams-for-mini-test/output anvio/tests/sandbox/test_visualize_split_coverages/TEST_OUTDIR anvio/data/misc/KEGG/ +anvio/data/misc/PROSTT5/ anvio/data/misc/TRNA_TAXONOMY/ +anvio/data/interactive/node_modules +anvio/data/interactive/package-lock.json \ No newline at end of file diff --git a/anvio/__init__.py b/anvio/__init__.py index 5790b74d80..dcb6dbfdd0 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -3687,7 +3687,46 @@ def TABULATE(table, header, numalign="right", max_width=0): "a comma-separated list. The default stats are 'detection' and " "'mean_coverage_Q2Q3'. To see a list of available stats, use this flag " "and provide an absolutely ridiculous string after it (we suggest 'cattywampus', but you do you)."} - ) + ), + 'pan-mode': ( + ['--pan-mode'], + {'default': constants.pangenome_mode_default, + 'choices': constants.pangenome_modes_available, + 'type': str, + 'help': f"Computation mode for the pangenome. Default mode is '{constants.pangenome_mode_default}'."} + ), + 'prostt5-data-dir': ( + ['--prostt5-data-dir'], + {'default': None, + 'type': str, + 'metavar': 'PATH', + 'help': "The path for the PROSTT5 Weights to be stored. " + "If you leave it as is without specifying anything, anvi'o will set up everything in " + "a pre-defined default directory. The advantage of using " + "the default directory at the time of set up is that every user of anvi'o on a computer " + "system will be using a single data directory, but then you may need to run the setup " + "program with superuser privileges. If you don't have superuser privileges, then you can " + "use this parameter to tell anvi'o the location you wish to use to setup your weights."} + ), + 'foldseek-search-results': ( + ['--foldseek-search-results'], + {'default': None, + 'type': str, + 'metavar': 'FILE PATH', + 'help': "File path for foldseek search results generated by anvi'o during a previous run. When " + "a file path is provided, anvi'o will try to utilize that output file rather than " + "re-running foldseek. This is mostly for debugging purposes and we strongly advice " + "you to not use it unless you consider yourself to be a hacker-type."} + ), + 'init-pan-mode': ( + ['--init-pan-mode'], + {'default': constants.pangenome_mode_default, + 'choices': constants.pangenome_modes_available, + 'type': str, + 'help': "Specify the structure type to include additional data tables for the Structural Pangenome summary. " + "This option allows you to enhance the analysis by integrating specific structural data, " + "which can provide deeper insights into the pangenomic relationships among the samples."} + ), } # two functions that works with the dictionary above. diff --git a/anvio/bottleroutes.py b/anvio/bottleroutes.py index e4e95ddb03..f3c91c84dc 100644 --- a/anvio/bottleroutes.py +++ b/anvio/bottleroutes.py @@ -196,6 +196,8 @@ def register_routes(self): self.route('/data/get_gene_info/', callback=self.get_gene_info) self.route('/data/get_metabolism', callback=self.get_metabolism) self.route('/data/get_scale_bar', callback=self.get_scale_bar, method='POST') + self.route('/data/get_psgc_data/', callback=self.get_psgc_data) + self.route('/data/get_psgc_type_data/', callback=self.get_psgc_type_data) def run_application(self, ip, port): @@ -1434,6 +1436,55 @@ def get_additional_gc_data(self, gc_id, gc_key): except: return json.dumps({'status': 1}) + + def get_psgc_data(self, psgc_name): + """Gets PSGC data for the inspection page""" + + try: + psgc_data = { + psgc_name: [] + } + + associated_gcs = [] + for gc_id, psgc_id in self.interactive.gc_psgc_associations.items(): + if psgc_id == psgc_name: + associated_gcs.append(gc_id) + + for gene_id, gene_entries in self.interactive.gc_tracker.items(): + for gene_data in gene_entries: + gc_id = gene_data['gene_cluster_id'] + + # Skip if this gene's cluster is not associated with our PSGC + # Its important to do this before we add the gene info to psgc_data + if gc_id not in associated_gcs: + continue + + gene_info = { + 'gene_callers_id': gene_id, + 'gene_cluster_id': gc_id, + 'genome_name': gene_data['genome_name'], + 'alignment_summary': gene_data['alignment_summary'] + } + psgc_data[psgc_name].append(gene_info) + + return json.dumps({'status': 0, 'data': psgc_data}) + + except Exception as e: + return json.dumps({'status': 1, 'message': f"Error getting PSGC data: {str(e)}"}) + + + def get_psgc_type_data(self, psgc_name): + """Gets GC type classification data for PSGC (core/singleton/accessory)""" + try: + gc_types = {} + if psgc_name in self.interactive.items_additional_data_dict: + gc_types = json.loads(self.interactive.items_additional_data_dict[psgc_name].get('gc_types', '{}')) + + return json.dumps({'status': 0, 'data': {psgc_name: gc_types}}) + except Exception as e: + return json.dumps({'status': 1, 'message': f"Error getting PSGC type data: {str(e)}"}) + + def reroot_tree(self): # Get the Newick tree string from the form data newick = request.forms.get('newick') diff --git a/anvio/constants.py b/anvio/constants.py index 139fe0f321..aa8eef75c0 100644 --- a/anvio/constants.py +++ b/anvio/constants.py @@ -32,12 +32,23 @@ # and other purposes IS_ESSENTIAL_FIELD = lambda f: (not f.startswith('__')) and (f not in ["contig", "GC_content", "length"]) +# MODELLER / PDF file paths default_pdb_database_path = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/PDB.db') default_modeller_database_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/db') default_modeller_scripts_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/scripts') +# Interacdome data path default_interacdome_data_path = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/Interacdome') +# ProstT5 data files +default_prostt5_weight_path = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/PROSTT5/weights') + +# Pangenome mode trackers +PAN_STRUCTURE_MODE = 'structure-informed' +PAN_SEQUENCE_MODE = 'sequence-based' +pangenome_modes_available = [PAN_SEQUENCE_MODE, PAN_STRUCTURE_MODE] +pangenome_mode_default = PAN_SEQUENCE_MODE + clustering_configs_dir = os.path.join(os.path.dirname(anvio.__file__), 'data/clusterconfigs') clustering_configs = {} diff --git a/anvio/data/interactive/css/geneclusters.css b/anvio/data/interactive/css/geneclusters.css index 1b1ad97caa..4a9caebe8a 100755 --- a/anvio/data/interactive/css/geneclusters.css +++ b/anvio/data/interactive/css/geneclusters.css @@ -1,4 +1,5 @@ @import url('normalize.css'); +@import url('variable.css'); #chart-container { width: 1200px; @@ -227,6 +228,48 @@ td { padding-bottom: 10px; } +.type-core { + fill: var(--success-600); +} + +.type-accessory { + fill: var(--warning-600); +} + +.type-singleton { + fill: var(--info-600); +} + +#display-search-controls { + padding: 12px; + margin-left: auto; + margin-right: auto; + background-color: #f5f5f5; + border-radius: 9px; + margin-bottom: 17px; +} + +.btn-link { + background: none; + border: none; + padding: 0; + color: inherit; + font-size: 1.5rem; + cursor: pointer; +} + +.btn-link:hover { + color: var(--warning-600); +} + +.display-text { + font-size: 1rem; +} + +#info-icon { + font-size: 1.2rem; +} + @media (min-width: 768px) { .gc-container { padding: 0 2rem; diff --git a/anvio/data/interactive/geneclusters.html b/anvio/data/interactive/geneclusters.html index be55cd8855..703ad6d03f 100644 --- a/anvio/data/interactive/geneclusters.html +++ b/anvio/data/interactive/geneclusters.html @@ -10,6 +10,7 @@ + @@ -40,19 +41,45 @@

- Wrap: - - -   - Font Size: - - - + Wrap: + + + +   + Font Size: + + + + +
+
+ +
@@ -80,5 +107,30 @@

$('#' + id).val(Math.max(1,parseInt($('#' + id).val()) + delta)); } + + + diff --git a/anvio/data/interactive/js/bin.js b/anvio/data/interactive/js/bin.js index 097dc13aeb..db699f0ecd 100644 --- a/anvio/data/interactive/js/bin.js +++ b/anvio/data/interactive/js/bin.js @@ -76,24 +76,24 @@ Bins.prototype.NewBin = function(id, binState) { var template = ` - + - ${mode != 'pan' ? ` + ${mode !== 'pan' && mode !== 'structure' ? ` ` : ''} - ${mode == 'pan' ? ` + ${mode === 'pan' || mode === 'structure' ? ` ` : ` - + `} - ${ mode === 'full' || mode === 'refine' || mode === 'manual' || mode === 'pan' ? ` + ${ mode === 'full' || mode === 'refine' || mode === 'manual' || mode === 'pan' || mode === 'structure' ? ``; + if (typeof col2 === 'string') { + try { + var parsedCol2 = JSON.parse(col2); + + if (Array.isArray(parsedCol2)) { + col2 = parsedCol2.map(item => `Item: ${item}`).join('
'); + } else if (typeof parsedCol2 === 'object') { + col2 = Object.entries(parsedCol2).map(([key, value]) => `${key}: ${value}`).join('
'); + } else { + col2 = `Value: ${parsedCol2}`; + } + } catch (e) { + console.log(e) + } + } + + rows = rows + ``; } $("#tbody_search_body").html(rows); } diff --git a/anvio/data/interactive/js/utils.js b/anvio/data/interactive/js/utils.js index 14e52891f1..a41d968908 100644 --- a/anvio/data/interactive/js/utils.js +++ b/anvio/data/interactive/js/utils.js @@ -394,7 +394,7 @@ function showGeneClusterFunctionsSummaryTableDialog(title, content)

Please note that this is just a quick view of the functions associated with your gene clusters. A much more appropriate way to summarize this information and more is to use the program anvi-summarize, and inspect the resulting TAB-delimited output file

-
${contig_length}
    @@ -470,7 +470,7 @@ Bins.prototype.UpdateBinsWindow = function(bin_list) { for (let i = 0; i < bin_list.length; i++) { let bin_id = bin_list[i]; - if (mode == 'pan') { + if (mode === 'pan' || mode === 'structure') { let num_gene_clusters = 0; let num_gene_calls = 0; diff --git a/anvio/data/interactive/js/color-coding.js b/anvio/data/interactive/js/color-coding.js index e353ec06de..fb38f128ba 100644 --- a/anvio/data/interactive/js/color-coding.js +++ b/anvio/data/interactive/js/color-coding.js @@ -329,6 +329,7 @@ function initializeCheckBoxes(){ var all = document.createElement("button"); all.innerHTML = "Check All"; + all.className = "btn btn-outline-secondary btn-sm"; all.style = "margin-right:20px;" all.onclick = (function() { for (i = 0; i < labels.length; i++){ @@ -341,6 +342,7 @@ function initializeCheckBoxes(){ var none = document.createElement("button") none.innerHTML = "Uncheck All" + none.className = "btn btn-outline-secondary btn-sm"; none.onclick = (function() { for (i = 0; i < labels.length; i++){ var letter = String(labels[i]); diff --git a/anvio/data/interactive/js/constants.js b/anvio/data/interactive/js/constants.js index 00860c5f12..96ced32b8d 100644 --- a/anvio/data/interactive/js/constants.js +++ b/anvio/data/interactive/js/constants.js @@ -534,6 +534,48 @@ var named_layers = { 'norm': 'none', 'pretty_name': '_', }, + 'gene_types!core': { + 'height': 200, + 'color': '#440000', + 'norm': 'none', + 'min': 0, + 'max': 100, + 'max_disabled': false, + 'min_disabled': false, + 'pretty_name': 'Gene Types', + 'type': 'bar' + }, + 'gene_types!singleton': { + 'height': 200, + 'color': '#440000', + 'norm': 'none', + 'min': 0, + 'max': 100, + 'max_disabled': false, + 'min_disabled': false, + 'pretty_name': 'Gene Types', + 'type': 'bar' + }, + 'gene_types!accessory': { + 'height': 200, + 'color': '#440000', + 'norm': 'none', + 'min': 0, + 'max': 100, + 'max_disabled': false, + 'min_disabled': false, + 'pretty_name': 'Gene Types', + 'type': 'bar' + }, + 'psgc_composition': { + 'type': 'bar', + 'pretty_name': 'GCs in PSGC' + }, + 'number_gc_in_psgc': { + 'height': 200, + 'color': '#ffa60d', + 'pretty_name': 'Num GC in PSGC' + }, }; named_category_colors = { @@ -577,6 +619,12 @@ pretty_names = { }; function getPrettyLayerTitle(layer_title) { + if (layer_title.indexOf('!') > -1 ) + { + layer_title = layer_title.split('!')[0]; + } + + if (layer_title in named_layers && 'pretty_name' in named_layers[layer_title]) { layer_title = named_layers[layer_title]['pretty_name']; } else if(layer_title.substring(0, 5) == "hmmx_") { @@ -587,11 +635,6 @@ function getPrettyLayerTitle(layer_title) { layer_title = layer_title.replace(/_/g, " "); } - if (layer_title.indexOf('!') > -1 ) - { - layer_title = layer_title.split('!')[0]; - } - return layer_title; } diff --git a/anvio/data/interactive/js/drawer.js b/anvio/data/interactive/js/drawer.js index 27dfa3f780..5986952583 100644 --- a/anvio/data/interactive/js/drawer.js +++ b/anvio/data/interactive/js/drawer.js @@ -262,7 +262,7 @@ Drawer.prototype.generate_tooltips = function() { let stack_names = layerdata[0][pindex].split('!')[1].split(';'); let stack_items = layerdata[index][pindex].split(';'); let layer_title = layerdata[0][pindex]; - message = '' + layer_title.split('!')[0] + ''; + message = '
' + getPrettyName(layer_title.split('!')[0]) + ''; for (let j = stack_names.length - 1; j >= 0; j--) { let bar_name = stack_names[j]; message += ``; @@ -274,7 +274,7 @@ Drawer.prototype.generate_tooltips = function() { else { var layer_title = layerdata[0][pindex]; - title.push(''); + title.push(''); } } diff --git a/anvio/data/interactive/js/geneclusters.js b/anvio/data/interactive/js/geneclusters.js index 2bff1888d1..001f244ff4 100644 --- a/anvio/data/interactive/js/geneclusters.js +++ b/anvio/data/interactive/js/geneclusters.js @@ -32,7 +32,9 @@ var total; var state; var gene_cluster_data; - +var psgc_data = null; +var mode = null; +var filter_genome = []; function loadAll() { $.ajaxPrefilter(function(options) { @@ -86,7 +88,8 @@ function loadAll() { if(previous_gene_cluster_name) prev_str = '<<< prev | '; - document.getElementById("header").innerHTML = "" + gene_cluster_name + " with " + gene_caller_ids.length + " genes detailed
" + prev_str + position + next_str + ""; + const headerContent = `${gene_cluster_name} with ${gene_caller_ids.length} genes detailed
${prev_str} ${position} ${next_str}`; + document.getElementById("header").innerHTML = headerContent; if (typeof localStorage.state === 'undefined') { @@ -110,7 +113,7 @@ async function loadGCAdditionalData(gc_id, gc_key, gc_key_short) { }); if (response['status'] === 0) { var newThHeader = $('' + + var template = '' + '' + '' + '' + '' + '
${bar_name}
${stack_items[j]}
' + layer_title + '' + layerdata[index][pindex] + '' + getPrettyName(layer_title) + '' + layerdata[index][pindex] + '').text(gc_key_short); - var newThData = $('').text((response.gene_cluster_data).toFixed(2)); + var newThData = $('').text(gc_key === 'num_genes_in_gene_cluster' || gc_key === 'num_genomes_gene_cluster_has_hits' ? Math.round(response.gene_cluster_data): (response.gene_cluster_data).toFixed(2)); var gc_title_list = { combined_homogeneity_index: "Combined Homogeneity Index", @@ -148,6 +151,17 @@ async function createDisplay(display_table){ var svg = document.getElementById('svg'); var table = document.getElementById('gc-acc-main'); + try { + psgc_data =await loadPSGCData(gene_cluster_data.gene_cluster_name); + + if (psgc_data) { + mode = 'structure'; + gc_type_in_psgc = await loadGCTypeInPSGCData(gene_cluster_data.gene_cluster_name); + } + } catch (error) { + console.error('Error loading PSGC data:', error); + } + // clear content while (svg.firstChild) { svg.removeChild(svg.firstChild); @@ -202,19 +216,18 @@ async function createDisplay(display_table){ } var coded_positions = determineColor(all_positions); - while (true) - { - ignoreTable = true; + // Ensure this loop is only called once + if (!window.hasRunLayerLoop) { var fragment = document.createDocumentFragment(); - for (var layer_id = 0; layer_id < state['layer-order'].length; layer_id++) - { + for (var layer_id = 0; layer_id < state['layer-order'].length; layer_id++) { var layer = state['layer-order'][layer_id]; if (state['layers'][layer]['height'] == 0) continue; - if (gene_cluster_data.genomes.indexOf(layer) === -1) + if (gene_cluster_data.genomes.indexOf(layer) === -1 || (filter_genome.length > 0 && !filter_genome.includes(layer) && !filter_genome.some(term => layer.includes(term)))) { continue; + } var rect = document.createElementNS('http://www.w3.org/2000/svg', 'rect'); rect.setAttribute('x', 0); @@ -235,8 +248,14 @@ async function createDisplay(display_table){ text.appendChild(document.createTextNode(layer)); fragment.appendChild(text); - sub_y_cord = y_cord + 5; - gene_cluster_data.gene_caller_ids_in_genomes[layer].forEach(function (caller_id) { + if (navigator.userAgent.indexOf("Firefox") !== -1) { + // Firefox uses a different baseline for text + sub_y_cord = y_cord + (sequence_font_size * 1.5); + } else { + sub_y_cord = y_cord + 5; + } + + gene_cluster_data.gene_caller_ids_in_genomes[layer].forEach(function (caller_id) { sequence = gene_cluster_data.aa_sequences_in_gene_cluster[layer][caller_id]; var text = document.createElementNS('http://www.w3.org/2000/svg', 'text'); text.setAttribute('x', 0); @@ -259,46 +278,105 @@ async function createDisplay(display_table){ text.appendChild(document.createTextNode(caller_id)); fragment.appendChild(text); + if (mode === 'structure') { + let gc_id = ''; + + if (psgc_data && Object.keys(psgc_data).length > 0) { + for (var psgc_id in psgc_data) { + var matchingGene = psgc_data[psgc_id].find(gene => + gene.gene_callers_id === caller_id + ); + + if (matchingGene) { + gc_id = matchingGene.gene_cluster_id; + break; + } + } + } + + // Get the type indicator [A] || [C] || [S] if type data is available + let type_indicator = ''; + let type_class = ''; + if (gc_type_in_psgc) { + for (let psgc_id in gc_type_in_psgc) { + const type = gc_type_in_psgc[psgc_id][gc_id]; + + if (type === 'core') { + type_indicator = 'C'; + type_class = 'type-core'; + } + else if (type === 'accessory') { + type_indicator = 'A'; + type_class = 'type-accessory'; + } + else if (type === 'singleton') { + type_indicator = 'S'; + type_class = 'type-singleton'; + } + + break; + } + } + + var text = document.createElementNS('http://www.w3.org/2000/svg', 'text'); + text.setAttribute('x', 0); + text.setAttribute('y', sub_y_cord); + text.setAttribute('font-size', sequence_font_size); + text.setAttribute('style', 'alignment-baseline:text-before-edge; cursor: pointer;'); + text.setAttribute('class', 'callerTitle gcName'); + text.setAttribute('gc-caller-id', gc_id); + text.setAttribute('genome-name', layer); + text.setAttribute('data-toggle', 'popover'); + text.onclick = function(event) { + var obj = event.target; + if (!obj.getAttribute('data-content')) { + obj.setAttribute('data-content', get_gene_functions_table_html_for_structure(psgc_data, gc_id) + ''); + } + }; + + text.appendChild(document.createTextNode(gc_id)); + + if (type_indicator) { + var typeSpan = document.createElementNS('http://www.w3.org/2000/svg', 'tspan'); + typeSpan.setAttribute('class', type_class); + typeSpan.setAttribute('style', 'alignment-baseline:text-before-edge; cursor: text;'); + typeSpan.setAttribute('dy', '0'); + typeSpan.appendChild(document.createTextNode(' [' + type_indicator + ']')); + text.appendChild(typeSpan); + } + fragment.appendChild(text); + } + var text = document.createElementNS('http://www.w3.org/2000/svg', 'text'); text.setAttribute('x', 0); text.setAttribute('y', sub_y_cord); text.setAttribute('font-size', sequence_font_size); - text.setAttribute('font-family',"monospace"); + text.setAttribute('font-family', "monospace"); text.setAttribute('font-weight', '100'); text.setAttribute('style', 'alignment-baseline:text-before-edge'); text.setAttribute('class', 'sequence'); _sequence = sequence.substr(offset, sequence_wrap); - for (var _letter_index=0; _letter_index < _sequence.length; _letter_index++) { - var tspan = document.createElementNS('http://www.w3.org/2000/svg', 'tspan'); - var index = _letter_index+offset; + var tspanContent = ''; + for (var _letter_index = 0; _letter_index < _sequence.length; _letter_index++) { + var index = _letter_index + offset; var num = order[caller_id]; var acid = _sequence[_letter_index]; var dict = coded_positions[index][num]; - tspan.setAttribute('fill', dict[acid]); - tspan.style.fontWeight = 'bold'; - tspan.appendChild(document.createTextNode(acid)); - tspan.setAttribute('style', 'alignment-baseline:text-before-edge'); - text.appendChild(tspan); - } + tspanContent += `${acid}`; + } + text.innerHTML = tspanContent; fragment.appendChild(text); sub_y_cord = sub_y_cord + sequence_font_size * 1.5; - if (offset < sequence.length) { - ignoreTable = false; - } }); y_cord = y_cord + parseFloat(rect.getAttribute('height')) + 5; } - if (ignoreTable) { - break; - } else { - svg.appendChild(fragment); - } + svg.appendChild(fragment); offset += sequence_wrap; y_cord = y_cord + 15; } @@ -336,24 +414,42 @@ function calculateLayout() { $('.callerTitle').attr('x', max_genome_title_width + (HORIZONTAL_PADDING * 2)); var max_caller_width = 0; - $('.callerTitle').each(function(i, d) { + $('.callerTitle:not(.gcName)').each(function(i, d) { dwidth = d.getBBox().width; if (dwidth > max_caller_width) { max_caller_width = dwidth; }; }); - $('.sequence').attr('x', max_genome_title_width + max_caller_width + (HORIZONTAL_PADDING * 3)); + var max_gc_name_width = 0; + var sequence_start_x; - var max_sequence_width = 0; - $('.sequence').each(function(i, d) { - dwidth = d.getBBox().width; - if (dwidth > max_sequence_width) { - max_sequence_width = dwidth; - }; - }); + if (mode === 'structure') { + $('.gcName').attr('x', max_genome_title_width + max_caller_width + (HORIZONTAL_PADDING * 3)); + $('.gcName').each(function(i, d) { + dwidth = d.getBBox().width; + if (dwidth > max_gc_name_width) { + max_gc_name_width = dwidth; + }; + }); + sequence_start_x = max_genome_title_width + max_caller_width + max_gc_name_width + (HORIZONTAL_PADDING * 4); + $('.sequence').attr('x', sequence_start_x); + } else { + sequence_start_x = max_genome_title_width + max_caller_width + (HORIZONTAL_PADDING * 3); + $('.sequence').attr('x', sequence_start_x); + } + + var wrap_length = parseInt($('#wrap_length').val()); + var font_size = parseInt($('#font_size').val()); - $('.sequenceBackground').attr('width', max_caller_width + max_sequence_width + (HORIZONTAL_PADDING * 3)); + var max_sequence_width = wrap_length * (font_size * 0.6); + + var total_width = max_caller_width + max_sequence_width; + if (mode === 'structure') { + total_width += max_gc_name_width + HORIZONTAL_PADDING; + } + + $('.sequenceBackground').attr('width', total_width + (HORIZONTAL_PADDING * 3)); bbox = svg.getBBox(); svg.setAttribute('width', bbox.width); @@ -381,3 +477,103 @@ function scrollTableRight() { behavior: 'smooth' }); } + +async function loadPSGCData(psgc_name) { + try { + var response = await $.ajax({ + type: 'GET', + cache: false, + url: '/data/get_psgc_data/' + psgc_name + }); + + if (typeof response === 'string') { + response = JSON.parse(response); + } + + if (response.status === 0 && response.data) { + return response.data; + } else { + console.error('Error response:', response.message); + return null; + } + } catch (error) { + console.error('Error in loadPSGCData:', error); + } +} + +async function loadGCTypeInPSGCData(psgc_name) { + try { + const response = await $.ajax({ + type: 'GET', + cache: false, + url: '/data/get_psgc_type_data/' + psgc_name + }); + + if (response.status === 0 && response.data) { + return response.data; + } else { + console.error('Error response from type data:', response.message); + return null; + } + } catch (error) { + console.error('Error in loadPSGCTypeData:', error); + return null; + } +} + +function get_gene_functions_table_html_for_structure(psgc_data, selected_gc_id) { + for (const psgc_id in psgc_data) { + var genes = psgc_data[psgc_id].filter(gene => + gene.gene_cluster_id === selected_gc_id + ); + + if (genes && genes.length > 0) { + let functions_table_html = ''; + + let gc_type = 'Unknown'; + if (gc_type_in_psgc) { + for (let psgc_id in gc_type_in_psgc) { + if (gc_type_in_psgc[psgc_id][selected_gc_id]) { + gc_type = gc_type_in_psgc[psgc_id][selected_gc_id].charAt(0).toUpperCase() + gc_type_in_psgc[psgc_id][selected_gc_id].slice(1); + break; + } + } + } + + functions_table_html += '

Gene Cluster Information

'; + functions_table_html += ''; + functions_table_html += ''; + functions_table_html += ''; + functions_table_html += ''; + functions_table_html += '
Gene Cluster ID' + selected_gc_id + '
Gene Cluster Type' + gc_type + '
'; + + functions_table_html += '

Genes in this cluster

'; + functions_table_html += '
'; + functions_table_html += ''; + functions_table_html += '' + '' + '' + '' + ''; + functions_table_html += ''; + + genes.forEach(gene => { + functions_table_html += '' + + '' + + '' + + '' + + ''; + }); + + functions_table_html += '
Gene Caller IDGenome NameAlignment Summary
' + gene.gene_callers_id + '' + gene.genome_name + '
' + + (gene.alignment_summary || 'No alignment data') + '
'; + functions_table_html += '
'; + + return functions_table_html; + } + } + + return '

No gene information available

'; +} + +function filterGenome() { + const input = $('#filter-genome').val(); + filter_genome = input.split(',').map(item => item.trim()).filter(item => item); + createDisplay(false); +} \ No newline at end of file diff --git a/anvio/data/interactive/js/main.js b/anvio/data/interactive/js/main.js index 18ba5ba56c..98d11fb2b4 100644 --- a/anvio/data/interactive/js/main.js +++ b/anvio/data/interactive/js/main.js @@ -89,8 +89,12 @@ var request_prefix = getParameterByName('request_prefix'); $(window).resize(function() { // get current client size - VIEWER_WIDTH = document.getElementById('svg').clientWidth || document.getElementById('svg').width.baseVal.value; - VIEWER_HEIGHT = document.getElementById('svg').clientHeight || document.getElementById('svg').height.baseVal.value; + var svgElement = document.getElementById('svg'); + + if (svgElement) { + VIEWER_WIDTH = svgElement.clientWidth || (svgElement.width && svgElement.width.baseVal.value) || 0; + VIEWER_HEIGHT = svgElement.clientHeight || (svgElement.height && svgElement.height.baseVal.value) || 0; + } }); $(document).ready(function() { @@ -1330,13 +1334,13 @@ function buildLayersTable(order, settings) var norm = (mode == 'full') ? 'log' : 'none'; } - var template = '
{short-name}n/an/a' + - ' ' + ' ' + ' ' + ' ' + @@ -1986,12 +1990,72 @@ function showCompleteness(bin_id, updateOnly) { } +function getGCInPSGCInformation(gene_cluster_name) { + return $.ajax({ + type: 'GET', + cache: false, + url: '/data/get_psgc_type_data/' + gene_cluster_name + }).then(function(psgc_response) { + if (psgc_response && psgc_response.data) { + // Check if the response is empty + const hasValidData = Object.keys(psgc_response.data).some(key => { + return Object.keys(psgc_response.data[key]).length > 0; + }); + if (hasValidData) { + mode = 'structure'; + return psgc_response.data; + } + } + }); +} + + +function formatGenericData(data) { + if(mode === 'structure'){ + let formattedString = ` +
+
+ + + + + + + + + `; + + for (const [psgcId, value] of Object.entries(data)) { + if (typeof value === 'object' && value !== null) { + for (const [geneClusterId, type] of Object.entries(value)) { + formattedString += ` + + + + + `; + } + } + } + + formattedString += ` + +
PSGC IDGene ClusterType
${psgcId}${geneClusterId}${type}
+
+
`; + + return formattedString; + } +} + function showGeneClusterDetails(bin_id, updateOnly) { if (typeof updateOnly === 'undefined') updateOnly = false; var title = 'Gene clusters in "' + $('#bin_name_' + bin_id).val() + '"'; + var temp_mode = mode; + if (updateOnly && !checkObjectExists('#modal' + title.hashCode())) return; @@ -2010,17 +2074,19 @@ function showGeneClusterDetails(bin_id, updateOnly) { return; } - let content = ` - - - - - - - - - - `; + let content = ` +
+
+
Gene clusterSourceAccessionFunction
+ + + + + + + + + `; // building the table for each gene cluster Object.keys(response['functions']).map(function(gene_cluster_name) { @@ -2030,33 +2096,59 @@ function showGeneClusterDetails(bin_id, updateOnly) { Object.keys(response['sources']).map(function(index) { let function_source = response['sources'][index]; - accession_string = getPrettyFunctionsString(d[function_source]['accession'], function_source) - function_string = getPrettyFunctionsString(d[function_source]['function']) - + accession_string = getPrettyFunctionsString(d[function_source]['accession'], function_source); + function_string = getPrettyFunctionsString(d[function_source]['function']); if (index == 0) { content += ` - - - - + + + + `; } else { content += ` - - - + + + `; } }); }); - content += `
Gene clusterSourceAccessionFunction
${ gene_cluster_name }${ function_source }${ accession_string }${ function_string }${gene_cluster_name}${function_source} + ${accession_string} + + ${function_string} +
${ function_source }${ accession_string }${ function_string }${function_source} + ${accession_string} + + ${function_string} +
` + content += `
+ + `; + + // Fetch additional PSGC data and append it directly to content + let additionalDataPromises = Object.keys(response['functions']).map(gene_cluster_name => { + return getGCInPSGCInformation(gene_cluster_name).then(result => { + if(mode === 'structure'){ + if (!content.includes("Gene Clusters Occur in Protein Structure Informed Gene Clusters")) { + content += `
+ Gene Clusters Occur in Protein Structure Informed Gene Clusters +
`; + } + content += formatGenericData(result); + } + }); + }); + + mode = temp_mode; - showGeneClusterFunctionsSummaryTableDialog('A summary of functions for ' + bin_info['items'].length + ' gene clusters in "' + bin_info['bin_name'] + '".', content + '
'); + Promise.all(additionalDataPromises).then(() => { + showGeneClusterFunctionsSummaryTableDialog('A summary of functions for ' + bin_info['items'].length + ' gene clusters in "' + bin_info['bin_name'] + '".', content); + }); } }); - } @@ -2146,7 +2238,7 @@ async function exportSvg(dontDownload) { 'color': $('#bin_color_' + bin_id).attr('color'), }; - if (mode == 'pan') { + if (mode == 'pan' || mode === 'structure') { var geneClustersElement = $(bin).find('.num-gene-clusters'); if (geneClustersElement.length > 0) { _bin_info['gene_clusters'] = geneClustersElement.attr('data-value'); diff --git a/anvio/data/interactive/js/mouse-events.js b/anvio/data/interactive/js/mouse-events.js index 8a3e1b114f..d3f9e9365c 100644 --- a/anvio/data/interactive/js/mouse-events.js +++ b/anvio/data/interactive/js/mouse-events.js @@ -347,6 +347,17 @@ function write_mouse_table(content, item_name, layer_name, layer_id, stackbar_la $('#cell_layer_name').html(''); } + const jsonRegex = /{[^}]+}/g; + content = content.replace(jsonRegex, (match) => { + try { + const jsonString = match.replace(/'/g, '"'); + const parsedContent = JSON.parse(jsonString); + return Object.entries(parsedContent).map(([key, value]) => `${key}: ${value}`).join('
'); + } catch (e) { + return match; + } + }); + $('#tooltip_content').html(content); if ($('#tooltip_content').height() + 300 > $(window).height()) { diff --git a/anvio/data/interactive/js/search.js b/anvio/data/interactive/js/search.js index 7be23de08f..c9361171f3 100644 --- a/anvio/data/interactive/js/search.js +++ b/anvio/data/interactive/js/search.js @@ -226,7 +226,23 @@ function showSearchResult() { var col1 = search_results[i]['split']; var col2 = search_results[i]['value']; - rows = rows + `
${col1}${ col2 }
${col1}${ col2 }
- - - - - - - - {% for entry in basics_pretty|lookup:"pan" %} - - - - - {% endfor %} - -
KeyValue
{{ entry.0 }}{{ entry.1 }}
-
-
- + {% if meta|lookup:"pan"|lookup:"db_variant" == "structure-informed" %} + Please note that this is a protein structure-informed pangenome, where the gene clusters are formed based on strucutral homology between individual genes, rather than their smimlarities + at the level of amino acid sequences. For more on this, please see the documentation on anvi-pan-genome + {% else %} + Please note that this is a conventional pangenome where gene clusters are formed based on amino acid seqeunce homology. It is also possible to compute a structure pangenome with anvi'o where gene clusters + are formed based on homology information gathered from predicted protein strcutures. For more on this, please see the documentation on anvi-pan-genome. {% endif %} + -
-
-
- Genomes storage -
-
- - - - - - - - - {% for entry in basics_pretty|lookup:"genomes" %} - - - - - {% endfor %} - -
KeyValue
{{ entry.0 }}{{ entry.1 }}
-
-
+ {% if not meta|lookup:"pan"|lookup:"blank" %} +
+
+

Pan database

+
+
+ + + + + + + + + {% for entry in basics_pretty|lookup:"pan" %} + + + + + {% endfor %} + +
KeyValue
{{ entry.0 }}{{ entry.1 }}
+
+
+ {% endif %} + +
+
+

Genomes storage database

+
+
+ + + + + + + + + {% for entry in basics_pretty|lookup:"genomes" %} + + + + + {% endfor %} + +
KeyValue
{{ entry.0 }}{{ entry.1 }}
@@ -146,7 +153,7 @@
-
+

Summary files for gene clusters

@@ -171,7 +178,7 @@
-
+

Misc Data

@@ -207,7 +214,7 @@ - + @@ -219,6 +226,30 @@ $(document).ready(function() { $("body").tooltip({ selector: '[data-toggle=tooltip]' }); }); + + function toggleContent() { + const container = document.getElementById('gc-tracker-container'); + const button = document.getElementById('toggle-button'); + + if (container.classList.contains('expanded')) { + container.classList.remove('expanded'); + button.textContent = 'Show More'; + } else { + container.classList.add('expanded'); + button.textContent = 'Show Less'; + } + } + diff --git a/anvio/dbops.py b/anvio/dbops.py index ab0bad83bf..40f1e4fccc 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -1537,6 +1537,9 @@ def __init__(self, args, r=run, p=progress): self.views = {} self.collection_profile = {} + self.gc_tracker = {} + self.gc_psgc_associations = {} + # the following two are initialized via `init_items_additional_data()` and use information # stored in item additional data tables in the pan database self.items_additional_data_dict = None @@ -2242,6 +2245,52 @@ def init_gene_clusters_functions(self): self.progress.end() + def init_gc_tracker(self): + """Initializes the gc_tracker dictionary from the pan database. + + The structure of the dictionary is: + { + gene_caller_id: { + 'gene_cluster_id': str, + 'genome_name': str, + 'alignment_summary': str + } + } + """ + pan_db = PanDatabase(self.pan_db_path) + + gc_tracker_data = pan_db.db.get_table_as_dict('gc_tracker') + + gene_entries = {} + for entry in gc_tracker_data.values(): + gene_caller_id = entry['gene_caller_id'] + if gene_caller_id not in gene_entries: + gene_entries[gene_caller_id] = [] + gene_entries[gene_caller_id].append({'gene_cluster_id': entry['gene_cluster_id'],'genome_name': entry['genome_name'],'alignment_summary': entry['alignment_summary']}) + + self.gc_tracker = gene_entries + + pan_db.disconnect() + + + def init_gc_psgc_associations(self): + """Initializes the gc_psgc_associations dictionary from the pan database. + + The structure of the dictionary is: + { + gene_cluster_id: protein_structure_informed_gene_cluster_id + } + """ + pan_db = PanDatabase(self.pan_db_path) + + associations_data = pan_db.db.get_table_as_dict('gc_psgc_associations') + + for gc_id, entry in associations_data.items(): + self.gc_psgc_associations[gc_id] = entry['protein_structure_informed_gene_cluster_id'] + + pan_db.disconnect() + + def init_items_additional_data(self): """Recover additional data stored in the pan database.""" @@ -4206,8 +4255,9 @@ class PanDatabase: """To create an empty pan database, and/or access to one.""" def __init__(self, db_path, run=run, progress=progress, quiet=True): self.db = None - self.db_path = db_path self.db_type = 'pan' + self.db_variant = None + self.db_path = db_path self.run = run self.progress = progress @@ -4240,15 +4290,16 @@ def init(self): self.internal_genomes = [s.strip() for s in self.meta['internal_genome_names'].split(',')] self.external_genomes = [s.strip() for s in self.meta['external_genome_names'].split(',')] self.genomes = self.internal_genomes + self.external_genomes + self.db_variant = self.meta['db_variant'] # open the database self.db = db.DB(self.db_path, anvio.__pan__version__) - self.run.info('Pan database', 'An existing database, %s, has been initiated.' % self.db_path, quiet=self.quiet) - self.run.info('Genomes', '%d found' % len(self.genomes), quiet=self.quiet) + self.run.info(f'Pan database ({self.db_variant})', f'An existing database, {self.db_path}, has been initiated.', quiet=self.quiet) + self.run.info('Genomes', f'{len(self.genomes)} found', quiet=self.quiet) - def touch(self): + def touch(self, db_variant=constants.pangenome_mode_default): is_db_ok_to_create(self.db_path, self.db_type) self.db = db.DB(self.db_path, anvio.__pan__version__, new_database=True) @@ -4259,6 +4310,11 @@ def touch(self): self.db.create_table(t.pan_reaction_network_metabolites_table_name, t.pan_reaction_network_metabolites_table_structure, t.pan_reaction_network_metabolites_table_types) self.db.create_table(t.pan_reaction_network_kegg_table_name, t.pan_reaction_network_kegg_table_structure, t.pan_reaction_network_kegg_table_types) + # these are the ones only necessary when the db_variant is structure-informed + if db_variant == constants.PAN_STRUCTURE_MODE: + self.db.create_table(t.pan_gc_psgc_associations_table_name, t.pan_gc_psgc_associations_table_structure, t.pan_gc_psgc_associations_table_types) + self.db.create_table(t.pan_gc_tracker_table_name, t.pan_gc_tracker_table_structure, t.pan_gc_tracker_table_types) + # creating empty default tables for standard anvi'o pan dbs self.db.create_table(t.item_additional_data_table_name, t.item_additional_data_table_structure, t.item_additional_data_table_types) self.db.create_table(t.item_orders_table_name, t.item_orders_table_structure, t.item_orders_table_types) @@ -4275,8 +4331,8 @@ def touch(self): return self.db - def create(self, meta_values={}): - self.touch() + def create(self, meta_values={}, db_variant=constants.pangenome_mode_default): + self.touch(db_variant=db_variant) for key in meta_values: self.db.set_meta_value(key, meta_values[key]) @@ -4285,10 +4341,11 @@ def create(self, meta_values={}): # know thyself self.db.set_meta_value('db_type', 'pan') + self.db.set_meta_value('db_variant', db_variant) self.disconnect() - self.run.info('Pan database', 'A new database, %s, has been created.' % (self.db_path), quiet=self.quiet) + self.run.info(f'Pan database ({db_variant})', 'A new database, %s, has been created.' % (self.db_path), quiet=self.quiet) def disconnect(self): diff --git a/anvio/docs/artifacts/prostt5-model-data.md b/anvio/docs/artifacts/prostt5-model-data.md new file mode 100644 index 0000000000..fa33cd65fb --- /dev/null +++ b/anvio/docs/artifacts/prostt5-model-data.md @@ -0,0 +1,5 @@ +This artifact stores the data downloaded by %(anvi-setup-prostt5)s and is essential for running %(anvi-pan-genome)s. It includes the ProstT5 model weights, which are required by Foldseek to efficiently perform searches for protein structural similarities. + +As detailed in the Foldseek documentation, this data consists of the pre-trained ProstT5 model that accelerates protein structure search tasks. The ProstT5 model is crucial for ensuring accurate results when using Foldseek in your anvi'o workflows. + +By default, the ProstT5 model weights are stored in anvio/data/misc/PROSTT5/weights, but users can specify a custom path during setup by using the --prostt5-weight-dir parameter in %(anvi-setup-foldseek)s. \ No newline at end of file diff --git a/anvio/docs/programs/anvi-setup-prostt5.md b/anvio/docs/programs/anvi-setup-prostt5.md new file mode 100644 index 0000000000..6e7cb6b7cd --- /dev/null +++ b/anvio/docs/programs/anvi-setup-prostt5.md @@ -0,0 +1,25 @@ + +This program, like other anvi-setup commands, prepares your environment by downloading and configuring the ProstT5 model required for %(anvi-pan-genome --pan-mode structure-informed)s. ProstT5 is essential for Foldseek to perform efficient and accurate searches for protein structural similarities. You only need to run this setup once. + +By executing this command, the necessary ProstT5 model weights will be downloaded and stored in the %(foldseek-model-data)s artifact, ensuring that Foldseek can function optimally in your anvi'o workflows. + +Setting up the ProstT5 model is simple: + +{{ codestart }} +anvi-setup-prostt5 +{{ codestop }} + +When running this program, you can provide a path to store your ProstT5 model in. The default path is `anvio/data/misc/PROSTT5/weights`; if you use a custom path, you will have to provide it to %(anvi-pan-genome)s with the same parameter. Here is an example run: + + +{{ codestart }} +anvi-setup-prostt5 --prostt5-weight-dir path/to/directory +{{ codestop }} + +If you want to overwrite any data that you have already downloaded (for example if you suspect something went wrong in the download), add the `--reset` flag: + +{{ codestart }} +anvi-setup-prostt5 --prostt5-weight-dir path/to/directory \ + --reset +{{ codestop }} + diff --git a/anvio/drivers/foldseek.py b/anvio/drivers/foldseek.py new file mode 100644 index 0000000000..9b2f5cc2b7 --- /dev/null +++ b/anvio/drivers/foldseek.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# coding: utf-8 +""" Foldseek Driver """ + +import os +import shutil +import argparse +import tempfile + +import anvio +import anvio.fastalib as f +import anvio.utils as utils +import anvio.terminal as terminal +import anvio.filesnpaths as filesnpaths +import anvio.constants as constants + +from collections import defaultdict +from anvio.drivers.mcl import MCL +from anvio.errors import ConfigError, FilesNPathsError +from anvio.filesnpaths import AppendableFile + + +__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "Metehan Sever" +__email__ = "metehaansever@gmail.com" + + +run = terminal.Run() +progress = terminal.Progress() +pp = terminal.pretty_print + +class Foldseek(): + + def __init__(self, query_fasta=None, run=run, progress=progress, num_threads=1, weight_dir=None, overwrite_output_destinations=False): + self.run = run + self.progress = progress + + utils.is_program_exists('foldseek') + + self.query_fasta = query_fasta + self.num_threads = num_threads + self.weight_dir = weight_dir or constants.default_prostt5_weight_path + self.overwrite_output_destinations = overwrite_output_destinations + self.tmp_dir = tempfile.gettempdir() + + try: + filesnpaths.is_file_exists(self.weight_dir) + except FilesNPathsError: + run.warning("Anvi'o requires to have ProstT5 to run --pan-mode structure." + " You can easily download that with the command down below.", + header="⚠️ YOUR ATTENTION PLEASE ⚠️", overwrite_verbose=True, lc='yellow') + run.info_single("anvi-setup-prostt5", level=0, overwrite_verbose=True) + raise ConfigError("It seems like you forgot to download the ProstT5 model." + " Please run 'anvi-setup-prostt5' and try again.") + + self.output_file = None + + if not self.run.log_file_path: + self.run.log_file_path = filesnpaths.get_temp_file_path() + + self.names_dict = None + + def create_db(self): + self.run.warning(None, header="FOLDSEEK CREATEDB", lc="green") + self.progress.new('FOLDSEEK') + self.progress.update('creating the search database (using %d thread(s)) ...' % self.num_threads) + + expected_output_dir = os.path.join(self.output_file, "db") + expected_output_file = os.path.join(expected_output_dir, "search_db") + + filesnpaths.gen_output_directory(expected_output_dir, delete_if_exists=False) + + cmd_line = ['foldseek', + 'createdb', + self.query_fasta, + expected_output_file, + '--prostt5-model', self.weight_dir, + '--threads', self.num_threads + ] + + try: + utils.run_command(cmd_line, self.run.log_file_path) + except FilesNPathsError: + run.warning("Opss! CREATEDB not working. Probably you are giving wrong file path :/") + + self.progress.end() + self.run.info('Command line', ' '.join([str(x) for x in cmd_line]), quiet=True) + self.run.info('Foldseek search DB', expected_output_file) + + def search(self, query_db, target_db): + self.run.warning(None, header="FOLDSEEK EASY SEARCH", lc="green") + self.progress.new('FOLDSEEK') + self.progress.update('Running search using Foldseek ...') + + query_db = os.path.join(query_db, 'db', 'search_db') + target_db = os.path.join(target_db, 'db', 'search_db') + + result_file_dir = os.path.join(self.output_file, 'result') + + cmd_line = [ + 'foldseek', + 'easy-search', + query_db, + target_db, + result_file_dir, + self.tmp_dir, + '--threads', self.num_threads + ] + + utils.run_command(cmd_line, self.run.log_file_path) + + self.progress.end() + + self.run.info('Command line', ' '.join([str(x) for x in cmd_line]), quiet=True) + self.run.info('Foldseek search Result', result_file_dir) + + def process(self, output_file): + + self.output_file = output_file + + self.create_db() + self.search(output_file, output_file) + + def get_foldseek_results(self): + """ Return result.m8 file """ + force_makedb, force_search = False, False + + result_dir = os.path.join(self.output_file, 'result') + + return result_dir + + +class Prostt5SetupWeight: + """A class to download and setup the weights of PROSTT5""" + def __init__(self, args, run=run, progress=progress): + self.run = run + self.progress = progress + + utils.is_program_exists('foldseek') + + self.weight_dir = args.prostt5_data_dir + + if self.weight_dir and args.reset: + raise ConfigError("You are attempting to run ProstT5 setup on a non-default data directory (%s) using the --reset flag. " + "To avoid automatically deleting a directory that may be important to you, anvi'o refuses to reset " + "directories that have been specified with --weight-dir. If you really want to get rid of this " + "directory and regenerate it with InteracDome data inside, then please remove the directory yourself using " + "a command like `rm -r %s`. We are sorry to make you go through this extra trouble, but it really is " + "the safest way to handle things." % (self.weight_dir, self.weight_dir)) + + if not self.weight_dir: + self.weight_dir = constants.default_prostt5_weight_path + + self.run.warning('', header='Setting up ProstT5 Weights', lc='yellow') + self.run.info('Data directory', self.weight_dir) + self.run.info('Reset contents', args.reset) + + filesnpaths.gen_output_directory(self.weight_dir, delete_if_exists=args.reset) + + filesnpaths.is_output_dir_writable(os.path.dirname(os.path.abspath(self.weight_dir))) + + if not args.reset and not anvio.DEBUG: + self.is_weight_exists() + + if not self.run.log_file_path: + self.run.log_file_path = os.path.join(self.weight_dir, 'foldseek-setup-log-file.txt') + + + def is_weight_exists(self): + """Raise ConfigError if weight exists""" + + if os.path.exists(self.weight_dir) and os.listdir(self.weight_dir): + raise ConfigError("It seems you already have the ProstT5 Weights downloaded in '%s', please " + "use --reset flag if you want to re-download it." % self.weight_dir) + + def setup(self): + """ Sets up the ProstT5 Weights directory for Foldseek """ + + self.run.warning('', header='Downloading Weight Model', lc='yellow') + self.download_foldseek_weight() + + def download_foldseek_weight(self): + """Download the weights of ProstT5 models and clean up temporary files""" + + self.progress.new('FOLDSEEK') + self.progress.update('Downloading ...') + + cmd_line = [ + 'foldseek', + 'databases', + 'ProstT5', + self.weight_dir, + os.path.join(self.weight_dir, 'tmp') + ] + + result = utils.run_command(cmd_line, self.run.log_file_path) + + # Successful condition + if result == 0: + self.progress.end() + + self.run.info('Command line', ' '.join([str(x) for x in cmd_line]), quiet=True) + self.run.info('Log file', self.run.log_file_path) + + # Remove tmp folder after download model + tmp_dir = os.path.join(self.weight_dir, 'tmp') + if os.path.exists(tmp_dir): + try: + shutil.rmtree(tmp_dir) + self.run.info('Temporary folder', f"'{tmp_dir}' was successfully deleted.") + except Exception as e: + self.run.warning('Temporary folder cleanup failed', str(e)) + else: + self.run.warning('Download failed', 'Please check the log file for more details.') \ No newline at end of file diff --git a/anvio/interactive.py b/anvio/interactive.py index c8d06a7a7f..407cf894f0 100644 --- a/anvio/interactive.py +++ b/anvio/interactive.py @@ -1232,6 +1232,16 @@ def load_pan_mode(self): PanSuperclass.__init__(self, self.args) + # Check if this is a structure-informed pan database + if utils.get_db_variant(self.pan_db_path) == 'structure-informed': + self.progress.new("Initializing structure-informed pan data") + self.progress.update('...') + + self.init_gc_tracker() + self.init_gc_psgc_associations() + + self.progress.end() + self.init_gene_clusters() if not self.skip_init_functions: diff --git a/anvio/migrations/pan/v20_to_v21.py b/anvio/migrations/pan/v20_to_v21.py index 188621cbe6..a5a8259d53 100644 --- a/anvio/migrations/pan/v20_to_v21.py +++ b/anvio/migrations/pan/v20_to_v21.py @@ -78,7 +78,7 @@ def migrate(db_path): "you have run `anvi-reaction-network`, so we left them alone and didn't change " "anything in the database besides the version number." ) - message = f"Done! Your pan database is now version 21. This wasn't a biggie. {change_message}" + message = f"Done! Your pan database is now version {current_version}. This wasn't a biggie. {change_message}" run.info_single(message, nl_after=1, nl_before=1, mc='green') if __name__ == '__main__': diff --git a/anvio/migrations/pan/v21_to_v22.py b/anvio/migrations/pan/v21_to_v22.py new file mode 100644 index 0000000000..a2534f2d83 --- /dev/null +++ b/anvio/migrations/pan/v21_to_v22.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys +import argparse + +import anvio.dbinfo as dbinfo +import anvio.terminal as terminal + +from anvio.errors import ConfigError + +run = terminal.Run() +progress = terminal.Progress() + +current_version, next_version = [x[1:] for x in __name__.split('_to_')] + + +def migrate(db_path): + if db_path is None: + raise ConfigError("No database path is given.") + + pan_db_info = dbinfo.PanDBInfo(db_path) + if str(pan_db_info.version) != current_version: + raise ConfigError( + f"The version of the provided pan database is {pan_db_info.version}, not the required " + f"version, {current_version}, so this script cannot upgrade the database." + ) + + pan_db = pan_db_info.load_db() + + progress.new("Migrating") + progress.update("Updating the self table with one new variable") + + if 'db_variant' not in pan_db_info.get_self_table(): + pan_db.set_meta_value('db_variant', 'sequence-based') + + progress.update("Updating version") + pan_db.remove_meta_key_value_pair('version') + pan_db.set_version(next_version) + + progress.update("Committing changes") + pan_db.disconnect() + + progress.end() + + message = (f"Done! Your pan database is now version {current_version}. The purpose of this chahge was to make space " + f"for different pan-db variants. So far we only had sequence-based pangeomes. Now anvi'o can generate " + f"structure-informed pangenomes to better identify distant homologs, and with this change the code will be " + f"able to recognize which variant a given pan-db file is.") + run.info_single(message, nl_after=1, nl_before=1, mc='green') + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='A simple script to upgrade the pan database from version %s to version %s' % (current_version, next_version)) + parser.add_argument('pan_db', metavar = 'PAN_DB', help = "An anvi'o pan database of version %s" % current_version) + args, unknown = parser.parse_known_args() + + try: + migrate(args.pan_db) + except ConfigError as e: + print(e) + sys.exit(-1) diff --git a/anvio/panops.py b/anvio/panops.py index 3126dcb7bd..f24effb000 100644 --- a/anvio/panops.py +++ b/anvio/panops.py @@ -34,10 +34,11 @@ from anvio.drivers.diamond import Diamond from anvio.drivers.mcl import MCL from anvio.drivers import Aligners +from anvio.drivers.foldseek import Foldseek from anvio.errors import ConfigError, FilesNPathsError from anvio.genomestorage import GenomeStorage -from anvio.tables.geneclusters import TableForGeneClusters +from anvio.tables.geneclusters import TableForGeneClusters, TableForPSGCGCAssociations from anvio.tables.views import TablesForViews __copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" @@ -89,6 +90,11 @@ def __init__(self, args=None, run=run, progress=progress): self.enforce_hierarchical_clustering = A('enforce_hierarchical_clustering') self.enforce_the_analysis_of_excessive_number_of_genomes = anvio.USER_KNOWS_IT_IS_NOT_A_GOOD_IDEA + self.pan_mode = A('pan_mode') or constants.pangenome_mode_default + self.prostt5_data_dir = A('prostt5_data_dir') + self.foldseek_search_results_output_file = A('foldseek_search_results') + self.de_novo_gene_clusters_dict = None + self.additional_params_for_seq_search = A('additional_params_for_seq_search') self.additional_params_for_seq_search_processed = False @@ -108,9 +114,16 @@ def __init__(self, args=None, run=run, progress=progress): self.additional_view_data = {} self.aligner = None + # this is to fill a table that will only be relevant in structure mode. + self.gc_psgc_associations = [] + # we don't know what we are about self.description = None + # quick accession variables to mode + self.STRUCTURE_MODE = False if self.pan_mode == constants.pangenome_mode_default else True + self.SEQUENCE_MODE = not self.STRUCTURE_MODE + def load_genomes(self): # genome_name parameter can be a file or comma seperated genome names. @@ -158,7 +171,9 @@ def generate_pan_db(self): 'description': self.description if self.description else '_No description is provided_', } - dbops.PanDatabase(self.pan_db_path, quiet=False).create(meta_values) + filesnpaths.is_output_file_writable(self.pan_db_path, ok_if_exists=False) + + dbops.PanDatabase(self.pan_db_path, quiet=False).create(meta_values, db_variant=self.pan_mode) # know thyself. self.args.pan_db = self.pan_db_path @@ -216,6 +231,13 @@ def check_params(self): filesnpaths.is_output_file_writable(self.log_file_path) os.remove(self.log_file_path) if os.path.exists(self.log_file_path) else None + if self.pan_mode not in constants.pangenome_modes_available: + raise ConfigError(f"The pangenome mode '{self.pan_mode}' is not something anvi'o recognizes :/ Something is wrong here.") + + if self.STRUCTURE_MODE and self.user_defined_gene_clusters: + raise ConfigError("You are confusing anvi'o :/ You can't use `pan-mode --structure` and `gene_clusters_txt` at the same time." + "When `structure` is active, please skip setting `gene_clusters_txt`.") + if self.user_defined_gene_clusters: filesnpaths.is_gene_clusters_txt(self.user_defined_gene_clusters) @@ -245,7 +267,15 @@ def check_params(self): filesnpaths.is_file_plain_text(self.description_file_path) self.description = open(os.path.abspath(self.description_file_path), 'r').read() - self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db') + if self.STRUCTURE_MODE: + self.pan_db_path = self.get_output_file_path(self.project_name + '-STRUCTURE-PAN.db') + elif self.SEQUENCE_MODE or self.user_defined_gene_clusters: + self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db') + else: + raise ConfigError("Anvi'o is confused since it ended up somewhere that doesn't exist :(") + + if self.foldseek_search_results_output_file: + filesnpaths.is_file_tab_delimited(self.foldseek_search_results_output_file) def process_additional_params(self): @@ -309,7 +339,23 @@ def run_blast(self, unique_AA_sequences_fasta_path, unique_AA_sequences_names_di return blast.get_blast_results() - def run_search(self, unique_AA_sequences_fasta_path, unique_AA_sequences_names_dict): + def run_foldseek(self, fasta_path): + """ Running Foldseek """ + fs = Foldseek(query_fasta=fasta_path, + run=self.run, + progress=self.progress, + num_threads=self.num_threads, + weight_dir=self.prostt5_data_dir, + overwrite_output_destinations=self.overwrite_output_destinations) + + # FIXME this tmp is necessarry for easy-search instead of giving like that we can set default tempfile.gettempdir() in foldseek driver + + fs.process(self.output_dir) + + return fs.get_foldseek_results() + + + def run_search_de_novo(self, unique_AA_sequences_fasta_path, unique_AA_sequences_names_dict): if self.use_ncbi_blast: return self.run_blast(unique_AA_sequences_fasta_path, unique_AA_sequences_names_dict) else: @@ -541,6 +587,11 @@ def process_gene_clusters(self, gene_clusters_dict): item_additional_data_table = miscdata.TableForItemAdditionalData(self.args, r=terminal.Run(verbose=False)) item_additional_data_keys = ['num_genomes_gene_cluster_has_hits', 'num_genes_in_gene_cluster', 'max_num_paralogs', 'SCG'] + + # If we're in structure mode, add information about PSGCs + if self.STRUCTURE_MODE: + self.add_psgc_layers(gene_clusters_dict, item_additional_data_keys) + item_additional_data_table.add(self.additional_view_data, item_additional_data_keys, skip_check_names=True) # ^^^^^^^^^^^^^^^^^^^^^ # / @@ -555,6 +606,123 @@ def process_gene_clusters(self, gene_clusters_dict): return gene_clusters_dict + def add_psgc_layers(self, gene_clusters_dict, item_additional_data_keys): + """Add PSGC-related layers to the additional view data. + + Args: + gene_clusters_dict: Dictionary containing gene cluster information + item_additional_data_keys: List to store the names of additional data keys + """ + psgc_gc_counts = self.count_gene_clusters_per_psgc() + psgc_gc_types = self.classify_gene_types(gene_clusters_dict) + + self.add_layers_to_view(psgc_gc_counts) + item_additional_data_keys.extend([ + 'number_gc_in_psgc', + 'psgc_composition!core', + 'psgc_composition!singleton', + 'psgc_composition!accessory', + 'gc_types' + ]) + + + def count_gene_clusters_per_psgc(self): + """Count the number of gene clusters in each PSGC.""" + psgc_gc_counts = {} + for gc_name, psgc_name in self.gc_psgc_associations: + if psgc_name: + psgc_gc_counts[psgc_name] = psgc_gc_counts.get(psgc_name, 0) + 1 + return psgc_gc_counts + + + def count_genes_per_psgc(self, gene_clusters_dict): + """Count the total number of genes in each PSGC.""" + psgc_gene_counts = {} + + for gc_name, genes in gene_clusters_dict.items(): + psgc_gene_counts[gc_name] = len(genes) + + return psgc_gene_counts + + + def classify_gene_types(self, gene_clusters_dict): + """Classify original gene clusters as core, singleton, or other.""" + + self.run.warning(None, header="GENE TYPE CLASSIFICATION", lc="green") + + de_novo_gc_types = {} + psgc_count = 0 + psgcs_with_genes = 0 + + for gc_name in self.de_novo_gene_clusters_dict: + genomes_with_hits = set() + for gene_entry in self.de_novo_gene_clusters_dict[gc_name]: + genomes_with_hits.add(gene_entry['genome_name']) + + if len(genomes_with_hits) == len(self.genomes): + de_novo_gc_types[gc_name] = 'core' + elif len(genomes_with_hits) == 1: + de_novo_gc_types[gc_name] = 'singleton' + else: + de_novo_gc_types[gc_name] = 'accessory' + + gc_type_counts = {'core': 0, 'singleton': 0, 'accessory': 0} + for gc_type in de_novo_gc_types.values(): + gc_type_counts[gc_type] += 1 + + for psgc_name in gene_clusters_dict: + self.additional_view_data[psgc_name].update({'psgc_composition!core': 0,'psgc_composition!singleton': 0,'psgc_composition!accessory': 0,'gc_types': '{}'}) + + for psgc_name in gene_clusters_dict: + psgc_count += 1 + + gc_types_dict = {} + de_novo_gcs = [] + for gc, psgc in self.gc_psgc_associations: + if psgc == psgc_name and gc in de_novo_gc_types: + de_novo_gcs.append(gc) + gc_types_dict[gc] = de_novo_gc_types[gc] + + core_genes = 0 + singleton_genes = 0 + accessory_genes = 0 + + for gc in de_novo_gcs: + gc_type = de_novo_gc_types[gc] + gene_count = len(self.de_novo_gene_clusters_dict[gc]) + + if gc_type == 'core': + core_genes += gene_count + elif gc_type == 'singleton': + singleton_genes += gene_count + else: + accessory_genes += gene_count + + # necessarry for gc_types search functionality, If they are 0 we shouldn't see them in the search result. + # If you not happy with the code you can update gc_types_dict["something-not-include-core|accessory|singleton"] :) + if core_genes > 0: + gc_types_dict['total_core_genes'] = core_genes + if singleton_genes > 0: + gc_types_dict['total_singleton_genes'] = singleton_genes + if accessory_genes > 0: + gc_types_dict['total_accessory_genes'] = accessory_genes + + self.additional_view_data[psgc_name]['gc_types'] = json.dumps(gc_types_dict) + + if core_genes > 0 or singleton_genes > 0 or accessory_genes > 0: + psgcs_with_genes += 1 + self.additional_view_data[psgc_name].update({'psgc_composition!core': core_genes,'psgc_composition!singleton': singleton_genes,'psgc_composition!accessory': accessory_genes}) + + self.run.info('PSGCs classified', f"{psgcs_with_genes} of {psgc_count}") + self.run.info('Gene cluster types', f"Core: {gc_type_counts['core']}, "f"Singleton: {gc_type_counts['singleton']}, "f"Accessory: {gc_type_counts['accessory']}") + + + def add_layers_to_view(self, psgc_gc_counts): + """Add all PSGC layers to the view data.""" + for gene_cluster in self.view_data: + self.additional_view_data[gene_cluster].update({ 'number_gc_in_psgc': psgc_gc_counts.get(gene_cluster, 0) }) + + def gen_synteny_based_ordering_of_gene_clusters(self, gene_clusters_dict): """Take the dictionary of gene_clusters, and order gene_clusters per genome based on synteny of genes. @@ -754,6 +922,23 @@ def sanity_check(self): self.run.info('Args', (str(self.args)), quiet=True) + def populate_gene_cluster_tracker_table(self, gene_clusters_dict): + self.progress.new('Storing de novo gene clusters for future references') + self.progress.update('...') + + table_for_gene_clusters = TableForGeneClusters(self.pan_db_path, gc_tracker_table=True, run=self.run, progress=self.progress) + + num_genes_in_gene_clusters = 0 + for gene_cluster_name in gene_clusters_dict: + for gene_entry in gene_clusters_dict[gene_cluster_name]: + table_for_gene_clusters.add(gene_entry) + num_genes_in_gene_clusters += 1 + + self.progress.end() + + table_for_gene_clusters.store() + + def store_gene_clusters(self, gene_clusters_dict): self.progress.new('Storing gene clusters in the database') self.progress.update('...') @@ -770,13 +955,93 @@ def store_gene_clusters(self, gene_clusters_dict): table_for_gene_clusters.store() + self.progress.new('Storing gc <-> psgc associations in the database') + self.progress.update('...') + + if self.STRUCTURE_MODE: + # if we are in structure mode, we also need to update the `TableForPSGCGCAssociations` + table_for_gc_psgc_associations = TableForPSGCGCAssociations(self.pan_db_path, run=self.run, progress=self.progress) + + for gc_name, psgc_name in self.gc_psgc_associations: + table_for_gc_psgc_associations.add({'gene_cluster_id': gc_name, 'protein_structure_informed_gene_cluster_id': psgc_name}) + + self.progress.end() + + table_for_gc_psgc_associations.store() + + pan_db = dbops.PanDatabase(self.pan_db_path, quiet=True) pan_db.db.set_meta_value('num_gene_clusters', len(gene_clusters_dict)) pan_db.db.set_meta_value('num_genes_in_gene_clusters', num_genes_in_gene_clusters) pan_db.disconnect() - def gen_gene_clusters_dict_from_mcl_clusters(self, mcl_clusters): + def gen_protein_structure_informed_gene_clusters_dict_from_mcl_clusters(self, mcl_clusters, gene_clusters_de_novo, name_prefix="PSGC"): + """Turn MCL clusters from structure search into a gene clusters dict + + The only difference between this function and `gen_de_novo_gene_clusters_dict_from_mcl_clusters` is that + the latter takes in de novo gene clusters, where hashes must be replaced with actual genome names. In + contrast, this function takes clusters of gene clusters, which must be turned into a dictionary where + gene clusters resolved back to individual genes they had originally. + """ + + # we will build the following dictionary here from the MCL results and de novo gene clusters + protein_structure_informed_gene_clusters_dict = {} + + counter = 0 + for gene_cluster_group in mcl_clusters.values(): + counter += 1 + psgc_name = f"{name_prefix}_{counter:08d}" + + protein_structure_informed_gene_clusters_dict[psgc_name] = [] + + # For each MCL gene cluster group, retrieve gene entries from gene_clusters_de_novo and add them to the new dictionary. + for gc_name in gene_cluster_group: + + # as we are now going through de novo gene clusters that are represented in protein structure-informed + # clusters, we will update our talbe to track psgc <-> gc associations table + self.gc_psgc_associations.append((gc_name, psgc_name)) + + # go through every gene entry in de novo clusters + for gene_entry in gene_clusters_de_novo[gc_name]: + protein_structure_informed_gene_clusters_dict[psgc_name].append({ + 'gene_caller_id': int(gene_entry['gene_caller_id']), + 'gene_cluster_id': psgc_name, + 'genome_name': gene_entry['genome_name'], + 'alignment_summary': gene_entry.get('alignment_summary', '') + }) + + # there is one last house-keeping business to do here. now we have our psgc <-> gc assocaitions + # table for downsteram steps. as we went through psgcs and matched gcs that were contained in + # them with the de novo gcs, we don't know if there are any de novo gcs that did not end up in + # any of the psgcs. this is an extremely unlikely scenario, but those are the kinds of things + # we are here for. so we will go t + known_gcs = set(list(gene_clusters_de_novo.keys())) + gcs_found_in_psgcs = set([tpl[0] for tpl in self.gc_psgc_associations]) + gcs_not_reprsented_in_psgcs = known_gcs.difference(gcs_found_in_psgcs) + if gcs_not_reprsented_in_psgcs: + self.run.warning(f"Anvi'o noticed something unexpected, and she wants you to know about it. It seems " + f"{pp(len(gcs_not_reprsented_in_psgcs))} of {pp(len(known_gcs))} gene clusters anvi'o " + f"computed based on their amino acid sequences somehow were lost when protein " + f"structure-informed gene clusters were computed from their representatives. We are " + f"not sure under what circumstances this can happen. If you are seeing this error " + f"message, you can help us implement more functionality in this class to debug it " + f"properly. Here is the list of de novo gene clusters that were lost along the way: " + f"{', '.join(gcs_not_reprsented_in_psgcs)}") + + for gc_name in gcs_not_reprsented_in_psgcs: + self.gc_psgc_associations.append((gc_name, None)) + + self.run.warning(f"Hi! Anvi'o just finished calculating your protein structure-informed gene clusters dictionary " + f"that consolidated {pp(len(known_gcs))} gene clusters (that were generated based on the good ol' " + f"amino acid sequence homology considerations) into {pp(len(protein_structure_informed_gene_clusters_dict))} " + f"protein structure-informed gene clusters. We keep our fingers crossed that something will not " + f"explode after this.", header="🎉 GCs TO PSGCs IS WORKING 🎉", lc='green') + + return protein_structure_informed_gene_clusters_dict + + + def gen_de_novo_gene_clusters_dict_from_mcl_clusters(self, mcl_clusters): self.progress.new('Generating the gene clusters dictionary from raw MCL clusters') self.progress.update('...') @@ -931,9 +1196,86 @@ def alignment_worker(input_queue, output_queue, gene_clusters_dict, genomes_stor output_queue.put(output) - def get_gene_clusters_de_novo(self): + def get_gene_cluster_representative_sequences(self, gene_clusters_dict): + """A very simple way to select representatives per GC based on minimum number of gaps and max length""" + + representative_sequences = {} + + # here we will do three things. first, get the aa sequence for each gene in de novo gene clusters, + # and then reover restored alignments, and build a new dictionary that will containa single + # representative sequence for each gene cluster + for gc_name in gene_clusters_dict: + sequence_info = [] + for gc_info in gene_clusters_dict[gc_name]: + # recover key data + genome_name = gc_info['genome_name'] + gene_caller_id = gc_info['gene_caller_id'] + alignment_summary = gc_info['alignment_summary'] + + # recover restored alignment for sequence + sequence = utils.restore_alignment(self.genomes_storage.gene_info[genome_name][gene_caller_id]['aa_sequence'], alignment_summary) + sequence_info.append((sequence.count('-'), len(sequence), sequence)) + + # sort based on minimum number of gaps and maximum length + representative_sequence = sorted(sequence_info, key=lambda x:(x[0], -x[1]))[0][2] + + # store the representative sequence + representative_sequences[gc_name] = {'sequence': representative_sequence} + + return representative_sequences + + + def get_structure_informed_gene_clusters(self): """Function to compute gene clusters de novo""" + # get de novo gene clusters first to reduce search space for structure + self.de_novo_gene_clusters_dict = self.get_sequence_based_gene_clusters() # Store as class variable + + # store gene clusters dict into the db. this one is the de novo gene cluster dictionary, + # not the structure one yet. but this dict will give access to the details of how de novo gene + # clusters distributed across genomes in conjunction with the gc_psgc_associations table + self.populate_gene_cluster_tracker_table(self.de_novo_gene_clusters_dict) + + # next, we will align non-singleton gene clusters to make sure we have all the information + # we need to be able to pick an appropriate representative for each gene cluster. + skip_alignments = self.skip_alignments + self.skip_alignments = False + self.de_novo_gene_clusters_dict, unsuccessful_alignments = self.compute_alignments_for_gene_clusters(self.de_novo_gene_clusters_dict) + self.skip_alignments = skip_alignments + + # now the `de_novo_gene_clusters_dict` contains entries with alignment summaries. time to get gene cluster + # representative sequences + gene_cluster_representatives = self.get_gene_cluster_representative_sequences(self.de_novo_gene_clusters_dict) + + # next, we will generate a FASTA file for gene cluster representatives, which will be analyzed + # with foldseek + gene_cluster_representatives_FASTA_path = self.get_output_file_path('gene-cluster-representatives-aa.fa') + with open(gene_cluster_representatives_FASTA_path, 'w') as output: + for gc_name in gene_cluster_representatives: + output.write(f">{gc_name}\n{gene_cluster_representatives[gc_name]['sequence']}\n") + + # get foldseek search results (please NOTE that if there is a user-provided search output, we utilize that here) + if not self.foldseek_search_results_output_file: + self.foldseek_search_results_output_file = self.run_foldseek(gene_cluster_representatives_FASTA_path) + + # generate MCL input from filtered foldseek_result + mcl_input_file_path = self.gen_mcl_input(self.foldseek_search_results_output_file) + + # get clusters from MCL + mcl_clusters = self.run_mcl(mcl_input_file_path) + + # we have the raw gene clusters dict, but we need to re-format it for following steps + protein_structure_informed_gene_clusters_dict = self.gen_protein_structure_informed_gene_clusters_dict_from_mcl_clusters(mcl_clusters, self.de_novo_gene_clusters_dict) + + # invoke the garbage collector to clean up some mess + del mcl_clusters + + return protein_structure_informed_gene_clusters_dict + + + def get_sequence_based_gene_clusters(self): + """Function to compute gene clusters de novo based on amino acid sequences""" + # get all amino acid sequences: combined_aas_FASTA_path = self.get_output_file_path('combined-aas.fa') self.genomes_storage.gen_combined_aa_sequences_FASTA(combined_aas_FASTA_path, @@ -947,7 +1289,7 @@ def get_gene_clusters_de_novo(self): self.run.info('Unique AA sequences FASTA', unique_aas_FASTA_path) # run search - blastall_results = self.run_search(unique_aas_FASTA_path, unique_aas_names_dict) + blastall_results = self.run_search_de_novo(unique_aas_FASTA_path, unique_aas_names_dict) # generate MCL input from filtered blastall_results mcl_input_file_path = self.gen_mcl_input(blastall_results) @@ -956,7 +1298,7 @@ def get_gene_clusters_de_novo(self): mcl_clusters = self.run_mcl(mcl_input_file_path) # we have the raw gene clusters dict, but we need to re-format it for following steps - gene_clusters_dict = self.gen_gene_clusters_dict_from_mcl_clusters(mcl_clusters) + gene_clusters_dict = self.gen_de_novo_gene_clusters_dict_from_mcl_clusters(mcl_clusters) del mcl_clusters return gene_clusters_dict @@ -1017,8 +1359,10 @@ def process(self): # get them from the user themselves through gene-clusters-txt if self.user_defined_gene_clusters: gene_clusters_dict = self.get_gene_clusters_from_gene_clusters_txt() + elif self.STRUCTURE_MODE: + gene_clusters_dict = self.get_structure_informed_gene_clusters() else: - gene_clusters_dict = self.get_gene_clusters_de_novo() + gene_clusters_dict = self.get_sequence_based_gene_clusters() # compute alignments for genes within each gene_cluster (or don't) gene_clusters_dict, unsuccessful_alignments = self.compute_alignments_for_gene_clusters(gene_clusters_dict) diff --git a/anvio/summarizer.py b/anvio/summarizer.py index 9452f5e0a0..5ae92a3d08 100644 --- a/anvio/summarizer.py +++ b/anvio/summarizer.py @@ -100,6 +100,7 @@ def __init__(self): self.output_dir = filesnpaths.get_temp_directory_path() self.report_aa_seqs_for_gene_calls = False self.reformat_contig_names = False + self.init_pan_mode = False class SummarizerSuperClass(object): @@ -264,6 +265,36 @@ def __init__(self, args=None, lazy_init=False, r=run, p=progress): self.cog_functions_are_called = 'COG_FUNCTION' in self.gene_clusters_function_sources self.cog_categories_are_called = 'COG_CATEGORY' in self.gene_clusters_function_sources + self.STRUCTURE_MODE = False if self.args.init_pan_mode == constants.pangenome_mode_default else True + + self.summary = {} + self.summary['basics_pretty'] = {} + + + def add_structure_data_tables(self): + """Add additional data tables from the pan database and return the data.""" + from anvio.dbops import PanDatabase + pan_db = PanDatabase(self.pan_db_path) + + gc_psgc_associations_data = pan_db.db.get_table_as_dict('gc_psgc_associations') + gc_tracker_data = pan_db.db.get_table_as_dict('gc_tracker') + + # store associations + gc_psgc_associations = {} + for gene_cluster_id, data in gc_psgc_associations_data.items(): + protein_structure_id = data['protein_structure_informed_gene_cluster_id'] + gc_psgc_associations[gene_cluster_id] = protein_structure_id + + gc_tracker = [] + for entry in gc_tracker_data.values(): + gene_caller_id = entry['gene_caller_id'] + gene_cluster_id = entry['gene_cluster_id'] + genome_name = entry['genome_name'] + alignment_summary = entry['alignment_summary'] + + gc_tracker.append((gene_caller_id, gene_cluster_id, genome_name, alignment_summary)) + + return gc_psgc_associations, gc_tracker def get_occurrence_of_functions_in_pangenome(self, gene_clusters_functions_summary_dict): """ @@ -600,6 +631,10 @@ def process(self): def generate_gene_clusters_file(self, collection_dict, compress_output=True): """Generates the gene summary file""" + # Add data tables if STRUCTURE_MODE is True + if self.STRUCTURE_MODE: + gc_psgc_associations_data, gc_tracker_data = self.add_structure_data_tables() + # generate a dict of gene cluster ~ bin id relationships gene_cluster_name_to_bin_name= dict(list(zip(self.gene_clusters_in_pan_db_but_not_binned, [None] * len(self.gene_clusters_in_pan_db_but_not_binned)))) for bin_id in collection_dict: @@ -614,6 +649,10 @@ def generate_gene_clusters_file(self, collection_dict, compress_output=True): # standard headers header = ['unique_id', 'gene_cluster_id', 'bin_name', 'genome_name', 'gene_callers_id'] + if self.STRUCTURE_MODE: + header.append('gc_in_psgc') + header.append('gc_tracker') + # extend the header with items additional data keys for items_additional_data_key in self.items_additional_data_keys: header.append(items_additional_data_key) @@ -645,6 +684,29 @@ def generate_gene_clusters_file(self, collection_dict, compress_output=True): for gene_caller_id in self.gene_clusters[gene_cluster_name][genome_name]: entry = [unique_id, gene_cluster_name, gene_cluster_name_to_bin_name[gene_cluster_name], genome_name, gene_caller_id] + if self.STRUCTURE_MODE: + gc_ids = [] + for gc_id, psgc_id in gc_psgc_associations_data.items(): + if psgc_id == gene_cluster_name: + gc_ids.append(gc_id) + + associations_string = f"[{', '.join(gc_ids)}]" + entry.append(associations_string) + + matching_gc = None + for tracker_entry in gc_tracker_data: + if (int(tracker_entry[0]) == int(gene_caller_id) and + str(tracker_entry[2]) == str(genome_name)): + matching_gc = tracker_entry[1] + break + + if matching_gc: + entry.append(matching_gc) + else: + entry.append('') + else: + continue + # populate the entry with item aditional data for items_additional_data_key in self.items_additional_data_keys: if gene_cluster_name in self.items_additional_data_dict: diff --git a/anvio/tables/__init__.py b/anvio/tables/__init__.py index 6ef632fe12..935c14b629 100644 --- a/anvio/tables/__init__.py +++ b/anvio/tables/__init__.py @@ -16,7 +16,7 @@ contigs_db_version = "24" profile_db_version = "40" genes_db_version = "6" -pan_db_version = "21" +pan_db_version = "22" auxiliary_data_version = "2" structure_db_version = "2" genomes_storage_vesion = "7" @@ -46,6 +46,14 @@ pan_gene_clusters_table_structure = ['gene_caller_id', 'gene_cluster_id', 'genome_name', 'alignment_summary'] pan_gene_clusters_table_types = [ 'numeric' , 'str' , 'str' , 'str' ] +pan_gc_tracker_table_name = 'gc_tracker' # the purpose of this table is to keep track of the sequence-based GCs in structure mode +pan_gc_tracker_table_structure = ['gene_caller_id', 'gene_cluster_id', 'genome_name', 'alignment_summary'] +pan_gc_tracker_table_types = [ 'numeric' , 'str' , 'str' , 'str' ] + +pan_gc_psgc_associations_table_name = 'gc_psgc_associations' # which sequence-based GCs ended up in which PSGCs +pan_gc_psgc_associations_table_structure = ['gene_cluster_id', 'protein_structure_informed_gene_cluster_id'] +pan_gc_psgc_associations_table_types = [ 'str' , 'str' ] + pan_reaction_network_reactions_table_name = 'pan_reaction_network_reactions' pan_reaction_network_reactions_table_structure = ['modelseed_reaction_id', 'modelseed_reaction_name', 'ko_kegg_reaction_source', 'ko_ec_number_source', 'other_kegg_reaction_ids', 'other_ec_numbers', 'metabolite_modelseed_ids', 'stoichiometry', 'compartments', 'reversibility'] pan_reaction_network_reactions_table_types = [ 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'text' , 'bool' ] @@ -407,6 +415,8 @@ 'max_normalized_ratio_splits': False, 'relative_abundance_splits': False, pan_gene_clusters_table_name: True, + pan_gc_tracker_table_name: True, + pan_gc_psgc_associations_table_name: False, # the first entry is always the de novo gene cluster name, which should be unique 'gene_cluster_function_reactions': False, # renamed to 'pan_reaction_network_reactions' pan_reaction_network_reactions_table_name: False, 'gene_cluster_function_metabolites': False, # renamed to 'pan_reaction_network_metabolites' diff --git a/anvio/tables/geneclusters.py b/anvio/tables/geneclusters.py index 3b57bbca3b..ad525952d7 100644 --- a/anvio/tables/geneclusters.py +++ b/anvio/tables/geneclusters.py @@ -29,17 +29,26 @@ class TableForGeneClusters(Table): """A class to populate gene clusters table in a given pan db. - Here is an example: + Here is an example: >>> table = TableForGeneClusters(db_path) >>> for ...: >>> table.add({'gene_caller_id': gene_caller_id, 'gene_cluster_id': gene_cluster_id, 'genome_name': genome_name}) >>> table.store() + + Please note the very important `gc_tracker_table` parameter. When this paramter is set to True, + the `store` function will no longer use the default table name for gene cluster storage, but an alternative + table to keep track of sequence-based gene clusters in structure mode. """ - def __init__(self, db_path, run=run, progress=progress): + def __init__(self, db_path, gc_tracker_table=False, run=run, progress=progress): self.db_path = db_path + if gc_tracker_table: + self.table_name, self.table_structure = t.pan_gc_tracker_table_name, t.pan_gc_tracker_table_structure + else: + self.table_name, self.table_structure = t.pan_gene_clusters_table_name, t.pan_gene_clusters_table_structure + utils.is_pan_db(db_path) self.run = run @@ -51,7 +60,7 @@ def __init__(self, db_path, run=run, progress=progress): def add(self, entry_dict): - self.entries.append([entry_dict[key] for key in t.pan_gene_clusters_table_structure]) + self.entries.append([entry_dict[key] for key in self.table_structure]) def store(self): @@ -60,7 +69,46 @@ def store(self): db_entries = [tuple(entry) for entry in self.entries] database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path)) - database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.pan_gene_clusters_table_name, db_entries) + database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % self.table_name, db_entries) + database.disconnect() + + + +class TableForPSGCGCAssociations(Table): + """A class to populate protein structure-informed gene cluster <-> de novo gene cluster associations in a pan db. + + Here is an example: + + >>> table = TableForPSGCGCAssociations(db_path) + >>> for ...: + >>> table.add({'gene_cluster_id': gene_cluster_id, 'protein_structure_informed_gene_cluster_id': psgc_id}) + >>> table.store() + """ + + def __init__(self, db_path, run=run, progress=progress): + self.db_path = db_path + + utils.is_pan_db(db_path) + + self.run = run + self.progress = progress + + Table.__init__(self, self.db_path, anvio.__pan__version__, run, progress) + + self.entries = [] + + + def add(self, entry_dict): + self.entries.append([entry_dict[key] for key in t.pan_gc_psgc_associations_table_structure]) + + + def store(self): + self.delete_contents_of_table(t.pan_gc_psgc_associations_table_name, warning=False) + + db_entries = [tuple(entry) for entry in self.entries] + + database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path)) + database._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.pan_gc_psgc_associations_table_name, db_entries) database.disconnect() diff --git a/anvio/tests/run_component_tests_for_structure_informed_pangenomics.sh b/anvio/tests/run_component_tests_for_structure_informed_pangenomics.sh new file mode 100755 index 0000000000..784e642423 --- /dev/null +++ b/anvio/tests/run_component_tests_for_structure_informed_pangenomics.sh @@ -0,0 +1,98 @@ +#!/bin/bash +source 00.sh + +# Setup ############################# +SETUP_WITH_OUTPUT_DIR $1 $2 $3 +##################################### + +INFO "Setting up the structure-informed pangenome analysis directory" +mkdir -p $output_dir/ +cp $files/mock_data_for_pangenomics/*.db $output_dir/ +cp $files/mock_data_for_pangenomics/external-genomes.txt $output_dir/ +cp $files/mock_data_for_pangenomics/example-gene-clusters-collection.txt $output_dir/ +cp $files/mock_data_for_pangenomics/scg-gene-clusters-for-phylogenomics.txt $output_dir/ +cp $files/mock_data_for_pangenomics/default-state.json $output_dir/ +cp $files/example_description.md $output_dir/ +cp $files/mock_data_for_pangenomics/group-information.txt $output_dir/ +cd $output_dir/ + +INFO "Migrating all databases" +anvi-migrate *db --migrate-quickly + +INFO "Generating an anvi'o genomes storage" +anvi-gen-genomes-storage -e external-genomes.txt \ + -o TEST-GENOMES.db \ + --no-progress + +INFO "Running the structure-informed pangenome analysis with default parameters" +anvi-pan-genome -g TEST-GENOMES.db \ + -o TEST/ \ + -n TEST \ + --pan-mode structure-informed \ + --description example_description.md \ + --no-progress \ + $thread_controller + +INFO "Importing group information as misc data for layers" +anvi-import-misc-data -p TEST/TEST-STRUCTURE-PAN.db \ + -t layers \ + group-information.txt \ + --no-progress + +INFO "Estimating enriched functions per pan group" +anvi-compute-functional-enrichment-in-pan -p TEST/TEST-STRUCTURE-PAN.db \ + -g TEST-GENOMES.db \ + --category group \ + --annotation-source COG20_FUNCTION \ + -o functions-enrichment.txt \ + -F functional-occurence.txt \ + --no-progress +SHOW_FILE functions-enrichment.txt +SHOW_FILE functional-occurence.txt + +INFO "Importing collections of gene clusters" +anvi-import-collection -p TEST/TEST-STRUCTURE-PAN.db \ + -C test_collection example-protein-structure-informed-gene-clusters-collection.txt \ + --no-progress + +INFO "Summarizing the pan, using the test collection (in quick mode)" +anvi-summarize -p TEST/TEST-STRUCTURE-PAN.db \ + -g TEST-GENOMES.db \ + -C test_collection \ + -o TEST_SUMMARY_QUICK \ + --quick \ + --no-progress + +INFO "Summarizing the pan, using the test collection" +anvi-summarize -p TEST/TEST-STRUCTURE-PAN.db \ + -g TEST-GENOMES.db \ + -C test_collection \ + -o TEST_SUMMARY \ + --no-progress + +INFO "Splitting bins in the pan genome into smaller, self-contained pan databases" +anvi-split -p TEST/TEST-STRUCTURE-PAN.db \ + -g TEST-GENOMES.db \ + -C test_collection \ + -o TEST_SPLIT_PAN + +INFO "Resulting split pans" +ls -l TEST_SPLIT_PAN/*/*db + +INFO "Taking a look at the make up one of the split pans" +anvi-db-info TEST_SPLIT_PAN/GENE_CLUSTER_BIN_1_CORE/PAN.db + +INFO "Listing collections available" +anvi-show-collections-and-bins -p TEST/TEST-STRUCTURE-PAN.db \ + --no-progress + +INFO "Importing the default state for pretty outputs" +anvi-import-state -p TEST/TEST-STRUCTURE-PAN.db -s default-state.json -n default +anvi-import-state -p TEST/ANOTHER_TEST-PAN.db -s default-state.json -n default + +INFO "Displaying the initial structure informed pangenome analysis results" +anvi-display-pan -p TEST/TEST-STRUCTURE-PAN.db \ + -g TEST-GENOMES.db \ + --title "A mock structure informed pangenome analysis" \ + --no-progress \ + $dry_run_controller diff --git a/anvio/tests/sandbox/mock_data_for_pangenomics/example-protein-structure-informed-gene-clusters-collection.txt b/anvio/tests/sandbox/mock_data_for_pangenomics/example-protein-structure-informed-gene-clusters-collection.txt new file mode 100644 index 0000000000..6a91c19d95 --- /dev/null +++ b/anvio/tests/sandbox/mock_data_for_pangenomics/example-protein-structure-informed-gene-clusters-collection.txt @@ -0,0 +1,348 @@ +PSGC_00000188 GENE_CLUSTER_BIN_1_CORE +PSGC_00000187 GENE_CLUSTER_BIN_1_CORE +PSGC_00000186 GENE_CLUSTER_BIN_1_CORE +PSGC_00000185 GENE_CLUSTER_BIN_1_CORE +PSGC_00000184 GENE_CLUSTER_BIN_1_CORE +PSGC_00000183 GENE_CLUSTER_BIN_1_CORE +PSGC_00000182 GENE_CLUSTER_BIN_1_CORE +PSGC_00000181 GENE_CLUSTER_BIN_1_CORE +PSGC_00000180 GENE_CLUSTER_BIN_1_CORE +PSGC_00000179 GENE_CLUSTER_BIN_1_CORE +PSGC_00000178 GENE_CLUSTER_BIN_1_CORE +PSGC_00000177 GENE_CLUSTER_BIN_1_CORE +PSGC_00000176 GENE_CLUSTER_BIN_1_CORE +PSGC_00000175 GENE_CLUSTER_BIN_1_CORE +PSGC_00000174 GENE_CLUSTER_BIN_1_CORE +PSGC_00000173 GENE_CLUSTER_BIN_1_CORE +PSGC_00000172 GENE_CLUSTER_BIN_1_CORE +PSGC_00000171 GENE_CLUSTER_BIN_1_CORE +PSGC_00000170 GENE_CLUSTER_BIN_1_CORE +PSGC_00000169 GENE_CLUSTER_BIN_1_CORE +PSGC_00000168 GENE_CLUSTER_BIN_1_CORE +PSGC_00000167 GENE_CLUSTER_BIN_1_CORE +PSGC_00000166 GENE_CLUSTER_BIN_1_CORE +PSGC_00000165 GENE_CLUSTER_BIN_1_CORE +PSGC_00000164 GENE_CLUSTER_BIN_1_CORE +PSGC_00000163 GENE_CLUSTER_BIN_1_CORE +PSGC_00000162 GENE_CLUSTER_BIN_1_CORE +PSGC_00000161 GENE_CLUSTER_BIN_1_CORE +PSGC_00000160 GENE_CLUSTER_BIN_1_CORE +PSGC_00000159 GENE_CLUSTER_BIN_1_CORE +PSGC_00000158 GENE_CLUSTER_BIN_1_CORE +PSGC_00000157 GENE_CLUSTER_BIN_1_CORE +PSGC_00000156 GENE_CLUSTER_BIN_1_CORE +PSGC_00000155 GENE_CLUSTER_BIN_1_CORE +PSGC_00000154 GENE_CLUSTER_BIN_1_CORE +PSGC_00000153 GENE_CLUSTER_BIN_1_CORE +PSGC_00000152 GENE_CLUSTER_BIN_1_CORE +PSGC_00000151 GENE_CLUSTER_BIN_1_CORE +PSGC_00000150 GENE_CLUSTER_BIN_1_CORE +PSGC_00000149 GENE_CLUSTER_BIN_1_CORE +PSGC_00000148 GENE_CLUSTER_BIN_1_CORE +PSGC_00000147 GENE_CLUSTER_BIN_1_CORE +PSGC_00000146 GENE_CLUSTER_BIN_1_CORE +PSGC_00000145 GENE_CLUSTER_BIN_1_CORE +PSGC_00000144 GENE_CLUSTER_BIN_1_CORE +PSGC_00000143 GENE_CLUSTER_BIN_1_CORE +PSGC_00000142 GENE_CLUSTER_BIN_1_CORE +PSGC_00000141 GENE_CLUSTER_BIN_1_CORE +PSGC_00000140 GENE_CLUSTER_BIN_1_CORE +PSGC_00000139 GENE_CLUSTER_BIN_1_CORE +PSGC_00000138 GENE_CLUSTER_BIN_1_CORE +PSGC_00000137 GENE_CLUSTER_BIN_1_CORE +PSGC_00000136 GENE_CLUSTER_BIN_1_CORE +PSGC_00000135 GENE_CLUSTER_BIN_1_CORE +PSGC_00000134 GENE_CLUSTER_BIN_1_CORE +PSGC_00000133 GENE_CLUSTER_BIN_1_CORE +PSGC_00000132 GENE_CLUSTER_BIN_1_CORE +PSGC_00000131 GENE_CLUSTER_BIN_1_CORE +PSGC_00000130 GENE_CLUSTER_BIN_1_CORE +PSGC_00000129 GENE_CLUSTER_BIN_1_CORE +PSGC_00000128 GENE_CLUSTER_BIN_1_CORE +PSGC_00000127 GENE_CLUSTER_BIN_1_CORE +PSGC_00000126 GENE_CLUSTER_BIN_1_CORE +PSGC_00000125 GENE_CLUSTER_BIN_1_CORE +PSGC_00000124 GENE_CLUSTER_BIN_1_CORE +PSGC_00000123 GENE_CLUSTER_BIN_1_CORE +PSGC_00000122 GENE_CLUSTER_BIN_1_CORE +PSGC_00000121 GENE_CLUSTER_BIN_1_CORE +PSGC_00000120 GENE_CLUSTER_BIN_1_CORE +PSGC_00000119 GENE_CLUSTER_BIN_1_CORE +PSGC_00000118 GENE_CLUSTER_BIN_1_CORE +PSGC_00000117 GENE_CLUSTER_BIN_1_CORE +PSGC_00000116 GENE_CLUSTER_BIN_1_CORE +PSGC_00000115 GENE_CLUSTER_BIN_1_CORE +PSGC_00000114 GENE_CLUSTER_BIN_1_CORE +PSGC_00000113 GENE_CLUSTER_BIN_1_CORE +PSGC_00000112 GENE_CLUSTER_BIN_1_CORE +PSGC_00000111 GENE_CLUSTER_BIN_1_CORE +PSGC_00000110 GENE_CLUSTER_BIN_1_CORE +PSGC_00000109 GENE_CLUSTER_BIN_1_CORE +PSGC_00000108 GENE_CLUSTER_BIN_1_CORE +PSGC_00000107 GENE_CLUSTER_BIN_1_CORE +PSGC_00000106 GENE_CLUSTER_BIN_1_CORE +PSGC_00000105 GENE_CLUSTER_BIN_1_CORE +PSGC_00000104 GENE_CLUSTER_BIN_1_CORE +PSGC_00000103 GENE_CLUSTER_BIN_1_CORE +PSGC_00000102 GENE_CLUSTER_BIN_1_CORE +PSGC_00000101 GENE_CLUSTER_BIN_1_CORE +PSGC_00000100 GENE_CLUSTER_BIN_1_CORE +PSGC_00000099 GENE_CLUSTER_BIN_1_CORE +PSGC_00000098 GENE_CLUSTER_BIN_1_CORE +PSGC_00000097 GENE_CLUSTER_BIN_1_CORE +PSGC_00000096 GENE_CLUSTER_BIN_1_CORE +PSGC_00000094 GENE_CLUSTER_BIN_1_CORE +PSGC_00000093 GENE_CLUSTER_BIN_1_CORE +PSGC_00000092 GENE_CLUSTER_BIN_1_CORE +PSGC_00000091 GENE_CLUSTER_BIN_1_CORE +PSGC_00000090 GENE_CLUSTER_BIN_1_CORE +PSGC_00000089 GENE_CLUSTER_BIN_1_CORE +PSGC_00000088 GENE_CLUSTER_BIN_1_CORE +PSGC_00000087 GENE_CLUSTER_BIN_1_CORE +PSGC_00000086 GENE_CLUSTER_BIN_1_CORE +PSGC_00000085 GENE_CLUSTER_BIN_1_CORE +PSGC_00000084 GENE_CLUSTER_BIN_1_CORE +PSGC_00000083 GENE_CLUSTER_BIN_1_CORE +PSGC_00000082 GENE_CLUSTER_BIN_1_CORE +PSGC_00000081 GENE_CLUSTER_BIN_1_CORE +PSGC_00000080 GENE_CLUSTER_BIN_1_CORE +PSGC_00000079 GENE_CLUSTER_BIN_1_CORE +PSGC_00000078 GENE_CLUSTER_BIN_1_CORE +PSGC_00000077 GENE_CLUSTER_BIN_1_CORE +PSGC_00000076 GENE_CLUSTER_BIN_1_CORE +PSGC_00000075 GENE_CLUSTER_BIN_1_CORE +PSGC_00000074 GENE_CLUSTER_BIN_1_CORE +PSGC_00000073 GENE_CLUSTER_BIN_1_CORE +PSGC_00000072 GENE_CLUSTER_BIN_1_CORE +PSGC_00000071 GENE_CLUSTER_BIN_1_CORE +PSGC_00000070 GENE_CLUSTER_BIN_1_CORE +PSGC_00000069 GENE_CLUSTER_BIN_1_CORE +PSGC_00000068 GENE_CLUSTER_BIN_1_CORE +PSGC_00000067 GENE_CLUSTER_BIN_1_CORE +PSGC_00000066 GENE_CLUSTER_BIN_1_CORE +PSGC_00000065 GENE_CLUSTER_BIN_1_CORE +PSGC_00000064 GENE_CLUSTER_BIN_1_CORE +PSGC_00000063 GENE_CLUSTER_BIN_1_CORE +PSGC_00000062 GENE_CLUSTER_BIN_1_CORE +PSGC_00000061 GENE_CLUSTER_BIN_1_CORE +PSGC_00000060 GENE_CLUSTER_BIN_1_CORE +PSGC_00000059 GENE_CLUSTER_BIN_1_CORE +PSGC_00000058 GENE_CLUSTER_BIN_1_CORE +PSGC_00000057 GENE_CLUSTER_BIN_1_CORE +PSGC_00000056 GENE_CLUSTER_BIN_1_CORE +PSGC_00000055 GENE_CLUSTER_BIN_1_CORE +PSGC_00000054 GENE_CLUSTER_BIN_1_CORE +PSGC_00000053 GENE_CLUSTER_BIN_1_CORE +PSGC_00000052 GENE_CLUSTER_BIN_1_CORE +PSGC_00000051 GENE_CLUSTER_BIN_1_CORE +PSGC_00000050 GENE_CLUSTER_BIN_1_CORE +PSGC_00000049 GENE_CLUSTER_BIN_1_CORE +PSGC_00000048 GENE_CLUSTER_BIN_1_CORE +PSGC_00000047 GENE_CLUSTER_BIN_1_CORE +PSGC_00000046 GENE_CLUSTER_BIN_1_CORE +PSGC_00000045 GENE_CLUSTER_BIN_1_CORE +PSGC_00000044 GENE_CLUSTER_BIN_1_CORE +PSGC_00000043 GENE_CLUSTER_BIN_1_CORE +PSGC_00000042 GENE_CLUSTER_BIN_1_CORE +PSGC_00000041 GENE_CLUSTER_BIN_1_CORE +PSGC_00000040 GENE_CLUSTER_BIN_1_CORE +PSGC_00000039 GENE_CLUSTER_BIN_1_CORE +PSGC_00000038 GENE_CLUSTER_BIN_1_CORE +PSGC_00000037 GENE_CLUSTER_BIN_1_CORE +PSGC_00000036 GENE_CLUSTER_BIN_1_CORE +PSGC_00000035 GENE_CLUSTER_BIN_1_CORE +PSGC_00000034 GENE_CLUSTER_BIN_1_CORE +PSGC_00000033 GENE_CLUSTER_BIN_1_CORE +PSGC_00000032 GENE_CLUSTER_BIN_1_CORE +PSGC_00000031 GENE_CLUSTER_BIN_1_CORE +PSGC_00000030 GENE_CLUSTER_BIN_1_CORE +PSGC_00000029 GENE_CLUSTER_BIN_1_CORE +PSGC_00000028 GENE_CLUSTER_BIN_1_CORE +PSGC_00000027 GENE_CLUSTER_BIN_1_CORE +PSGC_00000026 GENE_CLUSTER_BIN_1_CORE +PSGC_00000025 GENE_CLUSTER_BIN_1_CORE +PSGC_00000024 GENE_CLUSTER_BIN_1_CORE +PSGC_00000023 GENE_CLUSTER_BIN_1_CORE +PSGC_00000022 GENE_CLUSTER_BIN_1_CORE +PSGC_00000021 GENE_CLUSTER_BIN_1_CORE +PSGC_00000020 GENE_CLUSTER_BIN_1_CORE +PSGC_00000019 GENE_CLUSTER_BIN_1_CORE +PSGC_00000018 GENE_CLUSTER_BIN_1_CORE +PSGC_00000017 GENE_CLUSTER_BIN_1_CORE +PSGC_00000016 GENE_CLUSTER_BIN_1_CORE +PSGC_00000015 GENE_CLUSTER_BIN_1_CORE +PSGC_00000014 GENE_CLUSTER_BIN_1_CORE +PSGC_00000013 GENE_CLUSTER_BIN_1_CORE +PSGC_00000012 GENE_CLUSTER_BIN_1_CORE +PSGC_00000011 GENE_CLUSTER_BIN_1_CORE +PSGC_00000010 GENE_CLUSTER_BIN_1_CORE +PSGC_00000009 GENE_CLUSTER_BIN_1_CORE +PSGC_00000008 GENE_CLUSTER_BIN_1_CORE +PSGC_00000007 GENE_CLUSTER_BIN_1_CORE +PSGC_00000006 GENE_CLUSTER_BIN_1_CORE +PSGC_00000005 GENE_CLUSTER_BIN_1_CORE +PSGC_00000004 GENE_CLUSTER_BIN_1_CORE +PSGC_00000003 GENE_CLUSTER_BIN_1_CORE +PSGC_00000002 GENE_CLUSTER_BIN_1_CORE +PSGC_00000236 GENE_CLUSTER_BIN_2 +PSGC_00000235 GENE_CLUSTER_BIN_2 +PSGC_00000234 GENE_CLUSTER_BIN_2 +PSGC_00000233 GENE_CLUSTER_BIN_2 +PSGC_00000231 GENE_CLUSTER_BIN_2 +PSGC_00000230 GENE_CLUSTER_BIN_2 +PSGC_00000228 GENE_CLUSTER_BIN_2 +PSGC_00000227 GENE_CLUSTER_BIN_2 +PSGC_00000226 GENE_CLUSTER_BIN_2 +PSGC_00000225 GENE_CLUSTER_BIN_2 +PSGC_00000224 GENE_CLUSTER_BIN_2 +PSGC_00000223 GENE_CLUSTER_BIN_2 +PSGC_00000222 GENE_CLUSTER_BIN_2 +PSGC_00000221 GENE_CLUSTER_BIN_2 +PSGC_00000220 GENE_CLUSTER_BIN_2 +PSGC_00000219 GENE_CLUSTER_BIN_2 +PSGC_00000218 GENE_CLUSTER_BIN_2 +PSGC_00000217 GENE_CLUSTER_BIN_2 +PSGC_00000216 GENE_CLUSTER_BIN_2 +PSGC_00000215 GENE_CLUSTER_BIN_2 +PSGC_00000214 GENE_CLUSTER_BIN_2 +PSGC_00000213 GENE_CLUSTER_BIN_2 +PSGC_00000212 GENE_CLUSTER_BIN_2 +PSGC_00000211 GENE_CLUSTER_BIN_2 +PSGC_00000210 GENE_CLUSTER_BIN_2 +PSGC_00000209 GENE_CLUSTER_BIN_2 +PSGC_00000208 GENE_CLUSTER_BIN_2 +PSGC_00000207 GENE_CLUSTER_BIN_2 +PSGC_00000206 GENE_CLUSTER_BIN_2 +PSGC_00000205 GENE_CLUSTER_BIN_2 +PSGC_00000204 GENE_CLUSTER_BIN_2 +PSGC_00000203 GENE_CLUSTER_BIN_2 +PSGC_00000202 GENE_CLUSTER_BIN_2 +PSGC_00000201 GENE_CLUSTER_BIN_2 +PSGC_00000200 GENE_CLUSTER_BIN_2 +PSGC_00000199 GENE_CLUSTER_BIN_2 +PSGC_00000369 GENE_CLUSTER_BIN_3 +PSGC_00000367 GENE_CLUSTER_BIN_3 +PSGC_00000365 GENE_CLUSTER_BIN_3 +PSGC_00000362 GENE_CLUSTER_BIN_3 +PSGC_00000361 GENE_CLUSTER_BIN_3 +PSGC_00000360 GENE_CLUSTER_BIN_3 +PSGC_00000358 GENE_CLUSTER_BIN_3 +PSGC_00000355 GENE_CLUSTER_BIN_3 +PSGC_00000352 GENE_CLUSTER_BIN_3 +PSGC_00000351 GENE_CLUSTER_BIN_3 +PSGC_00000349 GENE_CLUSTER_BIN_3 +PSGC_00000348 GENE_CLUSTER_BIN_3 +PSGC_00000347 GENE_CLUSTER_BIN_3 +PSGC_00000346 GENE_CLUSTER_BIN_3 +PSGC_00000342 GENE_CLUSTER_BIN_3 +PSGC_00000337 GENE_CLUSTER_BIN_3 +PSGC_00000336 GENE_CLUSTER_BIN_3 +PSGC_00000335 GENE_CLUSTER_BIN_3 +PSGC_00000334 GENE_CLUSTER_BIN_3 +PSGC_00000332 GENE_CLUSTER_BIN_3 +PSGC_00000331 GENE_CLUSTER_BIN_3 +PSGC_00000328 GENE_CLUSTER_BIN_3 +PSGC_00000325 GENE_CLUSTER_BIN_3 +PSGC_00000323 GENE_CLUSTER_BIN_3 +PSGC_00000322 GENE_CLUSTER_BIN_3 +PSGC_00000320 GENE_CLUSTER_BIN_3 +PSGC_00000317 GENE_CLUSTER_BIN_3 +PSGC_00000315 GENE_CLUSTER_BIN_3 +PSGC_00000312 GENE_CLUSTER_BIN_3 +PSGC_00000309 GENE_CLUSTER_BIN_3 +PSGC_00000308 GENE_CLUSTER_BIN_3 +PSGC_00000307 GENE_CLUSTER_BIN_3 +PSGC_00000303 GENE_CLUSTER_BIN_3 +PSGC_00000302 GENE_CLUSTER_BIN_3 +PSGC_00000299 GENE_CLUSTER_BIN_3 +PSGC_00000296 GENE_CLUSTER_BIN_3 +PSGC_00000295 GENE_CLUSTER_BIN_3 +PSGC_00000291 GENE_CLUSTER_BIN_3 +PSGC_00000289 GENE_CLUSTER_BIN_3 +PSGC_00000288 GENE_CLUSTER_BIN_3 +PSGC_00000286 GENE_CLUSTER_BIN_3 +PSGC_00000285 GENE_CLUSTER_BIN_3 +PSGC_00000284 GENE_CLUSTER_BIN_3 +PSGC_00000281 GENE_CLUSTER_BIN_3 +PSGC_00000278 GENE_CLUSTER_BIN_3 +PSGC_00000277 GENE_CLUSTER_BIN_3 +PSGC_00000274 GENE_CLUSTER_BIN_3 +PSGC_00000273 GENE_CLUSTER_BIN_3 +PSGC_00000270 GENE_CLUSTER_BIN_3 +PSGC_00000268 GENE_CLUSTER_BIN_3 +PSGC_00000266 GENE_CLUSTER_BIN_3 +PSGC_00000263 GENE_CLUSTER_BIN_3 +PSGC_00000260 GENE_CLUSTER_BIN_3 +PSGC_00000258 GENE_CLUSTER_BIN_3 +PSGC_00000256 GENE_CLUSTER_BIN_3 +PSGC_00000255 GENE_CLUSTER_BIN_3 +PSGC_00000253 GENE_CLUSTER_BIN_3 +PSGC_00000252 GENE_CLUSTER_BIN_3 +PSGC_00000251 GENE_CLUSTER_BIN_3 +PSGC_00000249 GENE_CLUSTER_BIN_3 +PSGC_00000366 GENE_CLUSTER_BIN_3 +PSGC_00000364 GENE_CLUSTER_BIN_3 +PSGC_00000359 GENE_CLUSTER_BIN_3 +PSGC_00000357 GENE_CLUSTER_BIN_3 +PSGC_00000356 GENE_CLUSTER_BIN_3 +PSGC_00000353 GENE_CLUSTER_BIN_3 +PSGC_00000350 GENE_CLUSTER_BIN_3 +PSGC_00000345 GENE_CLUSTER_BIN_3 +PSGC_00000344 GENE_CLUSTER_BIN_3 +PSGC_00000340 GENE_CLUSTER_BIN_3 +PSGC_00000339 GENE_CLUSTER_BIN_3 +PSGC_00000338 GENE_CLUSTER_BIN_3 +PSGC_00000333 GENE_CLUSTER_BIN_3 +PSGC_00000330 GENE_CLUSTER_BIN_3 +PSGC_00000329 GENE_CLUSTER_BIN_3 +PSGC_00000327 GENE_CLUSTER_BIN_3 +PSGC_00000326 GENE_CLUSTER_BIN_3 +PSGC_00000324 GENE_CLUSTER_BIN_3 +PSGC_00000319 GENE_CLUSTER_BIN_3 +PSGC_00000318 GENE_CLUSTER_BIN_3 +PSGC_00000316 GENE_CLUSTER_BIN_3 +PSGC_00000314 GENE_CLUSTER_BIN_3 +PSGC_00000313 GENE_CLUSTER_BIN_3 +PSGC_00000310 GENE_CLUSTER_BIN_3 +PSGC_00000306 GENE_CLUSTER_BIN_3 +PSGC_00000305 GENE_CLUSTER_BIN_3 +PSGC_00000304 GENE_CLUSTER_BIN_3 +PSGC_00000301 GENE_CLUSTER_BIN_3 +PSGC_00000300 GENE_CLUSTER_BIN_3 +PSGC_00000298 GENE_CLUSTER_BIN_3 +PSGC_00000294 GENE_CLUSTER_BIN_3 +PSGC_00000290 GENE_CLUSTER_BIN_3 +PSGC_00000287 GENE_CLUSTER_BIN_3 +PSGC_00000283 GENE_CLUSTER_BIN_3 +PSGC_00000282 GENE_CLUSTER_BIN_3 +PSGC_00000279 GENE_CLUSTER_BIN_3 +PSGC_00000276 GENE_CLUSTER_BIN_3 +PSGC_00000275 GENE_CLUSTER_BIN_3 +PSGC_00000272 GENE_CLUSTER_BIN_3 +PSGC_00000271 GENE_CLUSTER_BIN_3 +PSGC_00000269 GENE_CLUSTER_BIN_3 +PSGC_00000265 GENE_CLUSTER_BIN_3 +PSGC_00000264 GENE_CLUSTER_BIN_3 +PSGC_00000261 GENE_CLUSTER_BIN_3 +PSGC_00000259 GENE_CLUSTER_BIN_3 +PSGC_00000257 GENE_CLUSTER_BIN_3 +PSGC_00000250 GENE_CLUSTER_BIN_3 +PSGC_00000248 GENE_CLUSTER_BIN_3 +PSGC_00000247 GENE_CLUSTER_BIN_3 +PSGC_00000246 GENE_CLUSTER_BIN_3 +PSGC_00000243 GENE_CLUSTER_BIN_3 +PSGC_00000368 GENE_CLUSTER_BIN_3 +PSGC_00000363 GENE_CLUSTER_BIN_3 +PSGC_00000341 GENE_CLUSTER_BIN_3 +PSGC_00000321 GENE_CLUSTER_BIN_3 +PSGC_00000311 GENE_CLUSTER_BIN_3 +PSGC_00000293 GENE_CLUSTER_BIN_3 +PSGC_00000280 GENE_CLUSTER_BIN_3 +PSGC_00000354 GENE_CLUSTER_BIN_3 +PSGC_00000297 GENE_CLUSTER_BIN_3 +PSGC_00000292 GENE_CLUSTER_BIN_3 +PSGC_00000343 GENE_CLUSTER_BIN_3 +PSGC_00000267 GENE_CLUSTER_BIN_3 +PSGC_00000262 GENE_CLUSTER_BIN_3 +PSGC_00000254 GENE_CLUSTER_BIN_3 +PSGC_00000241 GENE_CLUSTER_BIN_3 diff --git a/bin/anvi-pan-genome b/bin/anvi-pan-genome index 166b62aefc..f0417e1467 100755 --- a/bin/anvi-pan-genome +++ b/bin/anvi-pan-genome @@ -57,13 +57,18 @@ if __name__ == '__main__': from the gene caller used to identify genes during the generation of the contigs database).") - groupB = parser.add_argument_group('DEFAULT GENE CLUSTER FORMATION: SEQUENCE SEARCH', "The first step in this workflow is to perform reciprocal sequence search " + groupB = parser.add_argument_group('MODE: SEQUENCE-BASED or STRUCTURE-INFORMED', "Anvi'o can calculate a pangenome based one amino acid sequence homology " + "or based on predicted protein structures by utilizing the protein language model ProstT5 to go from a AA sequence to a 3di " + "to form gene clusters.") + groupB.add_argument(*anvio.A('pan-mode'), **anvio.K('pan-mode', {'required': False})) + + groupC = parser.add_argument_group('DEFAULT GENE CLUSTER FORMATION: SEQUENCE SEARCH', "The first step in this workflow is to perform reciprocal sequence search " "within the gene pool of input genomes. If you don't change anything in this section, things will work fine, and anvi'o will " "use DIAMOND by default to accomplish the search with default parameters anvi'o developers set for you. But please go " "through each parameter anyway to have an idea regarding what is available to you to play with.") - groupB.add_argument('--use-ncbi-blast', default = False, action = 'store_true', help = "This program uses DIAMOND by default (and you should, too), " + groupC.add_argument('--use-ncbi-blast', default = False, action = 'store_true', help = "This program uses DIAMOND by default (and you should, too), " "but if you'd like to use the good ol' blastp by the NCBI instead, it can't stop you.") - groupB.add_argument('--additional-params-for-seq-search', type=str, metavar = "CMD LINE PARAMS", help = "OK. This is very important. While " + groupC.add_argument('--additional-params-for-seq-search', type=str, metavar = "CMD LINE PARAMS", help = "OK. This is very important. While " f"anvi'o has some defaults for whichever approach you choose to use for your sequence search, you can assume full control " f"over what is passed to the search program. Put anything you wish anvi'o to send your search program in double quotes, and " f"they will be passed to the program. If you don't use this parameter, in addition to the additional parameters anvi'o will " @@ -74,11 +79,10 @@ if __name__ == '__main__': f"\"{panops.additional_param_sets_for_sequence_search['diamond']} --sensitive\". DO NOT EVER FORGET THE DOUBLE QUOTES, unless " f"you hate your computer and want to see it melting beforey our eyes.") - - groupB = parser.add_argument_group('DEFAULT GENE CLUSTER FORMATION: RESOLVING NETWORK', "The second step in this workflow is to use the sequence search results " + groupC = parser.add_argument_group('DEFAULT GENE CLUSTER FORMATION: NETWORK RESOLUTION', "The second step in this workflow is to use the sequence search results " "to determine gene clusters. Which is a straightforward step thanks to the MCL algorithm. The following parameters are there " "to tweak this step, even though default choices will work for the vast majority of cases.") - groupB.add_argument('--minbit', type = float, default = 0.5, metavar = "MINBIT", help = "The minimum minbit value. The minbit heuristic \ + groupC.add_argument('--minbit', type = float, default = 0.5, metavar = "MINBIT", help = "The minimum minbit value. The minbit heuristic \ provides a mean to set a to eliminate weak matches between two amino acid sequences. We learned it from ITEP \ (Benedict MN et al, doi:10.1186/1471-2164-15-8), which is a comprehensive analysis workflow for pangenomes, \ and decided to use it in the anvi'o pangenomic workflow, as well. Briefly, If you have two amino acid sequences,\ @@ -86,63 +90,65 @@ if __name__ == '__main__': between two sequences goes to 1 if they are very similar over the entire length of the 'shorter' amino acid sequence,\ and goes to 0 if (1) they match over a very short stretch compared even to the length of the shorter amino acid sequence\ or (2) the match between sequence identity is low. The default is %(default)g.") - groupB.add_argument('--mcl-inflation', type = float, default = 2.0, metavar = "INFLATION", help = "MCL inflation parameter, that defines\ + groupC.add_argument('--mcl-inflation', type = float, default = 2.0, metavar = "INFLATION", help = "MCL inflation parameter, that defines\ the sensitivity of the algorithm during the identification of the gene clusters. More information on this\ parameter and it's effect on cluster granularity is here: (http://micans.org/mcl/man/mclfaq.html#faq7.2).\ The default is %(default)g.") - groupB.add_argument('--min-percent-identity', type = float, default = 0.0, metavar = "PERCENT", help = "Minimum percent identity\ + groupC.add_argument('--min-percent-identity', type = float, default = 0.0, metavar = "PERCENT", help = "Minimum percent identity\ between the two amino acid sequences for them to have an edge for MCL analysis. This value will be used\ to filter hits from Diamond search results. Because percent identity is not a predictor of a good match (since\ it does not communicate many other important factors such as the alignment length between the two sequences and\ its proportion to the entire length of those involved), we suggest you rely on 'minbit' parameter. But you know\ what? Maybe you shouldn't listen to anyone, and experiment on your own! The default is %(default)g percent.") - groupB.add_argument('--min-occurrence', type = int, default = 1, metavar = 'NUM_OCCURRENCE', help = "Do you not want singletons? You don't?\ + groupC.add_argument('--min-occurrence', type = int, default = 1, metavar = 'NUM_OCCURRENCE', help = "Do you not want singletons? You don't?\ Well, this parameter will help you get rid of them (along with doubletons, if you want). Anvi'o will remove\ gene clusters that occur less than the number you set using this parameter from the analysis. The default\ is %(default)d, which means everything will be kept. If you want to remove singletons, set it to 2, if you want to\ remove doubletons as well, set it to 3, and so on.") - groupC = parser.add_argument_group('ALTERNATIVE GENE CLUSTER FORMATION: USER-PROVIDED GENE CLUSTERS', "As an alternative approach, you can provide " + groupD = parser.add_argument_group('ALTERNATIVE GENE CLUSTER FORMATION: USER-PROVIDED GENE CLUSTERS', "As an alternative approach, you can provide " "anvi'o with your own gene cluster affiliations. In this case, anvi'o would not use the default way of computing gene " "clusters de novo, but take your word for which genes go together.") - groupC.add_argument(*anvio.A('gene-clusters-txt'), **anvio.K('gene-clusters-txt')) + groupD.add_argument(*anvio.A('gene-clusters-txt'), **anvio.K('gene-clusters-txt')) - groupD = parser.add_argument_group("GENE CLUSTER CHARACTERIZATION", "Parameters that mostly impact how anvi'o characterizes your gene clusters once " + groupE = parser.add_argument_group("GENE CLUSTER CHARACTERIZATION", "Parameters that mostly impact how anvi'o characterizes your gene clusters once " "they are identified. Such as aligning within gene cluster sequences, computing homogeneity indices for them, etc. " "Important stuff.") - groupD.add_argument('--skip-alignments', default = False, action = 'store_true', help = "By default, anvi'o attempts to align amino acid\ + groupE.add_argument('--skip-alignments', default = False, action = 'store_true', help = "By default, anvi'o attempts to align amino acid\ sequences in each gene cluster using multiple sequence alignment via muscle. You can use this flag to skip\ that step and be upset later.") - groupD.add_argument(*anvio.A('align-with'), **anvio.K('align-with')) - groupD.add_argument('--skip-homogeneity', default=False, action='store_true', dest='skip_homogeneity',help="By default, anvi'o attempts to calculate homogeneity\ + groupE.add_argument(*anvio.A('align-with'), **anvio.K('align-with')) + groupE.add_argument('--skip-homogeneity', default=False, action='store_true', dest='skip_homogeneity',help="By default, anvi'o attempts to calculate homogeneity\ values for every gene cluster, given that they are aligned. You can use this flag to have anvi'o skip\ homogeneity calculations. Anvi'o will ignore this flag if you decide to skip alignments") - groupD.add_argument('--quick-homogeneity', default=False, action='store_true', dest='quick_homogeneity',help="By default, anvi'o will use a homogeneity\ + groupE.add_argument('--quick-homogeneity', default=False, action='store_true', dest='quick_homogeneity',help="By default, anvi'o will use a homogeneity\ algorithm that checks for horizontal and vertical geometric homogeneity (along with functional). With this\ flag, you can tell anvi'o to skip horizontal geometric homogeneity calculations. It will be less accurate but quicker.\ Anvi'o will ignore this flag if you skip homogeneity calculations or alignments all together.") - groupE = parser.add_argument_group("OTHERS", "Sweet parameters of convenience.") - groupE.add_argument(*anvio.A('project-name'), **anvio.K('project-name')) - groupE.add_argument(*anvio.A('description'), **anvio.K('description')) - groupE.add_argument(*anvio.A('output-dir'), **anvio.K('output-dir', {'metavar':'PAN_DB_DIR'})) - groupE.add_argument(*anvio.A('overwrite-output-destinations'), **anvio.K('overwrite-output-destinations')) - groupE.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) - - groupF = parser.add_argument_group("ORGANIZING GENE CLUSTERs", "These are stuff that will change the clustering dendrogram of your gene clusters.") - groupF.add_argument(*anvio.A('skip-hierarchical-clustering'), **anvio.K('skip-hierarchical-clustering', {'help': "Anvi'o attempts\ + groupF = parser.add_argument_group("OTHERS", "Sweet parameters of convenience.") + groupF.add_argument(*anvio.A('project-name'), **anvio.K('project-name')) + groupF.add_argument(*anvio.A('description'), **anvio.K('description')) + groupF.add_argument(*anvio.A('output-dir'), **anvio.K('output-dir', {'metavar':'PAN_DB_DIR'})) + groupF.add_argument(*anvio.A('overwrite-output-destinations'), **anvio.K('overwrite-output-destinations')) + groupF.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + groupF.add_argument(*anvio.A('prostt5-data-dir'), **anvio.K('prostt5-data-dir')) + groupF.add_argument(*anvio.A('foldseek-search-results'), **anvio.K('foldseek-search-results')) + + groupG = parser.add_argument_group("ORGANIZING GENE CLUSTERs", "These are stuff that will change the clustering dendrogram of your gene clusters.") + groupG.add_argument(*anvio.A('skip-hierarchical-clustering'), **anvio.K('skip-hierarchical-clustering', {'help': "Anvi'o attempts\ to generate a hierarchical clustering of your gene clusters once it identifies them so you can use\ `anvi-display-pan` to play with it. But if you want to skip this step, this is your flag."})) - groupF.add_argument(*anvio.A('enforce-hierarchical-clustering'), **anvio.K('enforce-hierarchical-clustering', {'help': "If you\ + groupG.add_argument(*anvio.A('enforce-hierarchical-clustering'), **anvio.K('enforce-hierarchical-clustering', {'help': "If you\ want anvi'o to try to generate a hierarchical clustering of your gene clusters even if the number of gene clusters exceeds\ its suggested limit for hierarchical clustering, you can use this flag to enforce it. Are you are a\ rebel of some sorts? Or did computers make you upset? Express your anger towards machine using this\ flag."})) - groupF.add_argument(*anvio.A('distance'), **anvio.K('distance', {'default': None, 'help': + groupG.add_argument(*anvio.A('distance'), **anvio.K('distance', {'default': None, 'help': 'The distance metric for the clustering of gene clusters. If you do not use this flag,\ the default distance metric will be used for each clustering configuration\ which is "%s".' % constants.distance_metric_default})) - groupF.add_argument(*anvio.A('linkage'), **anvio.K('linkage', {'default': None, 'help': + groupG.add_argument(*anvio.A('linkage'), **anvio.K('linkage', {'default': None, 'help': 'The same story with the `--distance`, except, the system default for this one\ is %s.' % constants.linkage_method_default})) @@ -153,6 +159,10 @@ if __name__ == '__main__': run.warning('If you publish results from this workflow, please do not forget to cite DIAMOND ' '(doi:10.1038/nmeth.3176), unless you use it with --use-ncbi-blast flag, and MCL ' '(http://micans.org/mcl/ and doi:10.1007/978-1-61779-361-5_15)', lc = 'yellow') + + if args.pan_mode == 'structure-informed': + run.warning('If you publish results from this workflow, please do not forget to cite Foldseek ' + '(https://rdcu.be/d3KMW) and doi:10.1038/s41587-023-01773-0', lc = 'yellow') try: pan = panops.Pangenome(args, run, progress) diff --git a/bin/anvi-self-test b/bin/anvi-self-test index 596da89fe1..8b3582a19c 100755 --- a/bin/anvi-self-test +++ b/bin/anvi-self-test @@ -46,6 +46,7 @@ tests = {'mini' : ['run_component_tests_for_minimal_metagenomic 'workflow-sra-download' : ['run_workflow_tests_for_sra_download.sh'], 'run-cazymes' : ['run_component_tests_for_cazymes.sh'], 'export-locus' : ['run_component_tests_for_export_locus.sh'], + 'structure-informed-pangenomics' : ['run_component_tests_for_structure_informed_pangenomics.sh'], } run = terminal.Run() diff --git a/bin/anvi-setup-prostt5 b/bin/anvi-setup-prostt5 new file mode 100755 index 0000000000..fa669a78e4 --- /dev/null +++ b/bin/anvi-setup-prostt5 @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys +import anvio + +from anvio.drivers.foldseek import Prostt5SetupWeight +from anvio.errors import ConfigError, FilesNPathsError + + +__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "Metehan Sever" +__email__ = "metehaansever@gmail.com" +__provides__ = ['prostt5-weights'] +__description__ = "Setup PROSTT5 weights" + +if __name__ == '__main__': + from anvio.argparse import ArgumentParser + + parser = ArgumentParser(description=__description__) + parser.add_argument(*anvio.A('prostt5-data-dir'), **anvio.K('prostt5-data-dir')) + parser.add_argument(*anvio.A('reset'), **anvio.K('reset')) + + args = parser.get_args(parser) + + try: + setup = Prostt5SetupWeight(args) + setup.setup() + + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-1) diff --git a/bin/anvi-summarize b/bin/anvi-summarize index 06cae813d7..2415f5b469 100755 --- a/bin/anvi-summarize +++ b/bin/anvi-summarize @@ -103,6 +103,7 @@ if __name__ == '__main__': groupC.add_argument('--report-aa-seqs-for-gene-calls', default=False, action='store_true', help="You can use this flag if\ you would like amino acid AND dna sequences for your gene calls in the genes output\ file. By default, only dna sequences are reported.") + groupC.add_argument(*anvio.A('init-pan-mode'), **anvio.K('init-pan-mode')) groupD.add_argument( *anvio.A('report-DNA-sequences'),