From 17968f93c414e19b3b0678e0b94b0fd3a00e9fa8 Mon Sep 17 00:00:00 2001 From: Chase Clark <18691127+chasemc@users.noreply.github.com> Date: Thu, 21 Dec 2023 12:04:15 -0600 Subject: [PATCH] Search (#79) * feat: adds sg class parent to assembly class; gbk outputs * fix: no mstart when scatter=T * fix: async +threading option * feat: bgc search Also many other updates/functionality --- Take an input BGC.md | 2 +- environment.yml | 2 +- new_search.py | 88 ------ new_search2.py | 88 ------ pyproject.toml | 1 + socialgene/base/compare_protein.py | 2 +- socialgene/base/molbio.py | 207 ++++++++++++-- socialgene/base/socialgene.py | 113 ++++---- socialgene/cli/search/gene_cluster.py | 255 ++++++++++++++++++ socialgene/clustermap/serialize.py | 61 +++-- .../hmm => compare_gene_clusters}/__init__.py | 0 .../compare_gene_clusters.py | 30 +++ socialgene/compare_proteins/base.py | 106 ++++++++ socialgene/compare_proteins/base_class.py | 42 --- socialgene/compare_proteins/diamond.py | 140 ++++++++++ socialgene/compare_proteins/hmm/hmmer.py | 62 ----- .../{hmm/scoring.py => hmm_scoring.py} | 16 +- socialgene/compare_proteins/hmmer.py | 68 +++++ socialgene/compare_proteins/mmseqs.py | 166 ++++++++++++ socialgene/compare_proteins/mmseqs_WIP.py | 98 ------- socialgene/hmm/hmmer.py | 2 +- socialgene/mmseqs/__init__.py | 0 socialgene/mmseqs/create_database.py | 63 ----- socialgene/mmseqs/index_database.py | 51 ---- socialgene/mmseqs/search.py | 106 -------- socialgene/mmseqs/subset_database.py | 55 ---- socialgene/parsers/fasta.py | 2 +- socialgene/parsers/genbank.py | 2 +- socialgene/parsers/hmmmodel.py | 3 +- socialgene/scoring/__init__.py | 0 socialgene/search/base.py | 188 +++++++++---- socialgene/search/hmmer.py | 157 +++++++++-- socialgene/utils/logging.py | 6 +- socialgene/utils/ncbi_ftp.py | 2 +- socialgene/utils/run_subprocess.py | 30 ++- .../python/classes/test_mibig_fasta_parser.py | 114 ++++---- tests/python/classes/test_mibig_gbk_parser.py | 11 +- tests/python/classes/test_socialgene.py | 2 +- ...est_export_protein_loci_assembly_tables.py | 5 +- .../compare_proteins/compare_proteins.py | 48 ++++ tests/python/compare_proteins/hmm/hmmer.py | 16 +- .../compare_proteins/hmm/test_scoring.py | 68 +++-- .../data/compare_proteins/BGC0001848.pickle | Bin 0 -> 48364 bytes .../data/compare_proteins/BGC0001850.pickle | Bin 0 -> 19983 bytes .../compare_proteins/test_CompareDomains.csv | 33 +++ .../compare_proteins/test_DiamondBlastp.csv | 27 ++ .../test_MMseqsEasySearch.csv | 25 ++ tests/python/parsers/test_fasta.py | 11 +- tests/python/test_autogen.py | 26 +- 49 files changed, 1607 insertions(+), 993 deletions(-) delete mode 100644 new_search.py delete mode 100644 new_search2.py create mode 100644 socialgene/cli/search/gene_cluster.py rename socialgene/{compare_proteins/hmm => compare_gene_clusters}/__init__.py (100%) create mode 100644 socialgene/compare_gene_clusters/compare_gene_clusters.py create mode 100644 socialgene/compare_proteins/base.py delete mode 100644 socialgene/compare_proteins/base_class.py create mode 100644 socialgene/compare_proteins/diamond.py delete mode 100644 socialgene/compare_proteins/hmm/hmmer.py rename socialgene/compare_proteins/{hmm/scoring.py => hmm_scoring.py} (91%) create mode 100644 socialgene/compare_proteins/hmmer.py create mode 100644 socialgene/compare_proteins/mmseqs.py delete mode 100644 socialgene/compare_proteins/mmseqs_WIP.py delete mode 100644 socialgene/mmseqs/__init__.py delete mode 100644 socialgene/mmseqs/create_database.py delete mode 100644 socialgene/mmseqs/index_database.py delete mode 100644 socialgene/mmseqs/search.py delete mode 100644 socialgene/mmseqs/subset_database.py delete mode 100644 socialgene/scoring/__init__.py create mode 100644 tests/python/compare_proteins/compare_proteins.py create mode 100644 tests/python/data/compare_proteins/BGC0001848.pickle create mode 100644 tests/python/data/compare_proteins/BGC0001850.pickle create mode 100644 tests/python/data/compare_proteins/test_CompareDomains.csv create mode 100644 tests/python/data/compare_proteins/test_DiamondBlastp.csv create mode 100644 tests/python/data/compare_proteins/test_MMseqsEasySearch.csv diff --git a/Take an input BGC.md b/Take an input BGC.md index a57c1ab4..ad7781bc 100644 --- a/Take an input BGC.md +++ b/Take an input BGC.md @@ -11,7 +11,7 @@ - max_outdegree (int): HMM model annotations with an outdegree higher than this will be dropped - scatter (bool, optional): Choose a random subset of proteins to search that are spread across the length of the input BGC. Defaults to False. - bypass (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None. - - bypass_eid (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. + - protein_id_bypass_list (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. 7. Search the database for all proteins that have the same HMM model annotations as the input BGC proteins - Output from database is a data frame with columns: ['assembly_uid', 'nucleotide_uid', 'target', 'n_start', 'n_end', 'query'] 8. The initial hits output is filtered based on the following criteria: diff --git a/environment.yml b/environment.yml index cd2fdaea..b2c42bbc 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ channels: - defaults dependencies: - - conda-forge::python==3.10 + - conda-forge::python==3.12 - conda-forge::pip>=23.1.2 - conda-forge::biopython>=1.79 - conda-forge::numpy diff --git a/new_search.py b/new_search.py deleted file mode 100644 index 82ed7502..00000000 --- a/new_search.py +++ /dev/null @@ -1,88 +0,0 @@ -# The code is performing a series of operations using the SocialGene package and the SearchDomains -# class from the socialgene.search.hmmer module. Here is a breakdown of what the code is doing: - -from socialgene.base.socialgene import SocialGene -from socialgene.clustermap.serialize import SerializeToClustermap -from socialgene.search.hmmer import SearchDomains - -input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001646.gbk" -# input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001848.gbk" -hmm_dir = None -# hmm_dir = "/home/chase/Downloads/meh/socialgene_per_run/hmm_cache" -hmm_dir = "/home/chase/Downloads/para/socialgene_per_run/hmm_cache" - -a = SocialGene() -a.parse(input_gbk_path) -len(a.proteins) - -search_object = SearchDomains( - gbk_path=input_gbk_path, - hmm_dir=hmm_dir, - use_neo4j_precalc=True, - assemblies_must_have_x_matches=0.4, - nucleotide_sequences_must_have_x_matches=0.4, - gene_clusters_must_have_x_matches=0.2, - break_bgc_on_gap_of=10000, - target_bgc_padding=15000, -) -self = search_object - -search_object.outdegree_table - - -search_object.prioritize_input_proteins( - max_domains_per_protein=None, - max_outdegree=None, - max_query_proteins=None, - scatter=False, - # loci=["MicB006_2899"] - # bypass_eid=["AXA20096.1"], -) - -search_object.outdegree_table - -# TODO frac is redundant with the outdegree table -search_object.search(only_culture_collection=False, frac=0.75) - - -search_object.filter() -search_object.label_clusters() - - -search_object._bgc_regions_to_sg_object(self._primary_bgc_regions()) - -_ = self.sg_object.annotate_proteins_with_neo4j( - protein_uids=None, annotate_all=True, progress=False -) - - -df = self._primary_bgc_regions() - - -for i, row in df.iterrows(): - self.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid( - row["nucleotide_uid"] - ).add_bgcs_by_start_end( - start=row["n_start"], - end=row["n_end"], - uid=row["cluster"], - ) - - -temp = self.input_assembly.loci[self.input_bgc_id] - -temp.add_bgcs_by_feature(features=temp.features) - - -self._create_links() -self._choose_group() - - -z = SerializeToClustermap( - sg_object=self.sg_object, - bgc_order=list(self.sg_object.assemblies.keys()), - link_df=self.link_df, - group_df=self.group_df, -) - -z.write("/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json") diff --git a/new_search2.py b/new_search2.py deleted file mode 100644 index da1a1ee9..00000000 --- a/new_search2.py +++ /dev/null @@ -1,88 +0,0 @@ -import time - -from socialgene.clustermap.serialize import SerializeToClustermap -from socialgene.search.hmmer import SearchDomains - -start_time = time.time() -# input_gbk_path = "/home/chase/Downloads/para/a/GCF_002362315.1_ASM236231v1_genomic.gbff.gz" -input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001646.gbk" -hmm_dir = None -# hmm_dir = "/home/chase/Downloads/meh/socialgene_per_run/hmm_cache" -hmm_dir = "/home/chase/Downloads/para/socialgene_per_run/hmm_cache" - - -search_object = SearchDomains( - gbk_path=input_gbk_path, - hmm_dir=hmm_dir, - use_neo4j_precalc=True, - assemblies_must_have_x_matches=0.6, - nucleotide_sequences_must_have_x_matches=0.6, - gene_clusters_must_have_x_matches=0.6, - break_bgc_on_gap_of=10000, - target_bgc_padding=15000, -) - - -# search_object.outdegree_table - - -search_object.prioritize_input_proteins( - max_domains_per_protein=2, - max_outdegree=100000, - max_query_proteins=5, - scatter=False, - # bypass_locus=["MicB006_2899"] - # bypass_pid=["AXA20096.1"], -) - -# search_object.outdegree_table - -# TODO frac is redundant with the outdegree table -search_object.search(only_culture_collection=False, frac=0.75) - - -search_object.filter() -search_object.label_clusters() - - -search_object._bgc_regions_to_sg_object(search_object._primary_bgc_regions()) - -_ = search_object.sg_object.annotate_proteins_with_neo4j( - protein_uids=None, annotate_all=True, progress=False -) - - -df = search_object._primary_bgc_regions() - - -for i, row in df.iterrows(): - search_object.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid( - row["nucleotide_uid"] - ).add_bgcs_by_start_end( - start=row["n_start"], - end=row["n_end"], - uid=row["cluster"], - ) - - -temp = search_object.input_assembly.loci[search_object.input_bgc_id] -temp.add_bgcs_by_feature(features=temp.features) - -search_object._create_links() -search_object._choose_group() - - -z = SerializeToClustermap( - sg_object=search_object.sg_object, - bgc_order=list(search_object.sg_object.assemblies.keys()), - link_df=search_object.link_df, - group_df=search_object.group_df, -) - -z.write("/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json") -print("--- %s seconds ---" % (time.time() - start_time)) - -# search_object.rich_table(search_object.user_friendly_hit_df()) - - -# self = search_object diff --git a/pyproject.toml b/pyproject.toml index aa4f907f..280bf8b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ sg_get_goterms = "socialgene.utils.goterms:main" # search sg_mm_create = "socialgene.mmseqs.create_database:main" sg_mm_search = "socialgene.mmseqs.search:main" +sg_search_gc = "socialgene.cli.search.gene_cluster:main" # Modify database sgdb_import_classyfire = "socialgene.dbmodifiers.classyfire.import:main" diff --git a/socialgene/base/compare_protein.py b/socialgene/base/compare_protein.py index 3b981974..a9fc9f85 100644 --- a/socialgene/base/compare_protein.py +++ b/socialgene/base/compare_protein.py @@ -3,7 +3,7 @@ import pandas as pd -from socialgene.compare_proteins.hmm.scoring import mod_score +from socialgene.compare_proteins.hmm_scoring import mod_score from socialgene.neo4j.neo4j import Neo4jQuery from socialgene.utils.logging import log diff --git a/socialgene/base/molbio.py b/socialgene/base/molbio.py index 089d8db8..2fa97509 100644 --- a/socialgene/base/molbio.py +++ b/socialgene/base/molbio.py @@ -1,6 +1,11 @@ from collections import OrderedDict from uuid import uuid4 +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqFeature import FeatureLocation, SeqFeature +from Bio.SeqRecord import SeqRecord + import socialgene.hashing.hashing as hasher from socialgene.config import env_vars from socialgene.neo4j.neo4j import GraphDriver @@ -406,6 +411,14 @@ def __init__( self.external_id = external_id self.domains = domains if domains is not None else set() + def __eq__(self, other): # pragma: no cover + if not isinstance(other, type(self)): + return NotImplemented + return self.uid == other.uid + + def __hash__(self): + return hash(self.uid) + def all_attributes(self): return { s: getattr(self, s) for s in sorted(self.__slots__) if hasattr(self, s) @@ -438,6 +451,21 @@ def add_domains_from_neo4j(self): except Exception: log.debug(f"Error trying to retrieve domains for {self.uid}") + def add_sequences_from_neo4j(self): + try: + with GraphDriver() as db: + for i in db.run( + """ + MATCH (p1:protein {uid: $uid}) + RETURN p1.sequence as sequence + """, + uid=str(self.uid), + ): + if x := i.value(): + self.sequence = x + except Exception: + log.debug(f"Error trying to retrieve sequences for {self.uid}") + @property def domain_list_sorted_by_mean_envelope_position(self): # (ie can't sort a set()) @@ -488,7 +516,7 @@ class Feature(Location): """Container class for describing a feature on a locus""" __slots__ = [ - "parent_object", + "parent", "uid", "external_id", "type", @@ -506,11 +534,12 @@ class Feature(Location): "frameshifted", "too_short_partial_abutting_assembly_gap", "incomplete", + "protein", ] def __init__( self, - parent_object=None, + parent=None, uid: str = None, external_id: str = None, type: str = None, @@ -570,7 +599,7 @@ def __init__( incomplete: A boolean flag indicating whether the feature is incomplete. """ super().__init__(**kwargs) - self.parent_object = parent_object + self.parent = parent self.uid = uid self.external_id = external_id self.type = type @@ -590,6 +619,21 @@ def __init__( too_short_partial_abutting_assembly_gap ) self.incomplete = incomplete + if self.feature_is_protein(): + sg = None + current_object = self + for i in range(1, 100): + if ( + f"{current_object.__class__.__module__}.{current_object.__class__.__name__}" + == "socialgene.base.socialgene.SocialGene" + ): + sg = current_object + break + else: + if hasattr(current_object, "parent"): + current_object = current_object.parent + if sg: + self.protein = Protein(uid=self.uid) def all_attributes(self): return {s: getattr(self, s) for s in sorted(self.__slots__) if hasattr(self, s)} @@ -630,13 +674,13 @@ def __lt__(self, other): class FeatureCollection: __slots__ = [ - "parent_object", + "parent", "features", ] def add_feature(self, **kwargs): """Add a feature to a locus""" - self.features.add(Feature(parent_object=self, **kwargs)) + self.features.add(Feature(parent=self, **kwargs)) def all_attributes(self): return {s: getattr(self, s) for s in sorted(self.__slots__) if hasattr(self, s)} @@ -659,12 +703,46 @@ def get_feature_by_uid(self, uid): if v.uid == uid: return v + @property + def proteins(self): + sg = None + current_object = self + for i in range(1, 100): + if ( + str(type(current_object)) + == "" + ): + sg = current_object + break + else: + current_object = current_object.parent + return { + i: sg.proteins.get(i, None) + for i in {i.uid for i in self.features} + if i in sg.proteins + } + + @property + def protein_iter(self): + for i in self.proteins.values(): + yield i + + @property + def fasta_string_defline_uid(self): + for v in self.proteins.values(): + yield f">{v.uid}\n{v.sequence}\n" + + @property + def fasta_string_defline_external_id(self): + for v in self.proteins.values(): + yield f">{v.external_id}\n{v.sequence}\n" + class Locus(FeatureCollection): """Container holding a set() of genomic features""" __slots__ = [ - "parent_object", + "parent", "features", "metadata", "external_id", @@ -672,9 +750,9 @@ class Locus(FeatureCollection): "gene_clusters", ] - def __init__(self, parent_object=None, external_id=None): + def __init__(self, parent=None, external_id=None): super().__init__() - self.parent_object = parent_object + self.parent = parent self.features = set() self.metadata = LocusAssemblyMetadata() self.external_id = external_id @@ -682,25 +760,63 @@ def __init__(self, parent_object=None, external_id=None): self.gene_clusters = list() def calc_uid(self): - return hasher.hasher(f"{self.parent_object.uid}___{self.external_id}") + return hasher.hasher(f"{self.parent.uid}___{self.external_id}") def add_bgcs_by_feature(self, features, **kwargs): if not all([isinstance(i, Feature) for i in features]): raise ValueError( f"All features must be of type Feature, not {[type(i) for i in features if not isinstance(i, Feature)]}" ) - self.gene_clusters.append(GeneCluster(features, parent_object=self, **kwargs)) + self.gene_clusters.append(GeneCluster(features, parent=self, **kwargs)) def add_bgcs_by_start_end(self, start, end, **kwargs): features = {i for i in self.features if i.start >= start and i.end <= end} if features: self.add_bgcs_by_feature(features=features, **kwargs) + def write_genbank(self, outpath, start=None, end=None): + record = SeqRecord( + Seq(""), + id=self.external_id, + name=self.external_id, + description="A GenBank file generated by SocialGene.", + dbxrefs=[f"Assembly:{self.parent.uid}"], + ) + # Add annotation + for feature in self.features_sorted_by_midpoint: + if start: + if int(feature.start) < int(start): + continue + if end: + if int(feature.end) > int(end): + continue + biofeat = SeqFeature( + FeatureLocation( + start=feature.start, + end=feature.end, + strand=feature.strand, + ), + type=feature.type, + qualifiers={ + k: v + for k, v in feature.all_attributes().items() + if v and k != "parent" + } + | {"translation": self.parent.parent.proteins[feature.uid].sequence}, + ) + record.features.append(biofeat) + record.annotations["molecule_type"] = "DNA" + SeqIO.write( + record, + outpath, + "genbank", + ) + class GeneCluster(FeatureCollection): - def __init__(self, features, parent_object=None, uid=None, tool=None, **kwargs): + def __init__(self, features, parent=None, uid=None, tool=None, **kwargs): super().__init__() - self.parent_object = parent_object + self.parent = parent self.features = features self.uid = uid # self.tool; e.g.antismash, gecco, etc @@ -708,6 +824,38 @@ def __init__(self, features, parent_object=None, uid=None, tool=None, **kwargs): # flexible attributes self.__dict__.update(kwargs) + def write_genbank(self, outpath, sg): + record = SeqRecord( + Seq(""), + id=self.parent.external_id, + name=self.parent.external_id, + description="A GenBank file generated by SocialGene.", + dbxrefs=[f"Assembly:{self.parent.uid}"], + ) + # Add annotation + for feature in self.features: + biofeat = SeqFeature( + FeatureLocation( + start=feature.start, + end=feature.end, + strand=feature.strand, + ), + type=feature.type, + qualifiers={ + k: v + for k, v in feature.all_attributes().items() + if v and k != "parent" + } + | {"translation": sg.proteins[feature.uid].sequence}, + ) + record.features.append(biofeat) + record.annotations["molecule_type"] = "DNA" + SeqIO.write( + record, + outpath, + "genbank", + ) + class Taxonomy: "Class is a reserved word so just underscore all ranks to be consistent" @@ -747,10 +895,19 @@ def __init__( class Assembly: """Container class holding a dictionary of loci (ie genes/proteins)""" - __slots__ = ["loci", "taxid", "metadata", "uid", "taxonomy", "name"] + __slots__ = [ + "parent", + "loci", + "taxid", + "metadata", + "uid", + "taxonomy", + "name", + ] - def __init__(self, uid): + def __init__(self, uid, parent=None): super().__init__() + self.parent = parent self.uid = uid self.loci = {} self.taxid = None @@ -774,7 +931,7 @@ def add_locus(self, external_id: str = None): if external_id is None: external_id = str(uuid4()) if external_id not in self.loci: - self.loci[external_id] = Locus(parent_object=self, external_id=external_id) + self.loci[external_id] = Locus(parent=self, external_id=external_id) else: log.debug(f"{external_id} already present") @@ -814,6 +971,22 @@ def get_locus_by_uid(self, uid): if v.uid == uid: return v + @property + def gene_clusters(self): + for locus in self.loci.values(): + for gene_cluster in locus.gene_clusters: + yield gene_cluster + + @property + def fasta_string_defline_uid(self): + for v in self.loci.values(): + yield v.fasta_string_defline_uid + + @property + def fasta_string_defline_external_id(self): + for v in self.loci.values(): + yield v.fasta_string_defline_external_id + class Molbio: """Class for inheriting by SocialGene()""" @@ -853,7 +1026,7 @@ def add_protein( if return_uid: return temp_protein.uid - def add_assembly(self, uid: str = None): + def add_assembly(self, uid: str = None, parent=None): """Add an assembly to a SocialGene object Args: @@ -862,6 +1035,6 @@ def add_assembly(self, uid: str = None): if uid is None: uid = str(uuid4()) if uid not in self.assemblies: - self.assemblies[uid] = Assembly(uid) + self.assemblies[uid] = Assembly(uid=uid, parent=parent) else: log.debug(f"{uid} already present") diff --git a/socialgene/base/socialgene.py b/socialgene/base/socialgene.py index 45274728..90ae4779 100644 --- a/socialgene/base/socialgene.py +++ b/socialgene/base/socialgene.py @@ -16,7 +16,6 @@ from socialgene.base.molbio import Molbio from socialgene.clustermap.serialize import SerializeToClustermap from socialgene.hmm.hmmer import HMMER -from socialgene.mmseqs.search import search as mmseqs_search from socialgene.neo4j.neo4j import GraphDriver, Neo4jQuery from socialgene.neo4j.search.basic import search_protein_hash from socialgene.parsers.hmmer_parser import HmmerParser @@ -57,6 +56,11 @@ def get_all_gene_clusters(self): for k in j.gene_clusters: yield k + @property + def protein_iter(self): + for i in self.proteins.values(): + yield i + ######################################## # Filter ######################################## @@ -148,27 +152,6 @@ def annotate_proteins_with_neo4j( self.get_protein_domains_from_db(protein_id_list=protein_id_list) return search_result - def search_with_mmseqs(self, target_database, argstring): - with tempfile.NamedTemporaryFile() as temp_path: - self.write_fasta(temp_path.name) - self.mmseqs_results = mmseqs_search( - fasta_path=temp_path.name, - target_database=target_database, - argstring=argstring, - ) - - def compare_to_another_sg_object(sg1, sg2, argstring=""): - with tempfile.TemporaryDirectory() as tmpdirname: - sg1fa_path = Path(tmpdirname, "sg1.faa") - sg2fa_path = Path(tmpdirname, "sg2.faa") - sg1.write_fasta(sg1fa_path) - sg2.write_fasta(sg2fa_path) - return mmseqs_search( - fasta_path=str(sg1fa_path), - target_database=str(sg2fa_path), - argstring=argstring, - ) - def annotate( self, use_neo4j_precalc: bool = False, neo4j_chunk_size: int = 1000, **kwargs ): @@ -217,7 +200,7 @@ def annotate_proteins_with_hmmscan( cpus = 1 # create a list of proteins as FASTA temp1 = [ - self.single_protein_to_fasta(i) + self.proteins[i].fasta_string_defline_uid for i in protein_id_list if self.proteins[i].sequence ] @@ -231,18 +214,35 @@ def annotate_proteins_with_hmmscan( input="\n".join(temp1).encode(), domtblout_path=temp_path.name, overwrite=True, + cpus=cpus, **kwargs, ) # parse the resulting domtblout file, saving results to the class proteins/domains self.parse_hmmout(temp_path.name, hmmsearch_or_hmmscan="hmmscan") + def add_sequences_from_neo4j(self): + try: + with GraphDriver() as db: + for i in db.run( + """ + MATCH (p1:protein) + WHERE p1.uid in $uid + RETURN p1.uid as uid, p1.sequence as sequence + """, + uid=[i.uid for i in self.protein_iter], + ): + if i["uid"] in self.proteins: + self.proteins[i["uid"]].sequence = i["sequence"] + except Exception: + log.debug(f"Error trying to retrieve domains for {self.uid}") + def hydrate_from_proteins(self): """Given a SocialGene object with proteins, retrieve from a running Neo4j database all locus and asssembly info for those proteins""" for result in Neo4jQuery.query_neo4j( cypher_name="retrieve_protein_locations", param=list(self.proteins.keys()), ): - self.add_assembly(result["assembly"]) + self.add_assembly(uid=result["assembly"], parent=self) for locus in result["loci"]: _ = self.assemblies[result["assembly"]].add_locus( external_id=locus["locus"] @@ -275,7 +275,7 @@ def fill_given_locus_range(self, locus_uid, start, end): raise ValueError("No assembly found in database") else: assembly_uid = assembly_uid.value() - self.add_assembly(assembly_uid) + self.add_assembly(uid=assembly_uid, parent=self) with GraphDriver() as db: res = db.run( """ @@ -401,49 +401,42 @@ def filter_proteins(self, hash_list: List): """ return ((k, v) for k, v in self.proteins.items() if k in hash_list) + @property + def fasta_string_defline_uid(self): + for v in self.proteins.values(): + yield f">{v.uid}\n{v.sequence}\n" + + @property + def fasta_string_defline_external_id(self): + for v in self.proteins.values(): + yield f">{v.external_id}\n{v.sequence}\n" + def write_fasta( self, outpath, - hash_list: List = None, external_id: bool = False, **kwargs, ): - """Write proteins to a FASTA file + """Write all proteins to a FASTA file Args: outpath (str): path of file that FASTA entries will be appended to - hash_list (List, optional): hash id of the protein(s) to export. Defaults to None. external_id (bool, optional): Write protein identifiers as the hash (True) or the original identifier (False). Defaults to False. **kwargs: see print(open_write.__doc__) """ with open_write(filepath=outpath, **kwargs) as handle: counter = 0 - if hash_list: - temp_iter = self.filter_proteins(hash_list) + if external_id: + fasta_gen = self.fasta_string_defline_external_id else: - temp_iter = self.proteins.items() - for k, v in temp_iter: - if v.sequence is not None: - counter += 1 - if external_id: - handle.writelines(v.fasta_string_defline_external_id) - else: - handle.writelines(v.fasta_string_defline_uid) + fasta_gen = self.fasta_string_defline_uid + for i in fasta_gen: + counter += 1 + handle.writelines(i) log.info(f"Wrote {str(counter)} proteins to {outpath}") - def single_protein_to_fasta(self, uid): - """Create FASTA strings - - Args: - uid (str): hash id of protein to export - - Returns: - str: fasta string - """ - return f">{self.proteins[uid].uid}\n{self.proteins[uid].sequence}" - def write_n_fasta(self, outdir, n_splits=1, **kwargs): """ The function `write_n_fasta` exports protein sequences split into multiple fasta files. @@ -457,6 +450,7 @@ def write_n_fasta(self, outdir, n_splits=1, **kwargs): than. Defaults to 1 """ + # this can be done with itertools.batched in python 3.12 def split(a, n): # https://stackoverflow.com/a/2135920 k, m = divmod(len(a), n) @@ -464,16 +458,17 @@ def split(a, n): a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n) ) - protein_list = split(list(self.proteins.keys()), n_splits) + protein_list = split( + [value for key, value in sorted(self.proteins.items(), reverse=False)], + n_splits, + ) counter = 1 for protein_group in protein_list: with open_write( Path(outdir, f"fasta_split_{counter}.faa"), **kwargs ) as handle: - for k, v in { - key: self.proteins.get(key) for key in sorted(protein_group) - }.items(): - handle.writelines(f">{k}\n{v.sequence}\n") + for i in protein_group: + handle.writelines(f">{i.uid}\n{i.sequence}\n") counter += 1 def _merge_proteins(self, sg_object): @@ -534,7 +529,7 @@ def write_clustermap_json( ): raise NotImplementedError("write_clustermap_json needs to be updated") # TODO: Add link_df to clustermap - cmap = SerializeToClustermap(bgc_order=gene_cluster_order, sg_object=self) + cmap = SerializeToClustermap(sorted_bgcs=gene_cluster_order, sg_object=self) cmap.write(outpath=outpath) def write_genbank(self, outpath): @@ -544,8 +539,8 @@ def write_genbank(self, outpath): Seq(""), id=locus.external_id, name=locus.external_id, - description="An example GenBank file generated by BioPython", - dbxrefs=[f"Assembly:{locus.parent_object.uid}"], + description="A GenBank file generated by SocialGene.", + dbxrefs=[f"Assembly:{locus.parent.uid}"], ) # Add annotation for feature in locus.features: @@ -559,7 +554,7 @@ def write_genbank(self, outpath): qualifiers={ k: v for k, v in feature.all_attributes().items() - if v and k != "parent_object" + if v and k != "parent" } | {"translation": self.proteins[feature.uid].sequence}, ) @@ -646,7 +641,7 @@ def table_locus_to_protein(self, **kwargs): for feature in temp_list: if feature.feature_is_protein(): yield ( - feature.parent_object.uid, + feature.parent.uid, feature.uid, feature.external_id, feature.locus_tag, diff --git a/socialgene/cli/search/gene_cluster.py b/socialgene/cli/search/gene_cluster.py new file mode 100644 index 00000000..3b74b08a --- /dev/null +++ b/socialgene/cli/search/gene_cluster.py @@ -0,0 +1,255 @@ +import argparse +from pathlib import Path +from typing import List + +from socialgene.clustermap.serialize import SerializeToClustermap +from socialgene.config import env_vars +from socialgene.search.hmmer import SearchDomains +from socialgene.utils.logging import log + +env_vars["NEO4J_URI"] = "bolt://localhost:7687" + +parser = argparse.ArgumentParser( + description="Search a SocialGene database for input gene clusters similar to an input gene cluster" +) + + +parser.add_argument( + "--gbk_path", + metavar="input_filepath", + type=Path, + help="Path to the query gene cluster GenBank-format file", + required=True, +) +parser.add_argument( + "--hmm_dir", + metavar="hmm_directory", + type=str, + help="Path to the directory containing HMM files created by SocialGene's Nextflow workflow", + required=True, +) +parser.add_argument( + "--assemblies_must_have_x_matches", + metavar="fraction", + type=float, + default=0.6, + help="Minimum query proteins an assembly must have (<1 == fraction of query proteins)", +) +parser.add_argument( + "--nucleotide_sequences_must_have_x_matches", + metavar="fraction", + type=float, + default=0.6, + help="Minimum query proteins a nucleotide sequence must have (<1 == fraction of query proteins)", +) +parser.add_argument( + "--gene_clusters_must_have_x_matches", + metavar="fraction", + type=float, + default=0.6, + help="Minimum query proteins a target gene cluster sequence must have (<1 == fraction of query proteins)", +) +parser.add_argument( + "--break_bgc_on_gap_of", + metavar="value", + type=int, + default=10000, + help="Split gene clusters if gap between matched proteins is greater than this value", +) +parser.add_argument( + "--target_bgc_padding", + metavar="value", + type=int, + default=2000, + help="Pull proteins x-nucleotides on either side of target gene cluster", +) +parser.add_argument( + "--max_domains_per_protein", + metavar="value", + type=int, + default=3, + help="Maximum number of domains per protein used in search; 0 for all", +) +parser.add_argument( + "--max_outdegree", + metavar="value", + type=int, + help="Maximum outdegree a query domain can have; 0 for all", +) +parser.add_argument( + "--max_query_proteins", + metavar="value", + type=int, + default=10, + help="Maximum number of query proteins to use in search; 0 for all", +) +parser.add_argument( + "--scatter", + metavar="value", + help="Pull proteins from across the width of the input gene cluster", + action=argparse.BooleanOptionalAction, + default=False, +) +parser.add_argument( + "--locus_tag_bypass_list", + metavar="value", + nargs="+", + help="List of locus tags to bypass search filter", +) +parser.add_argument( + "--protein_id_bypass_list", + metavar="value", + nargs="+", + help="List of protein IDs to bypass search filter", +) +parser.add_argument( + "--only_culture_collection", + metavar="value", + help="Only search genomes from culture collections", + action=argparse.BooleanOptionalAction, + default=False, +) +parser.add_argument( + "--frac", + metavar="fraction", + type=float, + default=0.75, + help="Fraction of domains that equals a match in the initial database scan", +) +parser.add_argument( + "--run_async", + metavar="value", + help="Run the initial database scan using asynchronously", + action=argparse.BooleanOptionalAction, + default=True, +) +parser.add_argument( + "--analyze_with", + metavar="value", + type=str, + default="hmmer", + help="Tool to use for reciprocal best hit comparison; one of 'hmmer', 'blastp', 'mmseqs2'", +) +args = parser.parse_args() + + +def search_bgc( + gbk_path: str, + hmm_dir: str = None, + use_neo4j_precalc: bool = True, + assemblies_must_have_x_matches: float = 0.9, + nucleotide_sequences_must_have_x_matches: float = 0.9, + gene_clusters_must_have_x_matches: float = 0.9, + break_bgc_on_gap_of: int = 10000, + target_bgc_padding: int = 20000, + max_domains_per_protein: float = None, + max_outdegree: int = None, + max_query_proteins: int = 5, + scatter: bool = False, + locus_tag_bypass_list: List[str] = None, + protein_id_bypass_list: List[str] = None, + only_culture_collection: bool = False, + frac: float = 0.75, + run_async: bool = False, + analyze_with: str = "hmmer", +): + log.info(f"Running search with args: {locals()}") + search_object = SearchDomains( + gbk_path=gbk_path, + hmm_dir=hmm_dir, + use_neo4j_precalc=use_neo4j_precalc, + assemblies_must_have_x_matches=assemblies_must_have_x_matches, + nucleotide_sequences_must_have_x_matches=nucleotide_sequences_must_have_x_matches, + gene_clusters_must_have_x_matches=gene_clusters_must_have_x_matches, + break_bgc_on_gap_of=break_bgc_on_gap_of, + target_bgc_padding=target_bgc_padding, + ) + search_object.outdegree_table + search_object.prioritize_input_proteins( + max_domains_per_protein=max_domains_per_protein, + max_outdegree=max_outdegree, + max_query_proteins=max_query_proteins, + scatter=scatter, + locus_tag_bypass_list=locus_tag_bypass_list, + protein_id_bypass_list=protein_id_bypass_list, + ) + search_object.outdegree_table + search_object.search( + only_culture_collection=only_culture_collection, frac=frac, run_async=run_async + ) + # filters assemblies and nucleotide seqs that fall below threshold % of query proteins + search_object.filter() + # labels clusters with a unique id based on break_bgc_on_gap_of + search_object.label_clusters() + df = search_object._primary_bgc_regions() + df["n_start"] = df["n_start"] - search_object.target_bgc_padding + df["n_end"] = df["n_end"] + search_object.target_bgc_padding + search_object._bgc_regions_to_sg_object(df) + _ = search_object.sg_object.annotate_proteins_with_neo4j( + protein_uids=None, annotate_all=True, progress=False + ) + ######## + # add bgcs as gene_clusters to the locus objects + for i, row in df.iterrows(): + search_object.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid( + row["nucleotide_uid"] + ).add_bgcs_by_start_end( + start=row["n_start"] - search_object.target_bgc_padding, + end=row["n_end"] + search_object.target_bgc_padding, + uid=row["cluster"], + ) + # add input bgc as gene_cluster to the locus objects + if analyze_with == "hmmer": + search_object.sg_object.annotate_proteins_with_neo4j(annotate_all=True) + else: + search_object.sg_object.add_sequences_from_neo4j() + search_object._create_links( + tool=analyze_with, argstring="--fast --max-hsps 1", cpus=1 + ) + search_object._choose_group() + search_object._rank_order_bgcs() + assemblies = list( + dict.fromkeys([i.parent.parent for i in search_object.sorted_bgcs]) + ) + assemblies = [search_object.input_assembly] + assemblies + z = SerializeToClustermap( + sg_object=search_object.sg_object, + sorted_bgcs=[search_object.input_bgc] + search_object.sorted_bgcs, + link_df=search_object.link_df, + group_df=search_object.group_df, + ) + z.write("/Users/chase/Documents/test/cmap/clinker/clinker/plot/data.json") + + +def main(): + args = parser.parse_args() + + if args.max_domains_per_protein == 0: + args.max_domains_per_protein = None + if args.max_outdegree == 0: + args.max_outdegree = None + if args.max_query_proteins == 0: + args.max_query_proteins = None + _ = search_bgc( + gbk_path=args.gbk_path, + hmm_dir=args.hmm_dir, + use_neo4j_precalc=True, + assemblies_must_have_x_matches=args.assemblies_must_have_x_matches, + nucleotide_sequences_must_have_x_matches=args.nucleotide_sequences_must_have_x_matches, + gene_clusters_must_have_x_matches=args.gene_clusters_must_have_x_matches, + break_bgc_on_gap_of=args.break_bgc_on_gap_of, + target_bgc_padding=args.target_bgc_padding, + max_domains_per_protein=args.max_domains_per_protein, + max_outdegree=args.max_outdegree, + max_query_proteins=args.max_query_proteins, + scatter=args.scatter, + locus_tag_bypass_list=args.locus_tag_bypass_list, + protein_id_bypass_list=args.protein_id_bypass_list, + only_culture_collection=args.only_culture_collection, + frac=args.frac, + run_async=args.run_async, + ) + + +if __name__ == "__main__": + main() diff --git a/socialgene/clustermap/serialize.py b/socialgene/clustermap/serialize.py index ca7d8f32..467527fd 100644 --- a/socialgene/clustermap/serialize.py +++ b/socialgene/clustermap/serialize.py @@ -8,10 +8,9 @@ class SerializeToClustermap: Take a sg_object and serialize all BGC object to clustermap.js format """ - def __init__(self, sg_object, bgc_order, link_df, group_df): + def __init__(self, sg_object, sorted_bgcs, link_df, group_df): self._sg_object = sg_object - self._bgc_order = bgc_order - self._input_assembly = self._sg_object.assemblies[bgc_order[0]] + self.sorted_bgcs = sorted_bgcs self._link_df = link_df self._group_df = group_df self._reset() @@ -26,9 +25,12 @@ def _flatten_list(x): def _get_uid(self, obj=None): # increment then return a uid as a string - self._uid += 1 - self._uid_dict[str(self._uid)] = obj - return str(self._uid) + if obj in self._uid_dict: + return str(self._uid_dict[obj]) + else: + self._uid += 1 + self._uid_dict[str(self._uid)] = obj + return str(self._uid) def _build(self): self._reset() @@ -39,11 +41,13 @@ def _clusters(self): return { "clusters": [ { - "uid": self._get_uid(obj=self._sg_object.assemblies[k]), - "name": self._sg_object.assemblies[k].name, - "loci": self._loci(self._sg_object.assemblies[k]), + "uid": self._get_uid(obj=assembly), + "name": assembly.uid, + "loci": self._loci([i for i in assembly.gene_clusters]), } - for k in self._bgc_order + for assembly in list( + dict.fromkeys([i.parent.parent for i in self.sorted_bgcs]) + ) ] } @@ -73,14 +77,12 @@ def _locus(self, locus_name, locus_obj): "end": max((i.end for i in locus_obj.features)), } - def _loci(self, assembly_obj): + def _loci(self, loci): # return [ # self._locus(locus_name=k, locus_obj=v) for k, v in assembly_obj.loci.items() # ] return [ - self._locus(locus_name=k, locus_obj=gc) - for k, v in assembly_obj.loci.items() - for gc in v.gene_clusters + self._locus(locus_name=gc.parent.external_id, locus_obj=gc) for gc in loci ] def _create_groups_dict(self): @@ -92,14 +94,9 @@ def _create_groups_dict(self): with keys "uid", "label", and "genes". e.g. {"groups": [{"uid": "1", "label": "group1", "genes": ["1", "2", "3"]}] """ # Look in the clustermap uid dict (feature_to_cmap_uid_dict) for the cmap uid of each feature (query & target) in each group - self._group_df["query_cmap_uid"] = self._group_df["query"].apply( - lambda x: self.feature_to_cmap_uid_dict.get(x) - ) - self._group_df["target_cmap_uid"] = self._group_df["target"].apply( - lambda x: self.feature_to_cmap_uid_dict.get(x) - ) + log.info("Creating clustermap.js links") groups = ( - self._group_df.groupby(["target_cmap_uid", "target"])["query_cmap_uid"] + self._group_df.groupby(["query_feature"])["target_feature"] .apply(list) .reset_index() ) @@ -107,10 +104,11 @@ def _create_groups_dict(self): "groups": [ { "uid": self._get_uid( - obj=f"{i.target.external_id} {i.target.description}" + obj=f"{i['query_feature'].external_id} {i['query_feature'].description}" ), - "label": f"{i.target.external_id} {i.target.description}", - "genes": i.query_cmap_uid, + "label": f"{i['query_feature'].external_id} {i['query_feature'].description}", + "genes": [self.feature_to_cmap_uid_dict[i["query_feature"]]] + + [self.feature_to_cmap_uid_dict[i] for i in i.target_feature], } for x, i in groups.iterrows() ] @@ -124,18 +122,18 @@ def _create_links_dict(self): { "uid": self._get_uid(obj=None), "target": { - "uid": self.feature_to_cmap_uid_dict[i["target"]], - "name": self.feature_to_cmap_uid_dict[i["target"]], + "uid": self.feature_to_cmap_uid_dict[i["target_feature"]], + "name": self.feature_to_cmap_uid_dict[i["target_feature"]], }, "query": { - "uid": self.feature_to_cmap_uid_dict[i["query"]], - "name": self.feature_to_cmap_uid_dict[i["query"]], + "uid": self.feature_to_cmap_uid_dict[i["query_feature"]], + "name": self.feature_to_cmap_uid_dict[i["query_feature"]], }, - "identity": i.score, + "identity": i.pident if "pident" in i else i.score, } for x, i in self._link_df.iterrows() - if i["query"] in self.feature_to_cmap_uid_dict - and i["target"] in self.feature_to_cmap_uid_dict + if i["query_feature"] in self.feature_to_cmap_uid_dict + and i["target_feature"] in self.feature_to_cmap_uid_dict ] } @@ -145,4 +143,5 @@ def write(self, outpath): json.dump( self._build(), outfile, + indent=4, ) diff --git a/socialgene/compare_proteins/hmm/__init__.py b/socialgene/compare_gene_clusters/__init__.py similarity index 100% rename from socialgene/compare_proteins/hmm/__init__.py rename to socialgene/compare_gene_clusters/__init__.py diff --git a/socialgene/compare_gene_clusters/compare_gene_clusters.py b/socialgene/compare_gene_clusters/compare_gene_clusters.py new file mode 100644 index 00000000..98763b08 --- /dev/null +++ b/socialgene/compare_gene_clusters/compare_gene_clusters.py @@ -0,0 +1,30 @@ +from types import GeneratorType + +from socialgene.compare_proteins.diamond import DiamondBlastp +from socialgene.compare_proteins.hmmer import CompareDomains +from socialgene.compare_proteins.mmseqs import MMseqsEasySearch + + +class BGCComparison: + def __init__(self, tool): + match tool: + case "blastp": + self.tool = DiamondBlastp() + case "mmseqs2": + self.tool = MMseqsEasySearch() + case "hmmer": + self.tool = CompareDomains() + case _: + raise ValueError( + f"Tool {tool} not recognised, must be one of 'blastp', 'mmseqs2', 'hmmer'" + ) + + def compare(self, bgc1, bgc2, **kwargs): + if isinstance(bgc1, GeneratorType): + bgc1 = list(bgc1) + if isinstance(bgc2, GeneratorType): + bgc2 = list(bgc2) + return self.tool.reciprocal_hits( + self.tool.compare_proteins(bgc1, bgc2, **kwargs), + self.tool.compare_proteins(bgc2, bgc1, **kwargs), + ) diff --git a/socialgene/compare_proteins/base.py b/socialgene/compare_proteins/base.py new file mode 100644 index 00000000..35b6896b --- /dev/null +++ b/socialgene/compare_proteins/base.py @@ -0,0 +1,106 @@ +from abc import ABC, abstractmethod +from typing import List + +import pandas as pd + +from socialgene.base.molbio import Protein + +BlastTab_COLUMNS = { + "query": str, + "target": str, + "pident": "Float64", + "length": "Int64", + "mismatch": "Int64", + "gapopen": "Int64", + "qstart": "Int64", + "qend": "Int64", + "sstart": "Int64", + "send": "Int64", + "evalue": "Float64", + "score": "Float64", + "qlen": "Int64", + "slen": "Int64", +} + + +class ProteinComparison(ABC): + # TODO: force subclasses to implement self.score_column and self.score_scale and self.name + + @abstractmethod + def compare_proteins(self, p1: List[Protein], p2: List[Protein]): + ... + + def reciprocal_hits( + self, + q_vs_t_df: pd.DataFrame, + t_vs_q_df: pd.DataFrame, + ): + """This function takes a dataframe and returns a dataframe with the reciprocal best hits""" + q_vs_t_df = self.best_hit_to_query(q_vs_t_df) + t_vs_q_df = self.best_hit_to_query(t_vs_q_df) + t_vs_q_df.rename(columns={"query": "target", "target": "query"}, inplace=True) + df = pd.concat([q_vs_t_df, t_vs_q_df]) + del q_vs_t_df, t_vs_q_df + df.sort_values(by=self.score_column, ascending=False, inplace=True) + # remove non reciprocal + df = df[df.duplicated(["query", "target"], keep=False)] + df = df.drop_duplicates(subset=["query", "target", self.score_column]) + # keep the reciprocal hit with the highest bitscore + # but if multiple hits have the same highest bitscore, keep them all + return df[ + df.groupby(["query", "target"])[self.score_column].transform("max") + == df[self.score_column] + ] + + def best_hit_to_query( + self, + df: pd.DataFrame, + ): + return ( + df.reset_index(inplace=False, drop=True) + .sort_values(self.score_column, ascending=False) + .drop_duplicates("query", keep="first") + .reset_index(inplace=False, drop=True) + ) + + +class HMMDataFrame(ProteinComparison): + def __init__(self, score_column="score"): + self.score_column = score_column + + +class BlastTab(ProteinComparison): + def __init__(self, score_column="score"): + self.score_column = score_column + self.score_scale = float("inf") + + # def reciprocal_hits(self, q_vs_t_df: pd.DataFrame, t_vs_q_df: pd.DataFrame): + # # Create a new column in both dataframes: normalised bitscore + # q_vs_t_df["norm_bitscore"] = q_vs_t_df.bitscore / q_vs_t_df.length + # t_vs_q_df["norm_bitscore"] = t_vs_q_df.bitscore / t_vs_q_df.length + # # Create query and subject coverage columns in both dataframes + # q_vs_t_df["qcov"] = (q_vs_t_df.length / q_vs_t_df.qlen).clip(upper=1) + # t_vs_q_df["qcov"] = (t_vs_q_df.length / t_vs_q_df.qlen).clip(upper=1) + # q_vs_t_df["scov"] = (q_vs_t_df.length / q_vs_t_df.slen).clip(upper=1) + # t_vs_q_df["scov"] = (t_vs_q_df.length / t_vs_q_df.slen).clip(upper=1) + # q_vs_t_df = self.best_hit_to_query(q_vs_t_df) + # t_vs_q_df = self.best_hit_to_query(t_vs_q_df) + # t_vs_q_df.rename(columns={"query": "target", "target": "query"}, inplace=True) + # df = pd.concat([q_vs_t_df, t_vs_q_df]) + # del q_vs_t_df, t_vs_q_df + # df.sort_values(by="bitscore", ascending=False, inplace=True) + # # remove non reciprocal + # df = df[df.duplicated(["query", "target"], keep=False)] + # # keep the reciprocal hit with the highest bitscore + # # but if multiple hits have the same highest bitscore, keep them all + # return df[ + # df.groupby(["query", "target"])["bitscore"].transform(max) == df["bitscore"] + # ] + + # def best_hit_to_query(self, df: pd.DataFrame): + # return ( + # df.reset_index(inplace=False, drop=True) + # .sort_values("bitscore", ascending=False) + # .drop_duplicates("query", keep="first") + # .reset_index(inplace=False, drop=True) + # ) diff --git a/socialgene/compare_proteins/base_class.py b/socialgene/compare_proteins/base_class.py deleted file mode 100644 index 9cb4a020..00000000 --- a/socialgene/compare_proteins/base_class.py +++ /dev/null @@ -1,42 +0,0 @@ -from abc import ABC, abstractmethod -from collections import namedtuple - - -class CompareProteinsBaseClass(ABC): - _create_tuple = namedtuple( - "standard_protein_comparison", - ( - "query", - "target", - "score", - ), - ) - - def __init__(self): - self.protein_comparisons = [] - - @property - @abstractmethod - def df(self): - ... - # return ( - # pd.DataFrame(self.protein_comparisons) - # .sort_values(by=["mod_score"], ascending=False) - # .reset_index(inplace=False, drop=True) - # ) - - @abstractmethod - def compare_one_to_one(self, p1, p2): - ... - - @abstractmethod - def compare_one_to_many(self, p1_obj, p2_obj_list): - ... - - @abstractmethod - def compare_many_to_many(self, p1_obj_list, p2_obj_list): - ... - - @abstractmethod - def compare_all_to_all(self): - ... diff --git a/socialgene/compare_proteins/diamond.py b/socialgene/compare_proteins/diamond.py new file mode 100644 index 00000000..6024151e --- /dev/null +++ b/socialgene/compare_proteins/diamond.py @@ -0,0 +1,140 @@ +import os +import tempfile +from pathlib import Path + +import pandas as pd + +from socialgene.base.molbio import Protein +from socialgene.compare_proteins.base import BlastTab, BlastTab_COLUMNS +from socialgene.utils.logging import log +from socialgene.utils.run_subprocess import run_subprocess + + +class DiamondBlastp(BlastTab): + def __init__(self): + super().__init__() + self.temp_db_path = None + self.df = None + self.name = "Diamond BLASTp" + + def make_db( + self, + fasta_string: str, + db_path=None, + threads=1, + **kwargs, + ): + db_path = Path(db_path).with_suffix(".dmnd") + # run diamond makedb on fasta_file allow passing kwargs to diamond + command_list = [ + "diamond", + "makedb", + "--db", + str(db_path), + "--threads", + str(threads), + "--verbose", + ] + mes = run_subprocess( + command_list=command_list, + input=fasta_string, + text=True, + capture_output=True, + status=False, + **kwargs, + ) + log.debug(mes) + if not Path(db_path).exists(): + raise FileNotFoundError( + f"No database from DiamondBlastp make_db found at {db_path}" + ) + + @staticmethod + def run(fasta_path, db_path=None, threads=1, argstring="", **kwargs): + # run diamond blastp on query_fasta_string against db_path + if not Path(db_path).exists(): + raise FileNotFoundError( + f"No database from Diamond makedb found at {db_path}" + ) + with tempfile.TemporaryDirectory() as tmpdirname: + outpath = os.path.join(tmpdirname, "result.m8") + command_list = [ + "diamond", + "blastp", + "--query", + str(fasta_path), + "--db", + str(db_path), + "--out", + str(outpath), + "--threads", + str(threads), + "--verbose", + "--outfmt", + "6", + "qseqid", + "sseqid", + "pident", + "length", + "mismatch", + "gapopen", + "qstart", + "qend", + "sstart", + "send", + "evalue", + "bitscore", + "qlen", + "slen", + ] + if argstring: + argstring = [str(i) for i in argstring.split()] + command_list.extend(argstring) + mes = run_subprocess( + command_list=command_list, + check=False, + shell=False, + capture_output=True, + status=False, + ) + log.debug(mes) + if not os.path.exists(outpath): + log.warning("No output from MMseqs2 search") + else: + # sorted for reproducibility + return pd.read_csv( + outpath, + sep="\t", + header=None, + names=BlastTab_COLUMNS, + dtype=BlastTab_COLUMNS, + ).sort_values(["score", "query"], ascending=False) + + def compare_proteins( + self, queries, targets, cpus=1, argstring="--ultra-sensitive --max-hsps 1" + ): + # loop through protein list and write to temporary fasta file + if isinstance(queries, Protein): + queries = [queries] + if isinstance(targets, Protein): + targets = [targets] + if not all([isinstance(i, Protein) for i in queries]): + raise TypeError("queries must be a list of Protein objects") + if not all([isinstance(i, Protein) for i in targets]): + raise TypeError("targets must be a list of Protein objects") + with tempfile.TemporaryDirectory() as tmpdirname: + query_fasta_path = Path(tmpdirname, "queries.faa") + target_dmnd_path = Path(tmpdirname, "targets.dmnd") + with open(query_fasta_path, "w") as h: + for protein in queries: + h.write(protein.fasta_string_defline_uid) + self.make_db( + fasta_string="".join([i.fasta_string_defline_uid for i in targets]), + db_path=target_dmnd_path, + ) + return self.run( + fasta_path=query_fasta_path, + db_path=target_dmnd_path, + threads=cpus, + argstring=argstring, + ) diff --git a/socialgene/compare_proteins/hmm/hmmer.py b/socialgene/compare_proteins/hmm/hmmer.py deleted file mode 100644 index 678d5117..00000000 --- a/socialgene/compare_proteins/hmm/hmmer.py +++ /dev/null @@ -1,62 +0,0 @@ -from itertools import combinations, product -from multiprocessing import Pool - -import pandas as pd - -from socialgene.compare_proteins.base_class import CompareProteinsBaseClass -from socialgene.compare_proteins.hmm.scoring import _mod_score_tupler, mod_score - - -def picklable_modscore(p1, p2): - # named tuple doesn't work in multiprocessing - return mod_score(p1, p2)._asdict() - - -class CompareDomains(CompareProteinsBaseClass): - def __init__(self): - self.protein_comparisons = set() - - @property - def mod_score_df(self): - return ( - pd.DataFrame(self.protein_comparisons) - .sort_values(by=["mod_score"], ascending=False) - .reset_index(inplace=False, drop=True) - ) - - @property - def df(self): - return self.mod_score_df.filter(["query", "target", "mod_score"]).rename( - columns={"mod_score": "score"} - ) - - def compare_one_to_one(self, p1, p2): - return mod_score(p1, p2) - - def compare_one_to_many(self, p1_obj, p2_obj_list, filter=True): - for i in p2_obj_list: - temp = mod_score(p1_obj, i) - if not filter or temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_many_to_many(self, p1_obj_list, p2_obj_list, filter=True): - for i1, i2 in product(p1_obj_list, p2_obj_list): - temp = mod_score(i1, i2) - if not filter or temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_all_to_all(self, p1_obj_list, filter=True): - for i1, i2 in combinations(p1_obj_list, 2): - temp = mod_score(i1, i2) - if not filter or temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_all_to_all_parallel(self, p1_obj_list, cpus=1, only_hits=True): - # have to use _calculate_mod_score_not_named because named tuple can't pickle "protein_comparison_modscore" - with Pool(cpus) as p: - for i in p.starmap( - picklable_modscore, - combinations(p1_obj_list, 2), - ): - if not only_hits or i["jaccard"] > 0.001: - self.protein_comparisons.add(_mod_score_tupler(**i)) diff --git a/socialgene/compare_proteins/hmm/scoring.py b/socialgene/compare_proteins/hmm_scoring.py similarity index 91% rename from socialgene/compare_proteins/hmm/scoring.py rename to socialgene/compare_proteins/hmm_scoring.py index 228c9c46..1b221a5c 100644 --- a/socialgene/compare_proteins/hmm/scoring.py +++ b/socialgene/compare_proteins/hmm_scoring.py @@ -27,7 +27,7 @@ def mod_score(p1, p2): p1 (Protein): SocialGene Protein Class p1 (Protein): SocialGene Protein Class Returns: - dict: {l1, l2, levenshtein, jaccard, mod_score}; mod_score -> 2 = Perfectly similar; otherwise (1/Levenshtein + Jaccard) + dict: {ProteinClass, ProteinClass, levenshtein, jaccard, mod_score}; mod_score -> 2 = Perfectly similar; otherwise (1/Levenshtein + Jaccard) """ # If either protein contains no HMM annotations, # return a mod score with the worst scores possible @@ -35,10 +35,10 @@ def mod_score(p1, p2): raise TypeError(f"p1 type: {type(p1)}; p2 type: {type(p2)}") length_input_list_1 = len(p1.domains) length_input_list_2 = len(p2.domains) - if p1.uid == p2.uid: + if p1 == p2: return _mod_score_tupler( - p1.uid, - p2.uid, + p1, + p2, length_input_list_1, length_input_list_2, round(0, 2), @@ -49,8 +49,8 @@ def mod_score(p1, p2): # If either protein contains no HMM annotations, # return a mod score with the worst scores possible return _mod_score_tupler( - p1.uid, - p2.uid, + p1, + p2, length_input_list_1, length_input_list_2, round(100, 2), @@ -81,8 +81,8 @@ def mod_score(p1, p2): mod_score_value = (jaccard_score * 0.5) + mod_levenshtein_score return _mod_score_tupler( - p1.uid, - p2.uid, + p1, + p2, length_input_list_1, length_input_list_2, round(mod_levenshtein_score, 2), diff --git a/socialgene/compare_proteins/hmmer.py b/socialgene/compare_proteins/hmmer.py new file mode 100644 index 00000000..c9cfd8de --- /dev/null +++ b/socialgene/compare_proteins/hmmer.py @@ -0,0 +1,68 @@ +from itertools import combinations, product +from multiprocessing import Pool + +import pandas as pd + +from socialgene.compare_proteins.base import HMMDataFrame +from socialgene.compare_proteins.hmm_scoring import _mod_score_tupler, mod_score + + +def picklable_modscore(p1, p2): + # named tuple doesn't work in multiprocessing + return mod_score(p1, p2)._asdict() + + +class CompareDomains(HMMDataFrame): + def __init__(self): + self.name = "HMMER annotation comparison with SocialGene" + self.score_column = "score" + + def compare_proteins(self, queries, targets): + return pd.DataFrame( + ( + { + "query": i.query.uid, + "target": i.target.uid, + "query_n_domains": i.query_n_domains, + "target_n_domains": i.target_n_domains, + "jaccard": i.jaccard, + "score": i.mod_score, + } + for i in self.compare_many_to_many( + queries, targets, filter_non_hits=True + ) + ), + ).drop_duplicates(subset=["query", "target"]) + + def compare_one_to_one(self, p1, p2): + protein_comparisons = set() + return self.protein_comparisons_df(protein_comparisons) + + def compare_one_to_many(self, p1_obj, p2_obj_list, filter_non_hits=True): + for i in p2_obj_list: + temp = mod_score(p1_obj, i) + if not filter_non_hits or temp.jaccard > 0: + yield temp + + def compare_many_to_many(self, p1_obj_list, p2_obj_list, filter_non_hits=True): + for i1, i2 in product(p1_obj_list, p2_obj_list): + temp = mod_score(i1, i2) + if not filter_non_hits or temp.jaccard > 0: + yield temp + + def compare_all_to_all(self, p1_obj_list, filter_non_hits=True): + for i1, i2 in combinations(p1_obj_list, 2): + temp = mod_score(i1, i2) + if not filter_non_hits or temp.jaccard > 0: + yield temp + + def compare_all_to_all_parallel(self, p1_obj_list, cpus=1, only_hits=True): + # have to use _calculate_mod_score_not_named because named tuple can't pickle "protein_comparison_modscore" + with Pool(cpus) as p: + for i in p.starmap( + picklable_modscore, + combinations(p1_obj_list, 2), + ): + if not only_hits or i["jaccard"] > 0.001: + temp = _mod_score_tupler(**i) + yield temp diff --git a/socialgene/compare_proteins/mmseqs.py b/socialgene/compare_proteins/mmseqs.py new file mode 100644 index 00000000..2cc3c60f --- /dev/null +++ b/socialgene/compare_proteins/mmseqs.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +import logging +import os +import tempfile +from pathlib import Path + +import pandas as pd + +from socialgene.base.molbio import Protein +from socialgene.compare_proteins.base import BlastTab, BlastTab_COLUMNS +from socialgene.utils.logging import log +from socialgene.utils.run_subprocess import run_subprocess + +# mmseqs must be present on PATH + + +class MMseqsEasySearch(BlastTab): + def __init__(self): + super().__init__() + self.temp_db_path = None + self.df = None + self.name = "MMseqs2 EasySearch" + + @staticmethod + def make_db( + fasta_string: str, + db_path=None, + **kwargs, + ): + """Run MMseqs2 createdb on an input fasta file + + Args: + fasta_path (str): Path to fasta file + target_database (str): Path to MMseqs2 database + """ + + command_list = [ + "mmseqs", + "createdb", + "stdin", + str(db_path), + ] + mes = run_subprocess( + command_list=command_list, + input=fasta_string, + text=True, + capture_output=True, + status=False, + **kwargs, + ) + log.debug(mes) + if not Path(db_path).exists(): + raise FileNotFoundError( + f"No database from mmseqs make_db found at {db_path}" + ) + + @staticmethod + def index_database(database_path): + """Run MMseqs2 createindex on an MMseqs2 database + + Args: + target_database (str): Path to MMseqs2 database + """ + with tempfile.TemporaryDirectory() as tmpdirname: + command_list = [ + "mmseqs", + "createindex", + str(database_path), + str(tmpdirname), + ] + mes = run_subprocess( + command_list=command_list, + check=False, + shell=False, + capture_output=True, + status=False, + ) + logging.debug(mes) + if not os.path.exists(f"{database_path}.idx"): + logging.warning("No output from MMseqs2 createindex") + + def run( + self, + fasta_path, + target_database, + cpus=1, + argstring="", + ): + """Search an input fasta file against a target database with external MMseqs2 program + + Args: + fasta_path (str): Path to fasta file + target_database (str): Path to MMseqs2 database + argstring (str): additonal arguments to pass to mmseqs search, as a string + = + Returns: + pandas_df: dataframe of the mmseqs search result + """ + with tempfile.TemporaryDirectory() as tmpdirname: + outpath = os.path.join(tmpdirname, "result.m8") + command_list = [ + "mmseqs", + "easy-search", + str(fasta_path), + str(target_database), + str(outpath), + str(tmpdirname), + "--format-mode", + "0", + "--format-output", + "query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen", + "--threads", + str(cpus), + ] + if argstring: + argstring = [str(i) for i in argstring.split()] + command_list.extend(argstring) + mes = run_subprocess( + command_list=command_list, + check=False, + shell=False, + capture_output=True, + status=False, + ) + logging.debug(mes) + if not os.path.exists(outpath): + logging.warning("No output from MMseqs2 search") + else: + # sorted for reproducibility + return pd.read_csv( + outpath, + sep="\t", + names=BlastTab_COLUMNS, + dtype=BlastTab_COLUMNS, + ).sort_values(["score", "query"], ascending=False) + + def compare_proteins(self, queries, targets, cpus=1, argstring="", index=False): + # loop through protein list and write to temporary fasta file + if isinstance(queries, Protein): + queries = [queries] + if isinstance(targets, Protein): + targets = [targets] + if not all([isinstance(i, Protein) for i in queries]): + raise TypeError("queries must be a list of Protein objects") + if not all([isinstance(i, Protein) for i in targets]): + raise TypeError("targets must be a list of Protein objects") + + with tempfile.TemporaryDirectory() as tmpdirname: + query_fasta_path = Path(tmpdirname, "queries.faa") + target_dmnd_path = Path(tmpdirname, "targets") + with open(query_fasta_path, "w") as h: + for protein in queries: + h.write(protein.fasta_string_defline_uid) + self.make_db( + fasta_string="".join([i.fasta_string_defline_uid for i in targets]), + db_path=target_dmnd_path, + ) + if index: + self.index_database(target_dmnd_path) + return self.run( + fasta_path=query_fasta_path, + target_database=target_dmnd_path, + cpus=cpus, + argstring=argstring, + ) diff --git a/socialgene/compare_proteins/mmseqs_WIP.py b/socialgene/compare_proteins/mmseqs_WIP.py deleted file mode 100644 index 42ff592c..00000000 --- a/socialgene/compare_proteins/mmseqs_WIP.py +++ /dev/null @@ -1,98 +0,0 @@ -# flake8: noqa -from itertools import combinations, product -from multiprocessing import Pool -from pathlib import Path - -import pandas as pd - -from socialgene.compare_proteins.base_class import CompareProteinsBaseClass -from socialgene.mmseqs.create_database import create_database -from socialgene.mmseqs.search import search -from socialgene.mmseqs.subset_database import createsubdb - -target_proteins = list(sg.proteins.values()) - -# with tempfile.TemporaryDirectory() as tmpdirname: - -tmpdirname = "/home/chase/Downloads/work" -fasta_path = Path(tmpdirname, "target_fasta.fa") -mmseqs_path = "/home/chase/Downloads/work/mmseqsdb" - -with open(fasta_path, "w") as handle: - handle.writelines( - (i.fasta_string_defline_uid for i in target_proteins if i.sequence) - ) - - -create_database(fasta_path, mmseqs_path) - -mmseqs_lookup = "/home/chase/Downloads/work/mmseqsdb.lookup" -mmseqs_output_subset_ids = "/home/chase/Downloads/work/subset_ids" -mmseqs_subset_db = "/home/chase/Downloads/work/subset_db" - -# create a subdb using protein hash ids -protids = [i.uid for i in target_proteins][0:4] -with open(mmseqs_lookup, "r") as mml: - with open(mmseqs_output_subset_ids, "w") as h: - for i in mml: - temp = i.split("\t") - if temp[1] in protids: - z = "\t" - h.writelines(f"{i.split(z)[0]}\n") - -createsubdb(olddb=mmseqs_path, newdb=mmseqs_subset_db, idfile=mmseqs_output_subset_ids) - - -a = search(fasta_path, mmseqs_subset_db) - - -class CompareDiamond(CompareProteinsBaseClass): - def __init__(self): - self.protein_comparisons = set() - - @property - def mod_score_df(self): - return ( - pd.DataFrame(self.protein_comparisons) - .sort_values(by=["mod_score"], ascending=False) - .reset_index(inplace=False, drop=True) - ) - - @property - def df(self): - return self.mod_score_df.filter(["query", "target", "mod_score"]).rename( - columns={"mod_score": "score"} - ) - - def compare_one_to_one(self, p1, p2): - return self.calculate_mod_score(p1, p2) - - def compare_one_to_many(self, p1_obj, p2_obj_list): - for i in p2_obj_list: - temp = self.calculate_mod_score(p1_obj, i) - if temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_many_to_many(self, p1_obj_list, p2_obj_list): - for i1, i2 in product(p1_obj_list, p2_obj_list): - temp = self.calculate_mod_score(i1, i2) - if temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_all_to_all(self, p1_obj_list): - for i1, i2 in combinations(p1_obj_list, 2): - temp = self.calculate_mod_score(i1, i2) - if temp.jaccard > 0: - temp - self.protein_comparisons.add(temp) - - def compare_all_to_all_parallel(self, p1_obj_list, cpus=1): - # have to use _calculate_mod_score_not_named because named tuple can't pickle "protein_comparison_modscore" - with Pool(cpus) as p: - for i in p.starmap( - _calculate_mod_score_not_named, - combinations(p1_obj_list, 2), - ): - # i[5] == jaccard - if i[5] > 0.001: - self.protein_comparisons.add(_create_tuple(*i)) diff --git a/socialgene/hmm/hmmer.py b/socialgene/hmm/hmmer.py index 2e402a58..2e8d784b 100644 --- a/socialgene/hmm/hmmer.py +++ b/socialgene/hmm/hmmer.py @@ -20,7 +20,7 @@ class HMMER: def __init__(self, hmm_filepath=None): self._check_hmmer_exists() - self.input_path = hmm_filepath + self.input_path = Path(hmm_filepath) self.decompressed_hmm_path = None @staticmethod diff --git a/socialgene/mmseqs/__init__.py b/socialgene/mmseqs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/socialgene/mmseqs/create_database.py b/socialgene/mmseqs/create_database.py deleted file mode 100644 index 89032fc7..00000000 --- a/socialgene/mmseqs/create_database.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 createdb on an input fasta file" -) - -parser.add_argument( - "input_fasta_path", - help="Path to fasta file", -) - -parser.add_argument( - "target_database_path", - help="Path to fasta file", -) - - -def create_database(fasta_path, database_path): - """Run MMseqs2 createdb on an input fasta file - - Args: - fasta_path (str): Path to fasta file - target_database (str): Path to MMseqs2 database - """ - - command_list = [ - "mmseqs", - "createdb", - str(fasta_path), - str(database_path), - ] - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.debug(mes) - if not os.path.exists(database_path): - logging.warning("No output from MMseqs2 create_database") - - -def main(): - args = parser.parse_args() - - fasta_path = args.input_fasta_path - target_database = args.target_database_path - - if not os.path.exists(fasta_path): - raise FileExistsError(fasta_path) - - create_database(fasta_path, target_database) - - -if __name__ == "__main__": - main() diff --git a/socialgene/mmseqs/index_database.py b/socialgene/mmseqs/index_database.py deleted file mode 100644 index 39a745c0..00000000 --- a/socialgene/mmseqs/index_database.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os -import tempfile - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 createindex on an MMseqs2 database" -) - -parser.add_argument( - "target_database_path", - help="Path to fasta file", -) - - -def index_database(database_path): - """Run MMseqs2 createindex on an MMseqs2 database - - Args: - target_database (str): Path to MMseqs2 database - """ - with tempfile.TemporaryDirectory() as tmpdirname: - command_list = ["mmseqs", "createindex", database_path, tmpdirname] - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.debug(mes) - if not os.path.exists(f"{database_path}.idx"): - logging.warning("No output from MMseqs2 createindex") - - -def main(): - args = parser.parse_args() - target_database = args.target_database_path - - if not os.path.exists(target_database): - raise FileExistsError(target_database) - - index_database(target_database) - - -if __name__ == "__main__": - main() diff --git a/socialgene/mmseqs/search.py b/socialgene/mmseqs/search.py deleted file mode 100644 index ea3bd8a9..00000000 --- a/socialgene/mmseqs/search.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os -import tempfile - -import pandas as pd - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 search on an input amino acid sequence" -) - -parser.add_argument( - "input_fasta_path", - help="Path to fasta file", -) - -parser.add_argument( - "target_database_path", - help="Path to fasta file", -) - -MMSEQS_OUT_COLUMNS = { - "query": str, - "target": str, - "pident": "Float32", - "alnlen": "Int32", - "mismatch": "Int32", - "gapopen": "Int32", - "qstart": "Int32", - "qend": "Int32", - "tstart": "Int32", - "tend": "Int32", - "evalue": "Float32", - "bits": "Int32", -} - - -def search(fasta_path, target_database, argstring=""): - """Search an input fasta file against a target database with external MMseqs2 program - - Args: - fasta_path (str): Path to fasta file - target_database (str): Path to MMseqs2 database - argstring (str): additonal arguments to pass to mmseqs search, as a string - = - Returns: - pandas_df: dataframe of the mmseqs search result - """ - with tempfile.TemporaryDirectory() as tmpdirname: - outpath = os.path.join(tmpdirname, "result.m8") - command_list = [ - "mmseqs", - "easy-search", - str(fasta_path), - str(target_database), - str(outpath), - str(tmpdirname), - "--format-mode", - "0", - ] - if argstring: - argstring = [str(i) for i in argstring.split()] - command_list.extend(argstring) - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.info(mes) - if not os.path.exists(outpath): - logging.warning("No output from MMseqs2 search") - else: - return ( - pd.read_csv( - outpath, - sep="\t", - names=MMSEQS_OUT_COLUMNS, - dtype=MMSEQS_OUT_COLUMNS, - ) - .sort_values(["pident", "bits"], ascending=False) - .reset_index(drop=True) - ) - - -def main(): - args = parser.parse_args() - - fasta_path = args.input_fasta_path - target_database = args.target_database_path - - if not os.path.exists(fasta_path): - raise FileExistsError(fasta_path) - if not os.path.exists(target_database): - raise FileExistsError(target_database) - - print(search(fasta_path, target_database)) - - -if __name__ == "__main__": - main() diff --git a/socialgene/mmseqs/subset_database.py b/socialgene/mmseqs/subset_database.py deleted file mode 100644 index 33c56546..00000000 --- a/socialgene/mmseqs/subset_database.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 subsetdb on an input fasta file" -) - -parser.add_argument( - "idfile", - help="Path to idfile file", -) - -parser.add_argument( - "olddb", - help="Path to olddb", -) - -parser.add_argument( - "newdb", - help="Path to newdb", -) - - -def createsubdb(olddb, newdb, idfile): - command_list = [ - "mmseqs", - "createsubdb", - str(idfile), - str(olddb), - str(newdb), - ] - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.debug(mes) - if not os.path.exists(newdb): - logging.warning("No output from MMseqs2 createsubdb") - - -def main(): - args = parser.parse_args() - createsubdb(args.olddb, args.newdb, args.idfile) - - -if __name__ == "__main__": - main() diff --git a/socialgene/parsers/fasta.py b/socialgene/parsers/fasta.py index d0f24730..b27da477 100644 --- a/socialgene/parsers/fasta.py +++ b/socialgene/parsers/fasta.py @@ -77,7 +77,7 @@ def parse_fasta_file( _open = fh.open_read(input) input_from = input with _open as handle: - self.add_assembly(uid=assembly_id) + self.add_assembly(uid=assembly_id, parent=self) self.assemblies[assembly_id].add_locus(external_id=assembly_id) record_counter = 0 for seq_record in SeqIO.parse(handle, "fasta"): diff --git a/socialgene/parsers/genbank.py b/socialgene/parsers/genbank.py index 0d301ebe..ed4a5436 100644 --- a/socialgene/parsers/genbank.py +++ b/socialgene/parsers/genbank.py @@ -51,7 +51,7 @@ def _add_assembly(self, input_path, seq_record): if not assembly_id: # use random unique id as assembly id assembly_id = str(uuid4()) - self.add_assembly(uid=assembly_id) + self.add_assembly(uid=assembly_id, parent=self) return assembly_id def _add_locus(self, seq_record, assembly_id): diff --git a/socialgene/parsers/hmmmodel.py b/socialgene/parsers/hmmmodel.py index cf70f315..da8dc6cb 100644 --- a/socialgene/parsers/hmmmodel.py +++ b/socialgene/parsers/hmmmodel.py @@ -342,7 +342,7 @@ def read(self, filepath, base_dir=None): for i, model in enumerate( self.read_model_generator(filepath=filepath, base_dir=base_dir) ): - log.debug(f"Reading model {str(i+1)} from {filepath}") + log.debug(f"Reading model {str(i + 1)} from {filepath}") model._n = self.dict_key_index self.models[self.dict_key_index] = model self.dict_key_index += 1 @@ -366,7 +366,6 @@ def read_model_generator(self, filepath, base_dir=None): # switch to model addition _add_line_contents = self.temp_model.add_model if line.startswith("//"): - log.warning(Path(base_dir).name) # yield model self.temp_model.add_model_hash() self.temp_model.find_pfam_accessions() diff --git a/socialgene/scoring/__init__.py b/socialgene/scoring/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/socialgene/search/base.py b/socialgene/search/base.py index f505f4b7..f6fe5019 100644 --- a/socialgene/search/base.py +++ b/socialgene/search/base.py @@ -12,11 +12,11 @@ TimeElapsedColumn, ) from rich.table import Table -from textdistance import levenshtein +from textdistance import jaccard, levenshtein from socialgene.base.socialgene import SocialGene from socialgene.clustermap.serialize import SerializeToClustermap -from socialgene.compare_proteins.hmm.hmmer import CompareDomains +from socialgene.compare_gene_clusters.compare_gene_clusters import BGCComparison from socialgene.neo4j.neo4j import GraphDriver from socialgene.utils.logging import log @@ -63,7 +63,7 @@ class SearchBase(ABC): break_bgc_on_gap_of (int): Number of base pairs to break a BGC on if a gap is greater than this value. sg_object (SocialGene): SocialGene object containing the database. raw_search_results_df (pd.DataFrame): Initial search results as a pandas DataFrame. - bgc_order (pd.DataFrame): Order of the BGCs in the search results as a pandas DataFrame. + sorted_bgcs (pd.DataFrame): Order of the BGCs in the search results as a pandas DataFrame. link_df (pd.DataFrame): Linkage information between BGCs in the search results as a pandas DataFrame. group_df (pd.DataFrame): Grouping information for BGCs in the search results as a pandas DataFrame. n_searched_proteins (int): Number of proteins searched in the database. @@ -100,7 +100,7 @@ def __init__( self.break_bgc_on_gap_of = break_bgc_on_gap_of self.sg_object = SocialGene() self.raw_search_results_df = pd.DataFrame() - self.bgc_order = None + self.sorted_bgcs = None self.link_df = pd.DataFrame() self.group_df = pd.DataFrame() self.n_searched_proteins = None @@ -149,21 +149,28 @@ def _modify_input_bgc_name(self): ].uid = self.modified_input_bgc_name self.input_assembly = self.sg_object.assemblies[self.modified_input_bgc_name] + def create_input_bgcs(self): + for locus in self.input_assembly.loci.values(): + locus.add_bgcs_by_feature(features=locus.features) + self.input_bgc = list(self.input_assembly.gene_clusters)[0] + def _input_bgc_info(self): log.info( - f"Input BGC has {len(self.sg_object.proteins)} proteins and/or psuedogenes" + f"Input BGC has {len(self.sg_object.proteins)} proteins and/or pseudogenes" ) def read_sg_object(self, sg_object: SocialGene): self.sg_object = sg_object self._modify_input_bgc_name() self._input_bgc_info() + self.create_input_bgcs() def read_input_bgc(self, gbk_path: str): # parse input BGC self.sg_object.parse(gbk_path) self._modify_input_bgc_name() self._input_bgc_info() + self.create_input_bgcs() def filter(self, drop_raw_search_results_df=False): if self.raw_search_results_df.empty: @@ -185,9 +192,9 @@ def filter(self, drop_raw_search_results_df=False): if self.working_search_results_df.empty: raise ValueError("No hits found after filtering at the assembly level") - def cluster(self): - # assign clusters of proteins that aren't interrupted by a gap greater than break_bgc_on_gap_of - self._label_clusters() + # def cluster(self): + # # assign clusters of proteins that aren't interrupted by a gap greater than break_bgc_on_gap_of + # self._label_clusters() def _primary_bgc_regions(self): return self._collapse_cluster( @@ -205,27 +212,49 @@ def annotate(self): def _rank_order_bgcs(self): # TODO: should be based off of create_links() output not bgc_df """Sorts assembly IDs by comparing the levenshtein distance of ordered query proteins (forward and reverse)""" - log.info("Start: Ranking BGCs") - - input_bgc_nuc_id = list(self.input_assembly.loci.keys())[0] - input_protein_order = [ - i.uid - for i in self.input_assembly.loci[ - input_bgc_nuc_id - ].features_sorted_by_midpoint - ] - temp_dict = {} - for i in self.bgc_df["assembly_uid"].unique(): - temp_list = self.bgc_df[self.bgc_df["assembly_uid"] == i]["query"].to_list() - forward = levenshtein(input_protein_order, temp_list) - temp_list.reverse() - reverse = levenshtein(input_protein_order, temp_list) - temp_dict[i] = min(forward, reverse) - self.bgc_order = [self.modified_input_bgc_name] - self.bgc_order.extend( - [k for k, v in sorted(temp_dict.items(), key=lambda item: item[1])] + + def hl(group): + a = levenshtein( + list(group.sort_values(["t_start"])["query"]), + list(group.sort_values(["q_start"], ascending=True)["query"]), + ) / len(group) + b = levenshtein( + list(group.sort_values(["t_start"])["query"]), + list(group.sort_values(["q_start"], ascending=False)["query"]), + ) / len(group) + lev = a if a < b else b + lev = 1 - lev + jac = jaccard( + list(group.sort_values(["t_start"])["query"]), + list(group.sort_values(["q_start"], ascending=False)["query"]), + ) + return (jac * 0.5) + lev + + q_obj = pd.DataFrame( + [ + { + "query_feature": i, + "q_start": i.start, + "strand": i.strand, + } + for i in list(self.input_bgc.features_sorted_by_midpoint) + ] ) - log.info("Finish: Ranking BGCs") + temp = { + k[1]: hl( + pd.merge( + q_obj, + group, + on="query_feature", + how="outer", + suffixes=("", "_delme"), + ) + ) + for k, group in self.link_df.groupby( + ["query_cluster", "target_cluster"], sort=False + ) + } + self.sorted_bgcs = sorted(temp, key=temp.get, reverse=True) def _bgc_regions_to_sg_object(self, collapsed_df): now = time.time() @@ -281,7 +310,9 @@ def _prune_links(self, df: pd.DataFrame) -> pd.DataFrame: df_to_return = pd.concat([df_to_return, temp]) return df_to_return - def _create_link_df(self, query_gene_cluster, target_gene_cluster): + def _create_link_df( + self, query_gene_cluster, target_gene_cluster, tool="blastp", **kwargs + ): """read the args, things are backward b/c context is searching the found bgc against the input bgc Args: @@ -291,38 +322,52 @@ def _create_link_df(self, query_gene_cluster, target_gene_cluster): Returns: _type_: _description_ """ - compare_domains = CompareDomains() - compare_domains.compare_many_to_many( - (self.sg_object.proteins[gc.uid] for gc in query_gene_cluster.features), - (self.sg_object.proteins[gc.uid] for gc in target_gene_cluster.features), + + comparator = BGCComparison( + tool=tool, ) - df_to_return = compare_domains.df[compare_domains.df["score"] > 0.75] + protein_comparisons_df = comparator.compare( + query_gene_cluster.protein_iter, target_gene_cluster.protein_iter, **kwargs + ) + protein_comparisons_df["query_protein"] = protein_comparisons_df["query"].apply( + lambda x: query_gene_cluster.proteins[x] + ) + protein_comparisons_df["target_protein"] = protein_comparisons_df[ + "target" + ].apply(lambda x: target_gene_cluster.proteins[x]) + # protein_comparisons_df.drop(columns=["query", "target"], inplace=True) q_obj = pd.DataFrame( [ - {"obj": i, "query": i.uid, "q_start": i.start, "strand": i.strand} + { + "query_cluster": query_gene_cluster, + "query_feature": i, + "query_protein": i.protein, + "q_start": i.start, + "strand": i.strand, + } for i in list(query_gene_cluster.features_sorted_by_midpoint) ] ) t_obj = pd.DataFrame( [ - {"obj": i, "target": i.uid, "t_start": i.start, "strand": i.strand} + { + "target_cluster": target_gene_cluster, + "target_feature": i, + "target_protein": i.protein, + "t_start": i.start, + "strand": i.strand, + } for i in list(target_gene_cluster.features_sorted_by_midpoint) ] ) - q_temp = pd.merge(df_to_return, q_obj, on="query", how="left") - temp = pd.merge(q_temp, t_obj, on="target", how="left").sort_values("t_start") - temp = temp[["obj_x", "obj_y", "score"]] - temp = temp.rename( - columns={ - "obj_x": "query", - "obj_y": "target", - } + q_temp = pd.merge(protein_comparisons_df, q_obj, on="query_protein", how="left") + return ( + pd.merge(q_temp, t_obj, on="target_protein", how="left") + .sort_values("t_start") + .rename(columns={comparator.tool.score_column: "score"}, inplace=False) ) - return temp - def _create_links( - self, - ) -> pd.DataFrame: + def _create_links(self, tool="hmmer", cutoff=None, **kwargs) -> pd.DataFrame: """ Loop through gene_cluster compare the proteins to the input BGC @@ -338,19 +383,43 @@ def _create_links( """ # Initialize an empty DataFrame to store the links log.info("Start: Creating links") - link_df = pd.DataFrame({"query": [], "target": [], "score": []}) - for k_assembly, v_assembly in self.sg_object.assemblies.items(): - for k_locus, v_locus in v_assembly.loci.items(): - for gene_cluster in v_locus.gene_clusters: + link_df = pd.DataFrame() + progress_bar = Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TextColumn("• Time elapsed "), + TimeElapsedColumn(), + transient=True, + ) + log.info( + f"Finding reciprocal best hits; protein similarity via {BGCComparison(tool=tool).tool.name}" + ) + with progress_bar as pg: + task = pg.add_task( + description="a", + total=len(list(self.sg_object.get_all_gene_clusters())), + ) + for target_gene_cluster in self.sg_object.get_all_gene_clusters(): + if target_gene_cluster != self.input_bgc: + pg.update( + task, + description=( + f"\nComparing {self.input_bgc_id} to {target_gene_cluster.parent.external_id}:{min([i.start for i in target_gene_cluster.features])}-{max([i.start for i in target_gene_cluster.features])}" + ), + ) link_df = pd.concat( [ link_df, self._create_link_df( - gene_cluster, - self.input_assembly.loci[self.input_bgc_id], + query_gene_cluster=self.input_bgc, + target_gene_cluster=target_gene_cluster, + tool=tool, ), ] ) + + pg.update(task, advance=1, description="[blue]Complete Task") self.link_df = link_df log.info(f"Finish: Creating links; {len(link_df)} links produced") if self.link_df.empty: @@ -364,6 +433,9 @@ def _choose_group( log.info("Finish: Assigning target BGC proteins to input BGC groups") log.warning("No links to group by, no groups produced") return + # df = self.link_df + # df["query"] = df["query"].apply(lambda x: x.uid) + # df["target"] = df["target"].apply(lambda x: x.uid) self.group_df = ( self.link_df.sort_values(by="score", ascending=False) .groupby(["query"], observed=False) @@ -446,13 +518,13 @@ def _sort_genes_by_start(self): def label_clusters( self, ): - """Walks through the df and for each nucleotide sequence, labels proteins within gap tolerance + """Walks through the df and for each nucleotide sequence, assigns genes/protein to a cluster, breaking on gap tolerance Args: break_bgc_on_gap_of (int, optional): Breaks a "cluster" when the diff of two proteins' starts is greater than this. Defaults to 20000. Returns: - pd.DataFrame: input df + cluster column (integers) + pd.DataFrame: input df + cluster column (integer) """ log.info( f"Grouping protein hits if less than {str(self.break_bgc_on_gap_of)} bp apart" @@ -507,7 +579,7 @@ def _filter_clusters(self, df, threshold) -> pd.Index: def write_clustermap_json(self, outpath): cmap = SerializeToClustermap( - bgc_order=self.bgc_order, sg_object=self.sg_object, link_df=self.link_df + sorted_bgcs=self.sorted_bgcs, sg_object=self.sg_object, link_df=self.link_df ) cmap.write(outpath=outpath) @@ -537,7 +609,7 @@ def user_friendly_hit_df(self, truncate_description_to_n_chars=20): "q_description": i.description, "q_external_id": i.external_id, } - for i in self.input_assembly.loci[self.input_bgc_id].features + for i in self.input_bgc.features ] ) db_res = pd.merge(db_res, query_df, on="query", how="left") diff --git a/socialgene/search/hmmer.py b/socialgene/search/hmmer.py index ee64a3c0..69bffedc 100644 --- a/socialgene/search/hmmer.py +++ b/socialgene/search/hmmer.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import re from math import ceil from pathlib import Path @@ -18,6 +19,7 @@ from rich.table import Table from socialgene.base.socialgene import SocialGene +from socialgene.compare_proteins.hmmer import CompareDomains from socialgene.config import env_vars from socialgene.neo4j.neo4j import GraphDriver from socialgene.search.base import SearchBase @@ -33,11 +35,51 @@ ) -async def _find_similar_protein( +def _find_similar_proteins( domain_list, frac: float = 0.75, only_culture_collection: bool = False ): """ - The function `_find_sim_protein` is an asynchronous function that queries a Neo4j graph database to + The function `_find_similar_proteins` is a synchronous function that queries a Neo4j graph database to + find similar proteins based on protein domain information. + + Args: + protein_domain_dict (Dict[List[str]]): The `protein_domain_dict` parameter is a dictionary where + the keys are protein uids and the values are lists of domain uids. + frac (float): The `frac` parameter is a float value that represents the fraction of protein + domains that need to match in order for a protein to be considered similar. By default, it is set to + 0.75, meaning that at least 75% of the protein domains need to match + + Returns: + The function `_find_sim_protein` returns a Pandas DataFrame containing the results of the query + executed in the Neo4j database. The DataFrame has columns `assembly_uid`, `nucleotide_uid`, + `target`, `n_start`, and `n_end` + """ + # TODO: move async driver to reg driver class module + # with GraphDriver() as driver: + with GraphDriver().driver.session() as driver: + res = driver.run( + f""" + WITH $domain_list AS input_protein_domains + MATCH (prot1:protein)<-[a1:ANNOTATES]-(h0:hmm) + WHERE h0.uid IN input_protein_domains + WITH input_protein_domains, prot1, count(DISTINCT(h0)) as initial_count + WHERE initial_count > size(input_protein_domains) * $frac + MATCH (n1:nucleotide)-[e1:ENCODES]->(prot1) + {'WHERE (n1)-[:ASSEMBLES_TO]->(:assembly)-[:FOUND_IN]->(:culture_collection)' if only_culture_collection else ''} + MATCH (a1:assembly)<-[:ASSEMBLES_TO]-(n1) + RETURN a1.uid as assembly_uid, n1.uid as nucleotide_uid, prot1.uid as target, e1.start as n_start, e1.end as n_end + """, + domain_list=list(domain_list), + frac=frac, + ).to_df() + return res + + +async def _find_similar_proteins_async( + domain_list, frac: float = 0.75, only_culture_collection: bool = False +): + """ + The function `_find_similar_proteins_async` is an asynchronous function that queries a Neo4j graph database to find similar proteins based on protein domain information. Args: @@ -68,7 +110,7 @@ async def _find_similar_protein( WITH input_protein_domains, prot1, count(DISTINCT(h0)) as initial_count WHERE initial_count > size(input_protein_domains) * $frac MATCH (n1:nucleotide)-[e1:ENCODES]->(prot1) - {'WHERE (n1)-[:ASSEMBLES_TO]->(:assembly)-[:FOUND_IN]->(:culture_collection)' if only_culture_collection else '' } + {'WHERE (n1)-[:ASSEMBLES_TO]->(:assembly)-[:FOUND_IN]->(:culture_collection)' if only_culture_collection else ''} MATCH (a1:assembly)<-[:ASSEMBLES_TO]-(n1) RETURN a1.uid as assembly_uid, n1.uid as nucleotide_uid, prot1.uid as target, e1.start as n_start, e1.end as n_end """, @@ -82,8 +124,10 @@ async def _find_similar_protein( sema = asyncio.BoundedSemaphore(5) -async def _find_similar_protein_multiple( - dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False +async def _find_similar_proteins_async_multiple( + dict_of_domain_lists, + frac: float = 0.75, + only_culture_collection: bool = False, ): # create task group # TODO: if webserver in future this could be used to control max time of search @@ -92,7 +136,7 @@ async def _find_similar_protein_multiple( # create and issue tasks tasks = { k: group.create_task( - _find_similar_protein( + _find_similar_proteins_async( domain_list=v, frac=frac, only_culture_collection=only_culture_collection, @@ -106,11 +150,47 @@ async def _find_similar_protein_multiple( return pd.concat([v.result().assign(query=k) for k, v in tasks.items()]) +def _find_similar_proteins_sync_multiple( + dict_of_domain_lists, + frac: float = 0.75, + only_culture_collection: bool = False, +): + results = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=20, + ) as executor: + futures = {} + for k, v in dict_of_domain_lists.items(): + future = executor.submit( + _find_similar_proteins, + domain_list=v, + frac=frac, + only_culture_collection=only_culture_collection, + ) + futures[future] = k + for f in concurrent.futures.as_completed(futures): + result = f.result() + result["query"] = futures[f] + results.append(result) + return pd.concat(results) + + +def run_search( + dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False +): + for k, v in dict_of_domain_lists.items(): + _find_similar_proteins( + domain_list=v, + frac=frac, + only_culture_collection=only_culture_collection, + ) + + def run_async_search( dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False ): return asyncio.run( - _find_similar_protein_multiple( + _find_similar_proteins_async_multiple( dict_of_domain_lists=dict_of_domain_lists, frac=frac, only_culture_collection=only_culture_collection, @@ -118,7 +198,17 @@ def run_async_search( ) -class SearchDomains(SearchBase): +def run_sync_search( + dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False +): + return _find_similar_proteins_sync_multiple( + dict_of_domain_lists=dict_of_domain_lists, + frac=frac, + only_culture_collection=only_culture_collection, + ) + + +class SearchDomains(SearchBase, CompareDomains): """ Class search for similar BGCs in a SocialGene database, using domains Args: @@ -158,7 +248,7 @@ def __init__( self._set_hmm_outdegree() self._get_outdegree_per_hmm_per_protein() - def search(self, **kwargs): + def search(self, run_async=True, **kwargs): dict_of_domain_lists = ( self.outdegree_df.groupby("protein_uid", observed=True)["hmm_uid"] .apply(list) @@ -172,13 +262,22 @@ def search(self, **kwargs): ) as progress: task = progress.add_task("Progress...", total=2) progress.update(task, advance=1) - self.raw_search_results_df = run_async_search( - dict_of_domain_lists, **kwargs - ) - self.raw_search_results_df["query"] = self.raw_search_results_df[ - "query" - ].astype("category") - progress.update(task, advance=1) + if run_async: + self.raw_search_results_df = run_async_search( + dict_of_domain_lists, **kwargs + ) + self.raw_search_results_df["query"] = self.raw_search_results_df[ + "query" + ].astype("category") + progress.update(task, advance=1) + else: + self.raw_search_results_df = run_sync_search( + dict_of_domain_lists, **kwargs + ) + self.raw_search_results_df["query"] = self.raw_search_results_df[ + "query" + ].astype("category") + progress.update(task, advance=1) log.info( f"Initial search returned {len(self.raw_search_results_df):,} proteins, found in {self.raw_search_results_df.assembly_uid.nunique():,} genomes" ) @@ -270,8 +369,8 @@ def prioritize_input_proteins( max_domains_per_protein: int = None, max_outdegree: int = None, scatter: bool = False, - bypass_locus: List[str] = None, - bypass_pid: List[str] = None, + locus_tag_bypass_list: List[str] = None, + protein_id_bypass_list: List[str] = None, ): """Rank input proteins by how many (:hmm)-[:ANNOTATES]->(:protein) relationships will have to be traversed @@ -281,27 +380,27 @@ def prioritize_input_proteins( max_domains_per_protein (int): Max domains to retain for each individual protein (highest outdegree dropped first) max_outdegree (int): HMM model annotations with an outdegree higher than this will be dropped scatter (bool, optional): Choose a random subset of proteins to search that are spread across the length of the input BGC. Defaults to False. - bypass_locus (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None. - bypass_eid (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. + locus_tag_bypass_list (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None. + protein_id_bypass_list (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. Returns: pd.DataFrame: pd.DataFrame({"external_id":[], "hmm_uid":[], "outdegree":[]}) """ log.info("Prioritizing input proteins by outdegree") loci_protein_ids = set() - if bypass_locus: + if locus_tag_bypass_list: # bypass using locus tags loci_protein_ids = { i.uid - for i in self.input_assembly.loci[self.input_bgc_id].features - if i.locus_tag in list(bypass_locus) + for i in self.input_bgc.features + if i.locus_tag in list(locus_tag_bypass_list) } - if bypass_pid: + if protein_id_bypass_list: # bypass using an external protein ids loci_protein_ids.update( { i.uid - for i in self.input_assembly.loci[self.input_bgc_id].features - if i.external_id in list(bypass_pid) + for i in self.input_bgc.features + if i.external_id in list(protein_id_bypass_list) } ) len_start = self.outdegree_df["outdegree"].sum() @@ -357,6 +456,7 @@ def prioritize_input_proteins( threshold = ceil(n_input_proteins * max_query_proteins) else: threshold = max_query_proteins + m_start = self.outdegree_df["outdegree"].sum() if scatter: temp = list( self.input_assembly.loci[ @@ -372,7 +472,6 @@ def prioritize_input_proteins( | self.outdegree_df["protein_uid"].isin(loci_protein_ids) ] else: - m_start = self.outdegree_df["outdegree"].sum() log.info( f"'max_query_proteins' is set to {threshold}, will limit search to {threshold} of {n_input_proteins} input proteins" ) @@ -449,9 +548,9 @@ def _outdegree_table_stats(self): d = [ { "protein_uid": i.uid, - "desc": f"{i.external_id} | {i.locus_tag if i.locus_tag else 'no-locus-tag'} | {i.description}", + "desc": f"{i.external_id} | {i.locus_tag if i.locus_tag else 'no-locus-tag'} | {i.description}", } - for i in self.input_assembly.loci[self.input_bgc_id].features + for i in self.input_bgc.features ] d = pd.DataFrame(d) temp = pd.merge(temp, d, on="protein_uid", how="left") diff --git a/socialgene/utils/logging.py b/socialgene/utils/logging.py index d5006066..c73840f3 100644 --- a/socialgene/utils/logging.py +++ b/socialgene/utils/logging.py @@ -1,8 +1,6 @@ import logging import sys -from socialgene.config import env_vars - # If {rich} is installed use it, otherwise.... don't try: from rich.console import Console @@ -11,7 +9,7 @@ c = Console(width=150) # https://rich.readthedocs.io/en/stable/logging.html logging.basicConfig( - level="NOTSET", + level=logging.INFO, # format="%(filename)s/%(module)s/%(funcName)s\::: %(message)s", format="%(message)s", datefmt="%Y-%m-%d %H:%M:%S", @@ -25,7 +23,7 @@ except ImportError: log = logging.getLogger(__name__) -log.setLevel(env_vars["SOCIALGENE_LOGLEVEL"]) +# log.setLevel(env_vars["SOCIALGENE_LOGLEVEL"]) # handler = logging.StreamHandler(stream=sys.stdout) # log.addHandler(handler) diff --git a/socialgene/utils/ncbi_ftp.py b/socialgene/utils/ncbi_ftp.py index 56f66d67..70487619 100644 --- a/socialgene/utils/ncbi_ftp.py +++ b/socialgene/utils/ncbi_ftp.py @@ -9,7 +9,7 @@ class NcbiFtp: def __init__(self): log.debug( - f'Connected to "{self.ftp.host}" on {self.ftp.port};\n\n NCBI FTP info: {self.ftp.welcome.replace("220","")}' + f'Connected to "{self.ftp.host}" on {self.ftp.port};\n\n NCBI FTP info: {self.ftp.welcome.replace("220", "")}' ) self.ftp.set_pasv(True) self.assembly_paths = [] diff --git a/socialgene/utils/run_subprocess.py b/socialgene/utils/run_subprocess.py index 06f43566..55b90de6 100644 --- a/socialgene/utils/run_subprocess.py +++ b/socialgene/utils/run_subprocess.py @@ -12,6 +12,7 @@ def run_subprocess( check=True, shell=False, capture_output=True, + status=False, **kwargs, ): """Run something in a separate process @@ -31,11 +32,21 @@ def run_subprocess( command_list_string = " ".join(command_list) else: command_list_string = command_list - log.info(f"Executing external program:\n{command_list_string}") - with console.status( - "", - spinner="bouncingBar", - ) as status: + if status: + log.info(f"Executing external program:\n{command_list_string}") + with console.status( + "", + spinner="bouncingBar", + ) as status: + result = subprocess.run( + command_list, + check=check, + shell=shell, + capture_output=capture_output, + **kwargs, + ) + else: + log.debug(f"Executing external program:\n{command_list_string}") result = subprocess.run( command_list, check=check, @@ -43,8 +54,7 @@ def run_subprocess( capture_output=capture_output, **kwargs, ) - if result.stderr: - log.error(f"Error code: {result.returncode}") - log.error(f"Error stdout: {result.stderr.decode('utf-8')}") - raise SystemExit - _ = status + if result.returncode != 0: + log.error(f"Error code: {result.returncode}") + log.error(f"Error stdout: {result.stderr.decode('utf-8')}") + raise SystemExit diff --git a/tests/python/classes/test_mibig_fasta_parser.py b/tests/python/classes/test_mibig_fasta_parser.py index 7b35d7d0..ca2fbbce 100644 --- a/tests/python/classes/test_mibig_fasta_parser.py +++ b/tests/python/classes/test_mibig_fasta_parser.py @@ -17,124 +17,146 @@ def test_fasta_file_parse(): sg_object = SocialGene() gbk_path = os.path.join(FIXTURE_DIR, "lagriamide_mibig_bgc0001946.gbk") sg_object.parse(gbk_path) - sg_object.write_fasta(outpath=fp.name) + sg_object.write_fasta(outpath=fp.name, external_id=True) fasta_object = SocialGene() fasta_object.parse(fp.name) protein_parse_results = { - k: [v.description, v.external_id, v.domains] + k: [v.description, v.external_id, v.domains, v.sequence] for k, v in fasta_object.proteins.items() } assert protein_parse_results == { "Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl": [ - "Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl", - "Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl", + "AXA20086.1", + "AXA20086.1", set(), + "MQEYCRLRRKLTLELSPEDAADVAQEAFERTLRYMRKHDGRVASPVGLLVRIALNLQIDRGRRRKHLPTALDESWEHPRWDITPEDEVVGRQSVTQLVETLDKLAPRRREAFVLCRLHGLTYQDAAKKMGIRPSVVREYLVDAVRACRDSVDWAVSRVKCNTPLVNGLELSRSG", ], "ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn": [ - "ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn", - "ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn", + "AXA20087.1", + "AXA20087.1", set(), + "MAPTGIILGDTNGLRISLVCNPFVSPKIMPVGAIASSRSIYATIPLAMDGPVYWILWTDNPSNR", ], "-l7xLyFZbiZENPLq_GML8JyTRF1Srawr": [ - "-l7xLyFZbiZENPLq_GML8JyTRF1Srawr", - "-l7xLyFZbiZENPLq_GML8JyTRF1Srawr", + "AXA20088.1", + "AXA20088.1", set(), + "MSEYIHKSHNVTVLTYHVVLWRGIEKQSLMTGLAKY", ], "T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0": [ - "T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0", - "T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0", + "AXA20089.1", + "AXA20089.1", set(), + "MSDLPPRFAKSAAVERRFSQTKQGCNAGFARAMPGLVSFFTPQFADMNKLIDYWRATVACGRMGEGMSERGRHTEFSVVLLRNRDSAPVFGVVLRA", ], "AStsOnOU5ZWxURs9PrTiWjddkuQXfanl": [ - "AStsOnOU5ZWxURs9PrTiWjddkuQXfanl", - "AStsOnOU5ZWxURs9PrTiWjddkuQXfanl", + "AXA20090.1", + "AXA20090.1", set(), + "MQSVNPIGETKKALRLLQELVGRELSLPADEVPVDTDYLNMGFSSRSLIGLIQQLSSLLRTKLNPSVLFDYRTLTEFADYLAEHHAQWLTAIEREQPAERNQEAGDETSVAVPLSEAQRGLWFLQKNHPWMAAYNVPLCLRLSPRVQRERMRQACAWLPRRWPVLGASVQRADGRLVMQTQPARKLTWQEHHAEDWSEAQRLAWLGDRLAEPFDVDNGPLLRAYWLGGEPDGASRLLLVIHHLVIDAVSVGVLLAGLRKTYADLEGWRDLSGAVDDSAYGAFVAAEAERLAGAEGFARLAYWREQLADVPGSLGLPLDRARGVTPSFKGCTLRRELQAALGESLRAYTERHRVYPSTLLLAVFQGLLSRHAGRDDVVVGMAMDERDAASAGLVGMFVNMLPMRARGLGRRGFVEDMQALQRQLVDAMAQAYPFPALVRELGLSGSDASPLFQAAFLYQDTLDIDVLNGVDDWVWEEALYQEGEYELVLEVRRRLKGYALYFKYDPTLWDKSTIERWLAHYLRLLEGVLAAPQKRLGEHELRGEHERARLAALQGEVRDWALTPVSILFERQAIANAQAWAVSDDQQRWCYAELAAHSAAIAQRLHVQGIGTGSIVGVCQGRSPWLLASLLGIWQAGAAYIPLDPAYPVERLRYMLEDSGASAVLSDTSHLVLVQALAGMLPVWAADAAAPSTSAPSFPPPQPEDLAYVLYTSGSTGRPKGVRISHGALSNFLQSMAEKPGLTAGDRLLAITTISFDIAGLELYLPLIGGGECVLCPEEIVRDARRLKAEVEHVRPTLMQATPATWSMLFHAGWRNAEELKVLCGGEALPARLKQRFDEIGTTVWNLYGPTETTIWSTLAKLDAEDTSIHIGQPIANTQAYVLDQEGREQPIGIEGELYLGGAGLAQGYHGQPERTAQAFIDHPLGRLYKTGDLARWRADGQLEYRGRSDQQVKVNGHRVEPGEIEAVLEQSGLVKQAAIVLREGAHGSQLAAWCVPTKVTQGDTWLDPVQIQVLQAQLRDRVPAYMQPSIWLGTAALPKTLNGKIDRQVLSARALPEQREEKAPPSAVTRSAARRALESRLQALWAQVLERSTVSRDERFMEAGGNSVAAVLLAERIQAEFGRAFGVAQVFAYTSVAAQAAYLDASDLTFEALANEPIATQVIVAEPERDASGVAEDALAIIGIACHFPGAEDHRAFWNNLRAGHDSGKLFSPEALRAAGVPERLIADPHYVPIRYGIEGKAEFDADFFNLSPRAASLMDPQYRLLLQQAWAAIEDAGYTPEVIPDTGVFMSASFSAYQARLQDPSAVEAGDRYVAWLMSQGGSLPTLISYHLGLTGPSLFIQTNCSSFLSALAAARSSLLARESRLALVGAATLFTEDSKGYLHEPGLNFSSDGHCKTFDASADGMVSGEGCGVILLKQAREAVADGDHIYALLRGIAVNNDGADKAGFYAPSVRGQSEVIEQALRQARIDPGQIGYVEAHGTGTRLGDPIEVTALSETYRRHTQATQYCAIGSVKSNLGHLDTAAGLAGCIKLALSLQHGEIPPTLHYQQPNPAIDFAASPFYVAEHLQAWPPGPRLAGLSAFGIGGTNTHAILEAYLEESVAARVDGAQLIVLSARTQERLHAVERQLLDHLDSSASLPSLRDLAYTLQVGRRGMTHRLAFVVEDVSALRRQLSDHLAGRKVGHEGDCDRGHAVLHAFENDGDSARLFAQWAAAGKLDKLAALWVQGLSLDWALLHGGQRPRRVSLPTYPFERTRHWLEVPLAASAVSPESPELATERSWDGMSYLPRWVATPATAPDVEEVPAPASLLIVAPEASARADELFEWCTRRWPSAALRLIRLGRENLWLGEAERVCDSFDDGQALTEALREGTWPDSVVFLAEEAGSTLNWPGHPESAEAQLLRVQQALSQSQPAQRIEFYLVTIEANASDALTGGGLSGLAYALAQGDHRLRLRHLSLDADVWSASSWWQLWAEPASDRGERVRFSGGERWRQRFDRLNWGSLRDGGLKQGGVYLIAGGSGTLGIAISRYLIERYRARVIWLGRSRADAPELVARRQVLEAGALLPGYVQADLTDATAVHEAVARARQIHGTIDGAIFSAMYRGADAPAERWSPTALAGAVAVKALGARHFYQALLGESLDFLCYYSSVQSFAFLSARNSAAYAVGVAAADRYVQLIRAQSPFRVGLIHWGYWQDTLAGTALEQDLARHFTGIHADEACVFFERFVAALGQGMLDQAICLKASDTVRDVMPSTVDDVVVLTTAGTPSFLDGFDAAALRSTVMPPDWTELDRRLRGLLCAQLRYLGLFDQAGVSWESEQLRRKLGVIDDYRRWWDECCTEMLEKEGWIRTRKGRVELLQALSLEQADALWQDWEHDKADYLKDPQLRAALLLVEDCLRHLPAVLLGRTSATSVIFADGSMSKVAGLYQGTAWTDSFNHQVADTVEVYVRHRLAADPLARLRVLEVGAGTGGTTAKVLPRLAAIGAPMLEYCYTDLSEAFLAQARERHLANYPYLRTCRCDIERPLAEQGIVPGSYDIVIATNCLHATHDIRATLRHVQAALRCHGVLIANEGVSKSLLGTLTFGLLEGWWLYDDPQLRIPGSPLLDSAHWRALLDEAGMRPVCLDGPGRELQQVWVAQSRGLIRPGGFASASATRLPAPSSVPAKAGVAKPVVSASPVSSSEVVPYAAIAAEIRACLAETLKREAAGLADETAFSDYGIDSILSVALVKRLNQRLGVQLGQAVIYDYSTIAALSRHVLERRDGQAAMVPRQFGVVAASESTVVEVPVLAPVTVSEAVPPAAEAARPTPIAVIGMALQVPDAEDADTFWANLLSGHDSIRELPEAYGRPAHGASPRGGALQGRDYFDAAFFGLSNEEAAGMSPAQRLVLEEGWKALEDAGYDPRSLRGSRTAVFVGAEPSGFFQGSFSGSSEAIIASRLSYLLDLKGPALVVNTGCSSGAMAIHLACESLRRGEIQLALSGGVASALSVEGLRHLADAGMLSPQGHCLSFEATGDGMVLSEAVGMLVLKRLDQAIADGDAIHGVIAASGSNQDGTSNGITAPNGRAQEDLLGEIWARHGIAPEHISHFEVHGTGTSLGDAVEGNAISRAFGRVTARRGFCVIGTSKTHIGHTGAASGVVGLIKLLLSLRHRQLPGLLHFERANPLIDWNASALRLPEATTAWEGEPDLPRYAGLNAFGHSGTNVHMVVREYGPGEGDQRGPQLLECAQEVLLPLSAASALQLARSARRLLDFLDEEAGHGQSRPSLAELAYTCQTGRSALVERAVLKAGDREQLRALLLALAEQRPMAGLWRGSVDPETTLVAAANRDGLDELAESWIRGAEVDWQKLYGPVRPRRCHLPAYPFDRRHFPWHLAASVPSPVSRHRVPVATEPIATLTTVTAMPAWRFVLAATAEAGAEPRAQATQWLCRWLAMRLQRPVQALNPQLSYRELGLTSLGLVALSEELSRLLGVVVLPSLLFEYPSIASLAAHLAEQHAALLSRVQSIPLTGADSPSVGTPAEASVPDRVLSVLQAYRNGALNHAQARTLLAETTP", ], "IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ": [ - "IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ", - "IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ", + "AXA20091.1", + "AXA20091.1", set(), + "MNAFMTLDQMIALYKAGKVSVDQVKEAFARLPDATADTGIVTDVELSETQRGLWSLQKAYPWLAAYNVPLCLRLPADLDRERFLRACAGLLERWPVLGASVEQQDGPLRLRMASVSGLSLEQDDGLAWSESERLDWLRERIEQPFDLAKGPLLRAHWLEGSGGGESLFLLVVHHLVVDGASVGLLLAGLHEAYRALGNGEAMPSASGSDGYLGFVQAERVRLQGDQAARRLAYWREQMADAPSSLGLPLDRPRSATPSFKGRTLRRELLSALGDSLNAYTEYHGVYPSTLLLAVFQGLLSRHAGCDDVVVGMAIDERDAASAGLVGMFVNMLPMRARGLGQRGFGEDVQALQRQLVDAMAQAYPFPALVRELGLSGGEASPLFQAAFLYQDMHDMLDTEALAEVSKWTWEEALYQEGEYELVLEIRRRAQGYVLYFKYDPMLWDESTIERWLAHYLHLLEGVLADPRKRLGEHELRGEHERAWLAAWQGEVRDWALTPVPALFEHQVAANAQAWAVSDDQQRWCYAELAAYGTAIAQRLHAQGIGSGSIVGVCQGRSPWLLASLLGIWRAGAAYVPLDPAYPVERLRYMLEDSGASAVLSDTLHLAQVQALAGILPVWAADAIGPLTSVPLPTLQPEDLAYVLYTSGSTGRPKGVRISHGALSNFLQSMAEQPGLMAGDRLLAITTISFDIAGLELYLPLIGGGECVLCPAEIARDGRRLKAEVERVRPTLLQATPATWSMLFHAGWHNAERLKVLCGGEALPVRLKQRFDEIGTTVWNLYGPTETTIWSTLAKLDAEDTSIHIGQPIANTQAYVLDQEGREQPIGIEGELYLGGAGLAQGYHGQPERTAQAFIDHSLGRLYKTGDLARWRADGQLEHRGRSDQQVKVNGHRVEPGEIEAVLEQSGLVKQAAIVLREGAHGSQLAAWCVPTKGTHEDTWLDPVQIQVLQAQLRDRVPAYMQPSIWLGTAVLPQTPNGKVDRRVLCARALPAQREANTQPPSATPRNAVRQELESRLQALWIEVLERPMVDRDERFMEAGGNSVAAVLLAERIQAEFGRVFGVAQVFAHTSVAAQAAYLDEVQPVSTPGETTDSSAPKPDTPETVLAEDTLAIIGIACHFPGAEDHRAFWENLRAGHDSGKLFSPEALREAGVPEPLIADPQYVPVHYGIEGKAEFDAEFFNLPPRTVTRMDPPFRLLLQHAWAAIEDAGYTPEAIPDTAVYMATGGPLRLAELTEPGSPGDSDDYVNWLLAQPGTVATMVSYQLGLRGPSYAVHANCSSSLVGMHAAAQSLRSGEVRTALVGAASLFGDDSLGYRYEPGLNFSSDGHCKPFDLRADGLVAGEGVAVVLLKRAREAVADGDHIYALLRSVAVNNDGADKAGFYAPSVRGQSEVIEQALRQACIDPDQIGYVEAHGTGTRLGDPIEVTALSETYRRHTQATQYCAIGSVKSNLGHLDTAAGLAGCIKLALSLEHGEIPPTLHYQQPNPAIDFAASPFYVAEHLQAWPPGPRLAGLSAFGIGGTNTHAILEAWPPATARLLQSEPAPGVAQVFPLSAMQADRLPVYARRLATFLRGPYAASLRLADIAYTLQTGRRSMRSRCAFVAETLDQLLAALDDCADTVDTQDAAAASPPAPADGWQLQRDAHGLAAAWQQGEPIDWNALQAVAPWSARRVSLPTYPFSPKRWPRTAARVAASTQAAVLHPLLHRNTSDFAAYRYSARFGGDEFFLADHVMGGHKVLPGVAHLEMACAAFAQAIAAPQAALELRDVVWIRPVGVDAPLDVHTVLRPQADGSADFEICSQPDAAGERIVHSQGRVMPVDTVESEAELLALDTLRQQLAQAHRDAASYYRDFEQGGASYGPAFRALEQLWLGDGQALARLVLPAARHEGAERYGLHPSLLDAAVQAAFIGINALRAAAGVASSEQGGSLPFELKRVQLLVPCEPVMWAWVRYTQGIGAGERVQRMDVDLCDEQGRVRVRLHGLARLAALVPKLAAPSLQLPQWEDAPLVYGADAPPAYAQRLLVLCGLPAHGLEAESGADLVERLEAGTGLDSVQDYALRLLARVQVFMRDKPRGRCLLQVVIPAKGEMVSLAALAGLVRCLRLEHPGVSAQLIAVDPAIAAPALAALLREEGREAAEAQIRHLHGRRQRRALVPLLPSAATTVSQPWRKGGVYLITGGAGGLGWLFAQEIAARCEGAGIILLGRSALAPGQATRIERLGREGTRVVYRQADVTDAAAVEAAVQAARTLGPLRGVLHAAGVLRDGLLLNKSEEQARAVLAPKLTGSLVLDRATRELDLDFFVLFSSISALGSVGQSDYAVANAFLDEFAHWRATAVARGDRRGRSLAIAWPLWAEGGMRPAVAALAGMPEAWRSVAMPSAQGLAYFYRALASVYAQVVVPSLAQAAPAEATSLVAGAVVASTVILPTAPAAPSSTPSTHKASMASAAVSMIDVEAFAARLRGVVLGLVTQLLGVPTEQVDLDEEFNAFGFDSISLTNLANRLNQELRLNLTPAIFFEHASVNRFVEHLLAEHADRLDFLVQTAAPARAEEAKSMPTVTSAVAATASDDIAIIGMSARLPMAVDLDEFWDNLLAGRDCIGEIPADRWDWREIYGDPLREPNKTDAKWGGFIDGVADFDPLFFGISPREAEAMDPHQRLLMSYVWQALEDAGYTRQALSGSDTGVFVGLGGSDYGQRVVAADGGVESHTLMGLLPSMAPARMSHFMNWHGPSEFIDTACSSSLVAVHRAAQAIAAGDCSLAVVGGVMAILSPTSHIGFSKAGMLSKDGRCKTFSSEADGYVRGEGVGMLVLKRLSAARADGDCIHAVIRASVVNHGGRANTLTSPNPSAQAELIESACRKAGVAVEQIGYIEAHGTGTRLGDPIEINGLKSAFKTLGHPMRGAWCGLGSVKTNIGHLELAAGIAGLLKLVLQMRHRTLVASLHCEQVNPYIELEGSPFYLVRENRPWPTGRDAFGRPAPRLAGVSSFGFGGVNAHVLVQEHLADAPPEEDEASVLIVLSARSENALRGRARDLLVYLERRGLDRSTASVYDNRRTELEQHIRLALAGLLDVDVHEISIDESFEAYGLDALARNRLAAALDESSPALLAATLRAGSVAHLADEWLASQGYAPVSVASVESSIRLGDLAYTLQVGREPMEHRLGFIAASFAQLNERLRAFLDGQDGMHELHRGQAKFGKNGLSLLGADDDMEGTIASWVVKKKYALMLKLWVNGMELDWRLLHVGRQLQRISLPAYPFDYRRCWVGETPAAARETDKAAGASEPPALDVALDETSGEVNQAESAQFLLPVWDAVHPELLADTPPQGRILLYGGDQGMVDAWRGGAVGLATLVVTPGASIEALSAQLAAAGSICEVLFVAPGEADVDPGNAQLIDAQQQGVLGLFRLVKALLATGHGLRDLAWTVVTTQAVDLGDGLPVRPAHAAIHGLAGALAREYPHWRLHLVDVAVDEADDWRNWRRLPPEENGATWCRRGGEWFRQSLLPYQPDTTNLPLPYRQGGIYLVIGGAGGLGEAWSRHMIERYQAQLIWLGRSAPNARIGERIDALAALGPVPRYIQADAGDEAALRRAAAEIAREFPRLDGIVHSALVLADQGLASMSEAEFSGALAAKVDTAVLSTRVFTNQPLDFVLFFSSMMSFGRAAGQSNYAAGCMFADAYARRLAQLARYPVKIVNWGYWGSVGIVSRADYQARMRRAGIGSIEPAEGMRALEVLLRAPVPQLGFVKTRIGVGGDSLRQYADTIPAVTARVMLDTPVPALRREDLLFADQRLESLVRGLLARQLETLGEHPVLPLYQRWLAESRRVLAEVPPPATTVEALWRAWDKHALTRRHLDGSTPELDLLRNCLQALPAILLGQRTATEVLFPQSSLSLVEAIYRDNPVADLFNGRLQQWVLAYLRERLARDPRAQLRILEIGAGTGGTSAGLLAALRPVHEHIAEYSFTDVSRAFLIQAQERFSVGFPSLVTRLFDVEQPLAAQDLPADYFDLVVAANVLHATSEIRTALRNAKAALRKGGVLLLNEIAGQSLFTHLSFGLLEGWWRYRDPAVRLPASPALSPKNWARVLREEGFGEIDFLLKERLDLGQQLVVAQSNGLVRQAEARGGGAEKGRHGSVVVSNTAFVATQARPAAPAVPPQAVAHATVEEPVAVDAVVHRVLLEELARTLKLDSGEIDTQMPFSDYGIDSILGVGLVKQFNDRLGLNLNTTILFEHSSLERLVVHILQRHRREAITALGLVVGPVLGAAEPAPPPAPAVAASCESGRASGAVDELIPAPEVELCRAETGPLEIAVIGMSAQVPGADDAETFWHNLMHGVDGVTQLPAAYLDRARQGDDKQAYYLWGGTLAARAAFDPQFFSISPREAKSMNPHQRLVLQESWKALEDAGYDPRSLAQRRVGSFVGAEPSGYFHETFTGSSDAIIASRLAYFLDLKGPALVINTGCSSSATAIYLACESLRHGESELALAGGVYAVLNETGLVSLAQLDMLSNSGRCHSFDAAADGTVFSEAVGMVVLKRMDDALRDGDPIYASIVASGINQDGASNGITAPSGEAQEQLLLETYRRFSIDPRHIGYVEAHGTGTRLGDPVEANALLRAFRAHTTDRDYCAVGSAKAHIGHTAAASGVIGLVKLLLSLKHGRTPGLLHFRELNPLIEWQDSPFFIPTANRDWPVESGRPRMAALNSFGHSGTNVHLVLKEYVDPVEAADDAQQPGWHLLPLSATNETRLRAYAGKLARYLREAGEGVNLGDVAHTLQHGRIALKRRWCLLVANAAGAIAQLQAYADGADVAELAGVHGLADGTLVTDRIEAAATPELLAALARRWASGAMVDWPLAGPTARRIHLPSYPFARDHYWMDGGALAATTVRTVTAAPRVPIMHPLLHARVDTATGPRFSAQLSGEEAFFRDHRILGRPTFPGAAYLEMACAAAARVLGTEALRLSAVVWSRPLQASAGGVALGIELQARDGDGCWRFEIAAAGQVHCQGLAAKAEARENAFERQDLGALLARMDRQGVGHDDCYAAFDAVGIHYGPSHRAIAHLHLGHDEVLARLVLDPAAALDSEATAAAYRLHPSLLDAALQATLGLSVGSARGGAYVPFALERLVLCASTGFQLWAWVRYTAGSRADEAVRKYDIDLLDDQGGLIARLLGFAFRPVEAVAPASADPVLLWCRDWKPDPVPLVGDSAPVRLSVLVSDRAAWRFDEPTDGSLLAHLQIDSIQEAPDQWYGRVLRGLFELIKAQAQKTVQACLLQVLVPAWGRAALLSGLSGLLRTATLEFPRLSTQLLAFDAPPVGLSALLEANARQPWHAQLRYLDGTRLAPAWAEVAGSTGAQMSALPWREHGVYLLTGGLGGLGRLLAEEILGQTPDATVVLCGRAAPDAEAAQWLRTTGGNGRLRYRRADVADAAQVREMVADIVVCCGTLHGVFHSAGVLRDSYLPGKTSAQIDAVLAPKVAGTVNLDEATQSLHLDLFVLFAAGAGALGNPGQADYAAANAFLDAYAGWRNEQAAQGRRRGQAVAFDWPLWAEGGMRISDSQREAIEAANGMRALERDEGIRALYRGIASGEAQLLVGVGDHARLRAWIQHLHAEPTAPAARSAQTGTVVPAGTVRALRERTRQFLIDSAASILELQPQDLDPRDELSDYGFDSITLTELTHRLNRQFELELVPTVLFEHPTIAKLTAHLLESFPEALARAFPEDVAGPPPRATQAPLATPERDMGEMRAADALRNTNTNTHAGAVAIIGMSGRFPGAPDLASFWQVLAEGRDCIGEIPADRWDWRQYYGDATQDGRHTRVKEAGFIEGVAEFDPLFFGITPREAGLMDPQQRLLLTHAWAAIEDAGYAPSSLAGGRTAIFVGTAPSGYSSLIEQAAMGLDGHSSTGSVASLGPNRLSYLLDLHGPSEPVETACSSALVAIHRAVRAIRHGDCETALAGGVNTIVLPEVHISFDKAGMLSPDGRCKTFSRHADGYGRGEGVGILLLKDLAAAERDGDAIHAVIRSTVENHGGRASSLTAPNSKAQTELIKAAVRDAGIEADSIGYIETHGTGTALGDPIEINGLKAAFEELQAEAGGAAPIPDSCALGSVKTNIGHLELAAGVAGVIKVVLQLRHRTLARSLHAEEPNPYIRLAGTPFYLLGETRPWPMPRDAGGRPLPRRAGVSSFGFGGVNAHVLLEEYVGSRAELPASEPRGPYLFVLSARQPARLVEQAAQLVAFLRDETRPALAELIYTLQVGRDAMEERAAFLVADYAELTQRLAEFVAGRAGAWLHRGRARHDSVAATLFDDQNEQREAVRVWLAQGRHDKLLGLWTQGLEVDWRVLHAGPPPRRAHLPTYAFARERYWPPRPDRPAARPETRVNPLMMQEEAANEDELDRHSALLDQLIHRQISADEAVRLARRQEQL", ], "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5": [ - "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", + "AXA20092.1", + "AXA20092.1", set(), + "MTDHVAEIYHQVASGQLSKDEALARLCQVKDEQAAQGVAKQVHKPAQPVAVSREAVTQALAALYAGQSKIAPEAIDPLEPLESYGIDSIVIAGMNRELGERFASLSKTLFFEHRTLDSLAGFLWREREPACRAWLLASGTTVRTVAASPASRIEVAASAAAASAISDEPIPGDRAGTGAPETFGPANGMAVDASAWPPIAIVGLAGRYPQAADLDAFWRNLREGRDCIVEVPVERWPLDGFYEPDPDRAIASGRSYGKWGGFLEGFADFDALFFSISPLEAMGMDPQERLFLQSAWHALEDAGYTRESLAERCGGRVGVFAGVTRNGFGLLGLEAWHAGGMVFSQPAFSSIPNRVSYALDLRGPSLPVDTMCSSSLTAIHEACAALHRGDCVMALAGGVNLCVHPASYVGLATGRMLSRDGRCRSFGAGGDGYVPGEGVGVVMLKPLAEAQASGDRIHGVIRATSVNHGGRTNGFTVPNPTAQAELIAESLRKSGIHPRAIGYVEAHGTGTALGDPIEVSGLVQAFAPYTRERGFCALGSAKSNLGHLEAAAGIAGLTKVLLQMRHGELAPSLHAAQLNPNIDFEGTPFVVQRELAPWAEPELDLDGQLRRYPRIASVSSFGAGGANAHLLVEQYLAPAVPSVSSAGPQAIVLSARTPERLCAAVQALLDHVESGGGTAAGLAANLSAWLVEELAAIVGVEATAIDPHETLANLGVEILHRTRWYERVQERLNLPWSLKNFLDQDSVQQLGNTLLREQGATVVAQFHAPVAAPVLADLAYTLQIGREAMPERLAFVAADLAELATGLRGFLDGASRLPLWHGKAARGRALPARVDQAQWQAWIDARDWSQLLPAWVAGHELPWRDMPGAPGARRIGLPLYPFAAERYWVDPQSLRPRPTASLESRLHPLVRKRVDSVEGPAFLSRFSGAESFFSDHRVGGRSILPGVAYLEMARAAASLAADGATIRSLRNVVWARPIEAGADGVAVTLRLQPHQQGSWRYEVLGADDGVHGQGLAELALPEAPPVDLDLAALRARMQGGALANEALYQAYAAMGIAYGAAHRGFVQALVGESELLAELCLPSAVQADAQAYVLHPSLMDAAFQATLGLYLLSKRADAAKAMLPFALETLELHWAPPARVWAWIRSRGERSGIEKFDIELCDEQGRICVRMLGFSSRVLEAPAVSPEVPPAVLEAPSLLLSRYAWQDAPARRAAPDPALTRRLLLVSIAPQPVVWRQLGQGELLQAAAVQPEQACASLYVQIFERVQAWLEEKGRDTCLWQLAISGQGAELLLAGLSGLLRSASQESRRLLGQLMILEGDEDLASLRARLDENAASPFDSLVRYRAGRRETWHLLELPSNDGEAEAAPLPWRQNGVYLLSGGAGELGLLFVEEIARRATGATLVLTGRSALPDARRARLDALCEQGAQYRYEPVDVTDRQAVTQLVEYVVAEYGRLDGVLHIAGVLRDSYILKKDRAAFEQVLAPKLLGTANLDHATRTLDLDFFLMFSSSAAIFGNLGQTDYAAANGFMDAYAAYRQARGGRGRSLSVNWPLWRDGGMGMEAATEEMMLANTGMVAMRTPSGFSALARALHSDLPQVAVMEGLVERMRQKLLVPSAPSMQAVPVASASAAIAPSAQTDHHAEIVARVARGLRQMVAELLKLELDQIDIEDDLSDYGFDSITFTSFSNRINKQFGLELIPTIFFEYPDIAGLAGHLAEAHGAALGASLGLLAASAQGDRAQTRSAMSAEAVATANVSAEMPVPLAPQLSDQSAAASNTPVARRGVAVIGISGSFPGADGVDALWQLLERGGDAICEVPASRWDWRRCLPPGESEAVQARVRWGGFIDGVDRFDPLFFGISPREAELMDPQQRLLLSYAWLAVEDAGYAPQSLGGTDTGLFVGTAVGSYGSLVVQAGRSRDAYSSTSSVASIGPSRVSYFLDWHGPSEPIETACSSSLVAVHRAVQAIESGRCEAVLAGGVNTISTPEAHIAFSKAGMLSVDGRCQTFSAKANGYVRGEGVGMLFLKDLVAAERDGDTIHAVIVGSAENHGGRATSLTAPNPKAQAALLKAAYAKAGFDPRLLGYIEVHGTGTELGDPIEVNALKSAFKDLYQRAGVEPPGQPHCGLGSIKTNIGHLELAAGVAGIIKVLLQLRHRTLVRSLHGEQVNPYVQLEGSPFYLVQENLPWDAPRDAQGREQPRRAGVSSFGFGGVNAHVVLEEYVAPPARATAPAACPVLVPLSARNETRLREAAARLADFAAAHADDAALDLHDLAYTLQVGRDAMEARLGLMVSDKAELARCLRAWLDDAGAGEVFQAAPGKAQKEALALFAGDEELAGVVEGWWRNGKQAKLLDLWVKGLDLDWARLRAGAGRRRISLPGYPFANERYWLKPVSAETAALGLVASTPIETDEAAVLFFEENWHPYPIREALIASSTRTLLCCLSDATHRKALREAVFRYDPKWHLVFLDRQAPEPFDRQGWTDALCLLEQGGPVIDAVLYLRPLEEAGLRLAEAAPLGLVQALGSMKTRPARLVLGGEYADESERSQLEAWIGLERSIGLALPGCRAVTVLREAGDTIDWVAWTQLLRTALGEAAPRNLLADRDSLRHLQVQPLEPRAAAVADLGTTVLITGGTGGLGLILARHLAVGRRCNLVLVGRSPFDVVRQAAVQALQAAGSEVLYLSADVADAVAMREVVAQARARFGSIDSVIHAAGIQHAVPLADKQSEDMRRVLDPKVRGALVLDQVLAGEPLRLVCYFSSSSSVLGDFGSADYALANRFLSAHALARERRRARGERAGRSLSIEWPLWREGGMGVGDDAGTALYLKSSGQRLLEQTEGLAAFERLLASGATRALVLVGERERLHRMLGLAQVPSAPATQAMAVLMPSQTQTFSTASLEEQVSAELSVLIGDQVKLAPELLDAESNLADFGLDSFGLAELARALSARYDIEVAPSIFFAYSSIARLVGYLLDKHRAEVQAHRHRTATRVDVAAIAPQPASLAPPAAISMSTPAPPQLPVANAIEPAPTVPAFDGEREPIAIIGISGRFPKARDVDQMWRILAEGIDAVDEIPVERFDWRDYYCGLEAQPGRTNSKWAGCLDGVDEFDPLFFEISPREALAMDPRQRLLLQESWNALEDAGYGPHQLRAGPVGIFVGVEEGDYQRVVPDPGVTSNHNGILASRLAYFLDLNGPVLAINTACSSGLVALHQACASLLSGESDTAIAAGANLILTPEPYIGMSQAGMLSPDGRCRAFDRSANGMVPGEAVAVVVLKRWSRAVADGDPIRGLIRASGINHDGRSNGITAPNALAQASLVRQVQRAAGVLPEQIDYVVTHGTGTRLGDPVEIQALVEAFGQPVDGRAYCALTSSKGNFGHTFAASGLLSLIGLVKALEYDTIPPSLYCDQDSDYIAWRDSAFRVNKQARSWPRASGRARLGAVSAFGMSGTNAHVLVQEAPVAVARVAVAQTDVVLALSGKTEAALRERLFGLRDWLASAAAERCELASVSRTLLDGRHHFAHRAAVVVANRASAIAALEHLATLDTADGPDYYLGKAPRGFKGEVALRERAANWPALPSVDAAAYRERLGELARLHCQGYTVAWSALDGLQPAARVHLPGYPFARESYWPKTRISPPVATSVPASPAFPSSHSTPVVISLRPMLRTELSDQARGHARACYVARFSGEEFFLRDHRVRGQAVLPGVAYLELARAALEASSGRPVPSGLQLRHVTWVQPLMVDEPGVEAYIDLHRQDNGEWRYELASGAHDESERYLHGQGFLALVAQAEPTALDLSVLHGNCRVTEFDSAACYAAYQAVGIEYGPSFRAVQRIWVGEGQALVQLRLPVEALSDSGAYTLHPSLLDGALQASIGLAMAQATQGGEPMLSLPFALDSLVLHWPCPNETWVWIRPTPGAQSSRVRKLDLDLCDAQGRVCVALRGFSARLVAQGGTAQPALIPARVEAERVSMAAPINGLASASLPAKAKGVVPILAPAAKATGASTALPATTEGNASSAIPVVKTGAAPGAWLPVGLTMLAPLWTVRRVDDGSEPPAPVHMLLIGGDATQRAIWRQAYPQLRAIDVAPSTTIDELRGQLAATGVIDELVWIAPAQHSQDPTDEALLTAQASGTLALFRLVKALLAEDYASRRLAVTVLTRATQQVHPQDLVAPAHATVHGLAGSLAKEYPHWSVRLIDLDGQNADPPPERCRALPVGSESWAWRRGEWHKPELIALDEPTRGKVPAPYREGGIYVVIGGAGGLGEVWTRHLIEHYRAQVVWIGRRAEDAGLRARLAALAAHGSAPVYLQADAGNRLALSRARETILQRHGRIDGVVHSALVLQDRSLARMDETTLQSALRPKLDVSLRIAQVFADAALDFVLFFSSMMSFSRAAGQGNYAAGCTFKDALALALARRWPGAVKVMNWGYWGSVGVVADARYQERMSRAGIGSIEAEEAMVVLERLLGGPDAQLGLIKLSRAQAVEGVRDDLRGARYGAALPALLPQLAARPLPPEHASRLAAAQAALPPQAMQALSLRLLGQALLGFSLDGRHLLPGGAAGLALAGHYRRWYDTSLRLLDAGGWLQSLPNGDYQILATAGQQDAWPEWEAARAAWLANPQQQAWAQLLEVCLRALPELLTGQRKATDVMFPNSSLRLVEGIYRGNPIADLHNHILFDALEAYVLERLAREPGTRLRLLEIGAGTGGTSAGLLQRLDRYAANIDEYCYTDLSKAFLLHAEQHYAPGRPFLRTKRFNVEEPPQAQGIAADSYDIVVAANVLHATVNIRRTLRHAKVPLRAGGLLALNELGELSLLTHLSFGLLDGWWLYEDPALRLEGSPGLSSEGWERVLAEEGYAPLWRPAEECNRYGQQVLLAQSDGRVKRDVAVPPEMAAAPVEEVSAPASAFTSAQVADSTPAATFAPVTAPIPVPDSRDLEQAVADHVRTLLRECIGKGLDLDPRRIEADRSFSEYGVDSILAVQLVNEINQRLGIVLQTTVLFDYSHLDVLAEYLEQTHQAALRASLPEVSEVPALQAASTLAPKIQAQPSGVAFPLISPTQPIGATLPFVPPSVTGSHRRALISGPGQIQDLRLVAMEVPASLQPRQVRVAVFASSLNFSDLLCVMGLYPNMPAYPFTPGIEASGLVLEVGSAVSTLCPGDEVVCLAQGCHATEIVCHETQAWAKSPQLSFEQACALPVVALTMIDAFHKADLQPGECILIQTAAGGTGLIAMQLARHYGATILATAGSQEKLDYLRDQGAQHLINYREQDFEAEVARITGGRGVDVVINTLSGEAIDKGLRSLSPGGRYIEIAMMALKSAQAVDLSVLDSNQSFFSIDLARLIAERPEKLEQYRRELASLVEQGVLLPTMSRVFALDQLHDAYRYLQDRRNIGKVVLQVPQAVPLADQASAARAVDAVAGVHKAQPVPVNYADEPIAVIGMSGRFAHSPDLDSFWSHLAKGHDLVDPVLRWDLSPSGGRCRDGSFLDEIDRFDPLFFRMSGLEATYMDPQQRLFLEEAWHTLEDAGYAGEAVKGKLCGVYVGCTRGDYAQLCKSAPPQAFWGNSGALIPARIAYYLDLQGPAVAVDTACSSSLVAVHLACQGLRSGDTELALAGGVFVQSTPGFYLAANPAGMLSATGRCHAFDESADGFVPGEGVGAILLKRLSDAIADGDHIHGVIRGGAINQDGRSNGITAPSARSQERLERQVYDRYAVHPETLQMIEAHGTGTQLGDPIEYRALRQAFGHYTQRVGFCALGSVKTNIGHLANAAGIAGILKILLALRHRQLPPSLHFRKGNPAIDFEGSPFYVNTELRLWPAGERAPRRGAVSSFGFSGTNAHLVIEEAPAVPLVARATRRELELVVLSARTAGQLREQAARLLAHCQAQPQTSLGDLAYTLLCGREHRGYRLAAVVRDLAELCEVLSAWLEQGDDSRLQLGALDESGVREQLQQRRLGQVAIETVRAGQLEKLSSVAELFAQGYKLDYAGLFGSGYRRLALPTYPFAQGRYWVDDSLQHAVVPSTPATAPVVPTPVVPAPAVTQAEARQAPSRISTVPDQVMREASVAYLKQLVATTLRVSPTEISAHEPLERYGIDSILVVQLTDSLRQHFDSVGSTLLFEVQTIDALAERLLATEAPALARQLGMDAVAPLEATGSAIEAELPESPPPQPETNSQVQPAPAVVTVAETVGASAAAESSQPKAHGDVAVIGMSGRYPKAIDLNEFWWNLRAGRDCIDEVPAQRWDWRKHFDAQRGLHGRSYSRWGGFIDGVDQFDPRFFRIPPSEAEHIDPQERLFLQTAWLAIEDAGYTPSTLSAKRRVGVFVGAANSTYTLLPSHWSIANRVSFALDFHGPSLAVNSACSSSLTALHLALDSLAHGSSEVAVVGGVSLVLHPMHFNRLSSLGMLSSDAHSRPLGEHADGFVDGEGVGALLLKPLQRAIEDGDSIHGVIKGSMVNAAGKTRSFAVPDAAAQARLVREAQARAGVEADTIGYLEAHSNGGELGDITEMQGLAEAFAGTAERGHRCAIGSVKSNIGYCESAAGIAGLTKVLLQLRYGELVPTLHARCANPRIDFAGTPFALQQELSAWPRPANHPRRAGVSAFGVGGAYAHVIVEEYVAPVETQPEATGRALPIVLSAANAERLRVLAKRLAGFLGSEAGRRTALTDLAYTLQVGREPLAERLGFIAESVEQVREVLLAVAEDREVPLPLVRASLDRGRAGWAMFAEDEDFKRTVEQWIAREKHASLLDLWCRGYPLDWRHLYAAHRPRRIGLLPGYPFAEESYWAPESLRYAGVLEDADAFDATPFEPDQPAGEQS", ], "RyDIaUZc_b21_kQalx7J3yNO4l5f-439": [ - "RyDIaUZc_b21_kQalx7J3yNO4l5f-439", - "RyDIaUZc_b21_kQalx7J3yNO4l5f-439", + "AXA20093.1", + "AXA20093.1", set(), + "MKPNLNQDFDATPSSHAADRRPQAGAGGCVRAPGHVDTAIIGISARYPKAVDWRQFWENLRAGRDCIVEIPPERWDWRAYHDSARGTPGRSYTRWGGFLDGIDRFDPRFFRIAPSEAEHLDPQERLFLESAYLLIEDAGYTPASLSASRRVAVFVGAMNSSYSLLASQWTLANRVSHVFDFHGPSLVVNSACSSSLSAIHLAIESLATGTSEVAIAGGVNLIMHPAHYARLASVGMLSAGSHCRAFGAGADGFVDGEGVGSVLLKPLQRAIEDGDLIYGVIKGSALNAGGRTHGYTVPSPVAQGRLVAEAIERAGFAPHSIGCVEAHGTGTELGDPIEVRGLAEAFGAPVGAAPWCALGSVKSNIGHGEGVAGIAGLTKLLLQMRHGQLAPSLHADTLNPRIDFNGTPFSVQRKLAPWPRPAGHPRRAGVSSFGAGGANAHLLVEEYVAPIVPAPTDADSPALIVLSAANLDRLRAVAQRLLDFLNGEFSSGITLAELAYTLQVGREALAERLGFVADSLGQVRACLAAFLEDREAGRPLLRSSVGQGRAVGSGMLDDESFAQTLRGWIKRGKHELLLKLWGQGEPLDWSLLYRGARPRRVSLPGYPFAGERYWAPAAVRYAGVVCSRRRPAIDPDLLLCQPTWRAASLPAATGRALPHRELWLLGSQARLDDAALPALPIERFRSEQAEPVARCVDLYGQFHARLRARLRDKLSEPLLLQVAILGRDDELLLSGLSSMLRSLGQESRKLSGQLLVMEGGEDRATWQARLDENAVRAHEDWVRYRQGRRETWALQELPPATVEPALPWRARGVYLLSGGAGGLGLLFAEEITRRAEGATVILASRSAPVETRRARLAALAEQGLAIRHAVLDITDAAAVQALVDEIVASYGRLDGVLHLAGVLRDAYLVDQERDRVDQVLAPKLLGALHLDLATRMLPLDCFVLFSSAAALFGNAGQADYASANGFLDAFAVYRQTQGRSGRSLSVNWPLWAEGGMSMDKATEQLLTAGTGIRPMRAATGCRALAHALAGAFPQVLALEGEPVHMRAALLGQSSPVAVATAGDATSPKTSLSCQVRELVAALLKVEPEQIEAGQDIGDYGLDSIGFTHLANQLNLRFGSALRSTDFMELEMASVERIARLLEQKLPALSTGTTPVTRRDRGPSAVPVTPTPRDAGPAAHSDEPLRAQVREAVAALLKVEIEEVDLDLDISDYGVDSIGFTHLANRLNEQQGTRLRATDLLELEQVSVIRIARLLREDPGSRTLLDAVGADASLAVVEGR", ], "DTee9G4M8sEfnM4HaPfI37rT74pq7M_G": [ - "DTee9G4M8sEfnM4HaPfI37rT74pq7M_G", - "DTee9G4M8sEfnM4HaPfI37rT74pq7M_G", + "AXA20094.1", + "AXA20094.1", set(), + "MSVYMFPGQGSQAIGMGTDLFVSFPELTEAADRILGYSIRELCLEDPKHQLGQTRYTQPALYVVNALSYRRRLHEYGAPAYVLGHSLGEYNALEAAGVIGFEDGLRLVRKRGELMSEAPPGAMAAVIGPDEVAISALLARHGLDAIDIANLNSPSQTIISGLKEDIARAAPLFDAEQAHFVPLNTSGAFHSRYMTVARQAFVAYLGEFHFNRPRIPVISNVEAQPYVLERTAELLAAQITQPVQWTRSVHYLLALGQSEFLELGPGQVLTRLLVEIRKHTPTVAPATAGSLSSTPAYDRERELSELQQRIDDWNGRYPVGTRVQVERYPQQLVTRTPAMSLFGHRAAIYLEGYNGYFDLADVHPLHGASA", ], "mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn": [ - "mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn", - "mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn", + "AXA20095.1", + "AXA20095.1", set(), + "MNAIPKDYAVPSISIERLGSVAFKQDHGLRYPYVAGSMVKGIASTAMVINMGRAGFLGYFGTGALDAVSIERAILEIQAALGDRQPYGMNLLSNASTPQAEMDTVDLFLKHGVRRVEASAYMQITVPLVKYRASGLRRDAQGAVVARNMILAKLSRPEVAALFLSPPPDKLLAELVAGGAISTAEADLARLLPMADDICVEADSGGHTDMGVLSALLPSIVRLRDELVAHHGYARTVRVGGAGGIGTPEAAATAFILGADFILTGSINQCTVEAGTSEAAKDLLQQVNVQDMDYCPSGSLFELGAKTQVLKKGVFFPARANKLYELWKNHSSWEEIDAKTREQIQNKYFMRSFESVYEETRAYFLRAEPGEIEKAEKTPKHKLALVFRWYFVHTMRLAMSGSSQQKVDYQIHCGPAMGAFNQWVKGTELESWRNRRVAEIAHRLLEETVQLLNRSFLAMSS", ], "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh": [ - "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", - "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", + "AXA20096.1", + "AXA20096.1", set(), + "MLELTKRLADALVSISLFAACRESGLGALLKDRAGLPTRAAQLTWLAPQCGIDEARLEAALQALRDAGWIEALEDGRLIPRATFERVEPWSEAVAVGLDRDWGALLREQDGRRLRHWLEQGAAARESLAGCQAEAEALDAAAMAPLLFELARLDDAAWLQGRDVTSLAPANAALLRADFLRRGWSLDEAAGLIPNVQGLAMLRDAAALGPLLFLARRPEAGARTLASTVALYRANLRWRNALDQAIAVADSSAHEAWPTGACVMAAAAIGCLPERPSPSRPAVKPGPARFALHQWTARPYRVRHPSLDDLAILRELDLASWPVGMAVPENELRRRIEQFPQGQLLIEQDGEVIASLYAQRIDTLDQLRHTPYARFAWIHRPRGALAHLMGICVAPDWQGHGLADQLIDFCLVYLASFEGIDSVAAVTRCHEYGRFGDKVTLDDYIRQRDEEGRYREPMLQFHASHGAIIHEVVPGFRPEDQANHGTGVLVEYAHYRQAPELAGPVVAVDSVGPGSAALDVAEAVRASILDVLGELHAAAYGPQVPLMEMGFSSFHLQELQRSLGERVGLKLDATFFFQHGTPAGIVEHLRERLAPIGTEQADIRSTVDTETTPSSATTDGIDVPERIAVIGVACRFPGGVGNPEQFWTLLENGVDAIGEREPGVSPTAASTRRGGFISAVDRFDAGFFRISPREAELVDPQHRLLLEVVWEALEQASIAPGRLAGSDTGVFVGVMGHDYERLLRQQGGAPPIDPYFATGNANSIAAGRIAYYYDWHGPTLAVDTACSSSLVATHLACESLLAGECSLALAGGVNLLLHEDMFAAFEQAGMLSPEARCKTFDASADGYVRGEGCAMLVLKRFSEAQRDGDPVWGVIRGSAINQDGASAGLTAPNQGAQQAVIEAALRKGGVVPHALRYLEAHGTGTRLGDPIEVLAACAALSAGRPIGQPLLLGSVKTNIGHLEAAAGMAGLIKVMLSMRHGLIPRHLHLQQPNPHLDWAALPVEVVSEARPWPVGPKLAGVSSFGFSGTNAHVVLEEYPANPANTVPMAARSSALLLLSAKREEVLQTQVRQLHEAIGALDEADLPDVAYTLQVGRDAMEYRLALAVGSLAELRQALARFLAGEAGIRQLWQGRAGQQGYLLGSFVLDEAFTASIATSLAAWFARGELGKLAELWVQGLDVDWRRLYGANPPRRISLPTYPFMKERHWLPQAVAQATAEASGAPLLHPLVHRNTSNLAEQRYSSRLDQQAFYLRDHVVQGRHVLPGVAQLEWARAALALALGDTSASLRLEQVSWVQALTVEQALEVHIGIEADEGGWLTYEIYRGSDDEVELYSLGRARLDAERKVPNLDLATLQARCTRRIDGPACYARFTRMGLGYGPAFQVLTELHVGADLAIGRLQVPVGIELGDYRWSPSLLDGALQASYGLVDETAGLQLPFAVESVEQSHALPESALVVVQRAADDSGVLRKLDISIVEESGRVALRLTGFSTRAVQAAAPADSLLMVPRWQARSAVEAPPEPGYRTHRVVLCEFEALRGGFDAALPAASVVHWQAPGSLAERYARYAGQLLCELQALAADHPADPVLLQLVVPAQGEAAVLQGLVGLLCTAQQEYPWLHSQVIALPADAPVADCLAREAAAPVPRVRYQGMQASTREVMDFVEVPPTETAMPWREDGVYWITGGLGGLGLLFAAHIARQVQTPVLVLSGRREPDTAGQAQLDALRALGANVEYHALDIADAAAVAALARNIIARHGCLNGVIHGAGVLRDGLLHSKTVDELQEVLAPKVTGMMALDHATADLELDWLLLCSSMTTVIGNTGQGDYGAANAYLDAYAVHHEQLVAQGLRRGRAISVSWPLWAEGGMRIDAEGQAYLRRSTGMQSLPSEIGLAALEQLLATPRAHSLLLYGDRTRLLARVQALYRAPEPVVVRSVLALAPAAGPVDSQEALRKTARRYLTRLLSRSLKLPPQRIDVQTPLEQYGINSILVVSLTRDLEASFGRLPATLFFEYQSIAALTEYFLAHHADSLTMLGAAPAATQLMAVTSSAPAREASIDAPALRRRRHRRSLPGSMVAGPPVSTVGAPLDIAIVGLSGRYPQARSVADYWANLLKGIDCVTEIPAERWDWRQHFDARKGQDGKSYSKWGGFIDGMDAFDPMFFGIAPREAQLMDPQERLFLQCAYHAIEDAGYTRAGLAASATEGERRGQVGVFVGVMYEEYQLYASQAQARGQGLSLFGSASSIANRVSYHCNFHGPSLAVDTMCSSSLTAIHLACQSLRQGGCSVAIAGGVNVSVHPNKYLMLSDRQFMASNGRCTSFGEGGDGYVPAEGVGAVVLKRLEKAIADGDHIHGVIKGSALNHGGKTNGYTVPNPVAQGQVISQALAESGVPARAISYVEAHGTGTLLGDPIEIAGLSQAYGASTQDKAYCAIGSAKSNIGHAESAAGMAGLTKVLLQMQHGQLVKSLHSDTLNPHIDFSQTPFVVQRELGPWTRPVLEVDGGGEHEYPRIAGLSSFGAGGANAHLIVSEAPASSRHEAVPRQGPVLVVLSARNENILRCQAEQLLTHVQSHASDLANLAYTLQVGREAMEHRLAIVAASTEQLSARLNAYLQDDTLDEAVYRGEPRRSQEAMAVFGGDEELQEVVAKWIARGKLEKLAELWVQGLLIHWEQLYGQAMPRRISLPTYPFAQERYWIDAGAATMRVAGAQILHPLVHVNSSDLQGQRYTTVLDAGTGLLRGHRLHSRPTLPALAQLEWARAALAHALGGTAGLCLEEMRWLVPLHVDAPTTLHIALDWEDETHVGYEIYREDDEGREVYAEGRAELVDALPAPRLDLPALQAQCTQHLDGDEAYSRLATAGWSCADSFQALSSLQSGEGLAIAHWRQKVDASWQDYALAPNLLDVALQACRLAWPQQDWSWPTAARQLRMVGSLPVQGMVVVRQHPGQLDVDIDFADGEGYVLASLQGLAPQQPSTQASPVAQTLLLAPCWTPQAGPPAACERPSYAAHWVVLCELDAPASLEAELVPAHCLRWQAEGNPAERYGVYAGQALAWLQKIVAGGPSGQVLLQLVLPARGEAALMQGLGGMLRSARLEYPWLLIQVIAVDSAQELAVRLNTEAIAPVPALRDGAAGREVLDFIPLAPPEGVRAMPLAWRDEGVYWITGGLGGLGRLFATAIAAEVRCPVLVLSARRPPDTAQAAFIERLREQGARVEFRAMDTGDAAAVEAVARAIVAEHDGLNGIIHSAGVLRDGLLANKREADLRQVLNAKVGGLFALDMATRDIGLDWFLLCSSVSSVLGNAGQTDYAAANGFMDAYSAYRQELVEQGRRRGRCVSLSWPLWAEGGMHIDAVAQEQMRRATGMQALPRAAGLAALHQALAATVSHVLVLHGEPQRLRDYVSTAYRVPALAEPVAKMSPGRGYRRELKGLDLADCVNWDLVEHTSALLQMPRDAVDTQANLIDYGYDSVSLTAFAARLGEHYGIALTPSLFFSHPTLEQFSVYLLDSHGDALAAFYRAASEVERAEAGPAQLGAPTAATSASIRRRRHAQLAGAVANIVEPIAIIGISGRFPGARSVDELWTILRDGREVLQSAGTERFAAWPPPQRPACDRIGLLPGVAEFDPLFFEISPREAEAMDPRQRLLLQEAWRALEDAGYGVTQLQLHTVGMFVGVEQGDYQLIGKTEADVTSNHDGVLATRLAYALNLHGPAMAINTACSSGLVAAHQACLSLRVGECDTAIAAGVNLLLTPAIVRSMEQAGMLSPEGRCHAFDRRANGMVPGEAVVALVLKRLSRAEADGDPIHAVIVGSGINYDGKTNGITAPSGAAQTRLLQSVYARHHIDPADIDYIVTHGTGTPLGDPVEINALADAFTPHERAPQSCALTSSKTNLGHTQAASGLVSLVGLVQAIRHETIPASLHCEQLSDHIAWQKSPFYVNTAARPWPAPTTRARLGAVSAFGISGTNAHMVLRGHAAPADAGRHAAVRPLLLAVSAKTAEALRQRVQDLIERLQAREHDAAELASISHTLLVGRQHFAHRCAVVVQDREDAVYALQQALSRETRANLFRGVVSREFAAQKALLDYGQELIGRIAGVQQTTKQVQQQAGAQGEASREALSALADMYCQGYALAWHELFGQTPPNRVHLPTYPFARETYWVKPTKHAEAGEAVQLHPLVHRNTSDLDEQRYSSRLVVDAFFLRDHVVRGCSVLPGVAQLEWARAAVALALGGEPSIRLGQVDWLQPLVVEQAAECHIALAPLDDGRLAFEIYGDNGQVHSQGWAEAVSPGQVPRIDLAGLRARCTYRLTGEQCYARFVRMGLNYGPSFQSLAGLRRGEGIAIGELRWPADVDQEAAFVLPPSLLDGALQSCIGLYAESTGLILPFAVETVEQWGAVPATAYAVVQPGADDNEAVRKLDIRIVDEQGQVAVCLSGLSLRSVAPASTAVGTLMLAPRWRVQPALATNMVPANVAHCVIFCEVAPVDLRETLPTASSMHWTAEGSLDERYTRYAEKLCIELQTLEASRSDRLWLQLVVPAQGEHAVLQGLDGLLRTAGQEYPWLVAQTIAVKDTSNLAARLAAEASSPAPRLRYGEAGREILDYAEVFEPRQGVRPWRDRGVYWITGGLGGLGRLFAAHIARQAQVPVLVLSGRREPDAAGQAQLDALRALGAHVEYHALDITDVAVVAALAQNIVGRHGCLNGVIHGAGVLRDGLLRGKTVDALQQVLAPKVAGMMALDQATATIELDWLLLCASAAGVLGNVGQGDYAAANAYLDAYAVYRDELVAQGHRHGRAISVSWPLWAEGGMQVDAAMQAHLQRSTGMQALPSEAGLAALDQALSESGAGQVLVLHGERARLLGHVQAVHTALALEAQEETATLVAQEAGMDFREAAQRFLTRLFSQSLRLPPQRIDAKVPLEQYGINSILVISLTRDLEASFGRLPATLFFEYQSIAALTGYFLEHHGAALHALLGWSKAEAGAVSTLPTPSPSPAPVNAAPSLPARRFSGRLLDRFSRRNALTSGAPPDTPPVPLDIAIVGLSGRYPQARSVADYWANLLQGIDCVTEIPDERWDWRGQYDPAKGKLGKIYSKWGGFIDEVDAFDPLFFNISPREAELIDPQERLFLQCAYHAIEDAGYTRAGLAASATSGERRGQVGVFVGVMYEEYQLYGAQAQAQGQALSLFGSASSIANRVSYHCNFHGPSLAVDTMCSSSLTAIHLACQSLRQAGCAVAIAGGVNVSVHPNKYLLLSDRQFMASNGRCTSFGEGGDGYVPAEGVGAVVLKPLEKALADGDHIHGVIKGSALNHGGKTNGYTVPNPGAQGQVISQALTEAGVPARAISYVEAHGTGTSLGDPIEIAGLSQAYGASTQDKAYCAIGSAKSNIGHAESAAGMAGLTKVLLQMQHGQLVKSLHSDTLNPHIDFSQTPFVVQRELGPWTRPVLKFDGGGEREYPRIAGLSSFGAGGANAHLIVSEAPASSQHEAVPRKGPVLVVLSARNENILRHQAEQLLAHVQSYAPNLENLAYTLQVGREAMEHRLAIVAASIEQLSARLNACLQEDTLAEAVYRGEPRRSQEAMAVFGDDEELQEAVAKWITRGKLEKLAELWVQGLSIHWDRLYGQALPRRISLPTYPFARDRYWVPKNLPSIDAASTQAAVLHPLVHRNTSHLGGLRFSTRLDPQSWLLREHQVQDHGLLPGAAQLEWARAALSLALEGAKVRLRQVTWLRPLLAEGEAELHIVLRVEDDGRISFRIYREQDGDTLVYSHGWAEALGEQTPAPTLDLAGLLEGCTRHWSREEGYARLEAMGLHYGKNFQVLMSWQIGDAAVVAELRAPDVERLAGYGLPPNLLDGALQASLGLAGEQVGLSLPFAVELVEQWGPVPSPAYVVVHRAVGDSAVVPKLDIDIVDAQGQVAVRLGQFSRRSVEALADAESSALVAAADAVREWTLAPAWDIADLDAHQNANQDGFQKQGTLVLGEADWIGSSGLRNLDWEPEASRECLAERLGEQGELQQLIWAVPSAEPHAALMGLRLVQALLALDYGTRSLQLVVVTRQAQAVWPQETADPRQASVHGLVGSLAKEYPHWRISLIDLPAQITQDGHQWLAQAAQAADSRGDARAWRDCRWYRQQLVPCRLPAVQASAYRQGGVYVILGGAGGVGVAFSEYLVRHYQAQVVWLGRRAEDAVIAEQRARLGGFGPAPWYLRADATDRSALERAYARIRQRHGTIHGLVHAAIVLADRSLAGMPEAVFAAALDAKAATTENFDAVFGTEPLDFQLFFSSLQSYTKSAGQSNYAAGCCHADAYAHGLRQRRPYPVKLMNWGYWGSIGIVSAEGYRERMAQAGVASIEPPEAMIALERLVAAPLHQVAFVKTTTAQMPPLLAFDPQARIELAKASPALSLPAPVALPKPDAAYAEQAAMEARLARLLWAKLTAWGWHGAQAPGLVPAYALWHQASLRLLGEQAQPVVTAADEAILSAQWQAYVHALRDDKALGAHVRLADVALQALPAILRGERAATEVLFPQASLNLVEGVYRNNPVADYFNAVLGERLQAHVQARLAQDPRARLRILEIGAGTGGTSEGLFRCLAPHSERIAEYAYTDLSAAFLRHAEQQYASLAPYLHTQRLDIERSPLAQGFEAGSYDLVVATNVLHATRDIRRTLRHAKSLLRAEGRLLLNEIEGTSLFAHLTFGLLDGWWLARDPALRIPGTPALSWNSWREVLMGEGFRPVLAPAFEAHRFGQQIVEAGSDGVIRVQAEVANAPVVASEVAVAAPSASRPATMPTRRSHAAVNAAPVAAVATATGGARGRQARVRQAIRESVLEALKMNAAQLQDDQAFMTYGVDSITGVALVNTINTRLGLRLPTTTLFDYSTIEQLSTHISMQYAVQLSDAELEPVAAVVSAPVMTEMAASPLSESALDTVPMPSLAAEVFRAAAPVPVWRPSPAEPVQVQPRLLPIVAPGPSGSGPTYHRVWLDRPGSIDEVRIVADSLSPLQPHEVRIAVHAFSLNFGDLLCIKGMYPTQPAYPFTPGFEASGEVVAIGVGVSSVAVGDAVMAIAGAELGAHATVLTCMEQQVYAMPRGLSFEAACAMPVVAVTMIECFTKARLKAGESILIQTATGGTGLVAVQLAQHAGARIYATAGSAAKLNYLAGLGIEHRINYLEQDFEAELMRLTGGRGVDVVINTLGGDAVQKGLNCLAPEGRYIEIAMTALKSAHAIDLSGLANNQTLHSIDLRKLGRTNPAALERGVREMTRLLEAGVISPVLSRIFDFEQVQDAYRWLEDRRNIGKVVVSVPLTYRYQAPDSGERIAIEPIAVIGMSGRFARAGDLQELWQALAGGEDLIEEASRWPLDALGPDQEPYCHHGGFLRDIDAFDPMFFNISGHEAAVMDPQQRIFLEEAWRALEDAGYAGASVEGRRCGVYVGCAAGDYQRLLERDAPAQAFWGNAGSLIPARIAYHLDLQGPAVAIDTACSSSLVALHQACQALRHGEAELALAGGVFVQSTEHFYLQANRAGMLSPRGRCHTFDARADGFVPGEGAGVVVLKRLSQALADGDLIHGVIKGSGINQDGATNGITAPSARSQTRLEREVYQRHGIDPQQIQVVEAHGTGTVLGDPIEYRALTEALLDGKPTGELGTRCAIGSIKSNLGHTAAAAGIAGVIKLLLALRHRRIPPSLHFEQGNTHIDFSRSPLYVPTTLEDWPASVGGQRLAAVSSFGFSGTNAHAVIGEAPATNRVLPRRPTYLVVLSARTQPQLRLQLERILAHLDGEVPPLASLSHTLLLGRRHFDCRWACLASDLKGLRAQLAEALQKETVGGRIGGADHVPPGEEQLDPLQRRMSDYTDMISHEAARDLLEALREAYLQGLPLDWSFLFQGDGWQRVGLPGYPFARERYWVPVRKTVANTEAALASEPLGQAGMSQGDAGETRTLTWVPQWRSRALVSDAQTAAADRHVVLCELDADPALVEGAVTVRQWTHEGSLDQRYAHYAEYLLTEIQTLAKHRSRRSVLLQLVVPARSEGAVLQALGGLLRTAEKEYSWLRTQLIAVDDVAHLSERLNEEATADPTSRVRYLGSSREVLSYVPAVPACGSAPVWREGGVYWITGGLGGLGLVFAEGIARQVRCPTLVLSGRRAPAPTQQARLERLGELGATVECHALDVSEAVAVAALAQSIVARHGRLSGVIHAAGVLRDGLLHNKRREDLQAVLAPKVAGLLTLERATAGLSLDWLLLCSSVAGVLGNLGQGDYAAANAFLDAYALYRQAPYDRVTRRTRLYSISWPLWEEGGMRIDTDTQAALWREAGVKAMPSEAGLQALNCVLTQDFAHALVMHGDARRLSQVVEGAMPEAVDEEKAATLQSNPADLKAKLEAELAGMIASHLQLPIEALSRDARISEFGFDSISLKAFLKLLNRRHGLALSPAVFFEEPSIRALAAYLLREHGEAFVAIESSAQATLEAPPEPPPVAATSSSTEREQQGGAKPTQREAIAVIGLDGYFPASTDLQEYWDNLWTGRDCITEVPARRWSLDEFYTEDVEVAIRQGRSYSKWGGFIRDIEALDPGFLSGVPAKARQQLNEEQKLFVGIVDRLLVTSGYTEQRLEALRCRRVGVYLGMTAERSAPTDATTSGRNDSPGTLAGMVSRMFRFNGPSVAVDAHSASSMTAVHMACNNLLHSECDAAIAGGVSLLYPDTYRDGCQISLLASHPESRSFSEDKDGVLLADGVGAVLLKRLSTAVEDGDRILAVIRSTVAQSVSSGLSDLPKPELVAASIRENFARATVDPRTISYVEAASAGFPIGDVIEMSATALAFRAYTDQRQFCALGSVKANIGHATAASGISQLAKVVLQLQNGRLAPSIKVGPEQVQAQLRKSPFYVQQQAEDWQRPRLSIDGNEEGREYPRRAMINSMGHGGFYAGAILEEYCGPVLEDQ", ], "9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6": [ - "9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6", - "9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6", + "AXA20097.1", + "AXA20097.1", set(), + "MSAEANKAIVTAMYEALNNRDAKGHFGHMADDVQVTYFGNHRFSRTFHGKQDLFENFTKHFMEYLEGPLDFRVGNIIATDDYAVIEGQGIGRTKDGQDYNNVYCIVMRLVEGKVTEIREYMDTDLAKRIFG", ], "MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm": [ - "MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm", - "MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm", + "AXA20098.1", + "AXA20098.1", set(), + "MKDLTQGAVTRHIVSMAVPIGVGAVFQSLYYLIDLYFVGCLGSDALAGVSAAGNLSLLVMALTQVLGAGTLALMAQAAGRKNEDQARGIFNQALVLSICSGLVLLLLGYALTGVYLRSTSATLVVAEQGQRYLYWFLPGMAFQFVLTAMASALRGIGVVKPTMMVQLMTVVLNIVLAPVLIVGWGTGYAMGVAGAGLASSIALAVAAPAMAWHFHQLGHYVQVRRELLRPRWADWRRLLTIGLPAGGEYTLLFLYSAATYWAIRDFGPAAQAGFGAAARLMQVLLLPALALSFATGPMVGQNLGAGMAERVRRTFGAAIWLISSIMLLACLLLLWEGETLLGRFATGGEVIEQGTHYLHIACWSFIAQGVIFTCSSVFQGLGNTRPALVSSVVRLLCFVLPVAWLSARGHFPIIAVWYLSLASIFLQALLSLWLVRQEFRLKLAPVAGLPSMQAIRR", ], "Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn": [ - "Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn", - "Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn", + "AXA20099.1", + "AXA20099.1", set(), + "MSSGYQPDAVQARGNPARGGTVFMFGGQGTQYFQMGRELYRSHPVFRERMDGCDALIRAELGYSLTEVLYEAGHTSSTPFDEILYTHPALFSIGHSLGEAIRADGIEPKAVLGYSLGEYIGLVAAGCLGWEGGLRLVIRQAQVLAQHGAAGGMLSVFAELEQFWQRPDLYTGSQLAVVNFSGSFCVAGAPEAMEAIKRRLDEEQRISTLLPIRFAFHSSGIDALEQRIRGLTADLRIKPGRVPAYSCMLRRAIGTAEFADPTAYCWRVVRDSVHFEQTARRLAAEIPVPFMVDLSATGTLATFIKYARIDGVSYSHCINQFGRDLNEYKSLMQRFAV", ], "nk3UJUyLaWr8LochtohJ9L6eugdChZL9": [ - "nk3UJUyLaWr8LochtohJ9L6eugdChZL9", - "nk3UJUyLaWr8LochtohJ9L6eugdChZL9", + "AXA20100.1", + "AXA20100.1", set(), + "MSSVATVPTTPSYTMRPLLHAVLFLRFRHARVPLIGQRIRLVRRGVMARLMQLPVTVVENRSKTVATTGGSLAENVVYRRFKTLTGNCFWGASHRLAGAQGHRSRRRNQPHGGPRSSIIRSYRLKLCP", ], "du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL": [ - "du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL", - "du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL", + "AXA20101.1", + "AXA20101.1", set(), + "MSAIPVHASPFLFELLTNALPDVYQSYTELRRLGGVVRDSSGVILLARHDDIVKTFADRRFTSVGRQSLASMSPALQEKMKGSRLFELLNFRDGDSHRQARRVLAGLFNPDTMEWIRVEIAAEAGVLMERWQTGEADDFVAAVTAQLPMRALAMLIGIEEVGLQDFFLRTRNFGAWLSASVFSEEGLSLVADEFVWARGWLRAQLVGKPLFDAVSDDTRENLLGDLLLVLVTGYDSSVALLGNGLATLVSVPALRDRLARDAGLSLRAAEELIRYDSPAQVAFRLALEDVTLGEHRIAKGEFVALVNGSGNRDETIYEHADQLNFDRPKQRALAFGSGPHACAGAALAKAQLTGFLDGVRPWLPHLTLDDATAPPSQHGLLRYRTHLMLRHTVV", ], "5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP": [ - "5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP", - "5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP", + "AXA20102.1", + "AXA20102.1", set(), + "MAYRSTIYLDWGTFFLIETPAASIQPLLPASVRPLLAASGAAILTLNVVHFLAGGEQVDLPANHEIDIGVLVELDNSEFEADLPQASVAIHVLKVASTARGYVDLCQNTGYRVIDPVALAFEINPATLTASVCDADGPILGCRQLDLDLEYDEFRRIGQDVMYDAERGIHRANYIFSGKGLSRPMTNAMELRVHAHPFFADLGIEPGVLVCLDQFALKPSSRSSLAFYRPNQVEMDETVQAEKD", ], "iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L": [ - "iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L", - "iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L", + "AXA20103.1", + "AXA20103.1", set(), + "MSTRMFITGGACGLGRALAQRYARDGASVCIGDVDDAAGRQVVEALLAEGATAHYLHCDVTREPQLHEAANWLRTQWGGVDIVINNAGVAQIGGITESSLEDWQWAVDINLLGVVRGCKAFIPLLQAQGGGKLLNVASIAGLLYMPKSGGYNATKAAVIALSETLQLELHDSGIQVSVACPAYFRTDLARNMRASNAQLRQRTHNLVEQARLGSQEVAELIHAGLARGDTHILTHPATRIAWRLKRWLPYRWYLGIARKQIAKAGTAAEDAPA", ], "WbViYzQw8y-XfCQMgQXkedGduNMJPa14": [ - "WbViYzQw8y-XfCQMgQXkedGduNMJPa14", - "WbViYzQw8y-XfCQMgQXkedGduNMJPa14", + "AXA20104.1", + "AXA20104.1", set(), + "MSFTLQGIPVSRGISIGRAYLIAPAALDVAHYLVEAQQLEAEIERLRAALLAVRGELDVLRSDLTEDTPSEVAAFIDVHSMILNDALLVQETIDLIRVRRYNAEWALTKQLDVISGHFDDIEDEYLRERKADIEQVVERVLKALAGEPSASQALGGATGGKDEMIVVAHDIAPTDMIQFKNQAFHAFVTNLGGHTSHTAIVARSLGIPALVGVQHASALIRQNDLIIVDGERGIVIVDPAPIVLEEYSYRQSEMALEQHKLQRLKFSPTQTLCGTQIDLMANIELPDDTKTAVEAGAVGVGLFRTEFLFMNDANALAEEEEQFRAYRRTVEMMNGKSVTIRTIDVGADKPREEGYETAPNPALGLRAIRWSLSAPQMFLTQVRAILRASSVGPVKILVPMLAHAQEIDQTLNLIREAKRQLDDAGLAYDPNVRVGAMIEIPAAAIALPLFLKRVDFLSIGTNDLIQYTLAIDRADNTVAHLYDPLHPAVLHLISGTLRAAKRAGVPVSVCGEMAGDPAMTRLLLGMGLTEFSMHPSQLLVVKQEILRAHLKAIEKSTADVLASYEPEDVQLALKQLAAAKPKADLAA", ], "4_8182J88axMDpFJBZI6kLNJAu8Ittm3": [ - "4_8182J88axMDpFJBZI6kLNJAu8Ittm3", - "4_8182J88axMDpFJBZI6kLNJAu8Ittm3", + "AXA20105.1", + "AXA20105.1", set(), + "MIVPMKNSDLSLPSKLDAAILRGRDALAQRQNADGSWRFELESDATITAEYILMMHFIGKIDDVRQARMARYLREIQRLAAHGGWDLYVDGAPDVSASVKAYFALKAAGDSEDAPHMARARETILHLGGAARCNVFTRILLVTFGQVPWRATPFMPIEFVLFPKWVPISIYKVAYWARTTMVPLLVLCSLKARAKNPRGISIRELFVTAPEEERNYFARGGFIRNLFLYIDRTLCSLDALIPKALRRRAIRHAESWCAERMNGEDGMGGIFPPIVYSYQMMEVLGYAEDHPLRRACEDALEKLLVERPDGSMYCQPCLSPVWDTAWATMALEQARAVPDTREQPTVSAAQLQVGITRACDWLAGKQVTELKGDWIENAPTETPAGGWAFQYENPYYPDVDDSAVVAAMLHQRGCAMARLTGTDPYTEVVSRGLDWIRGLQSRNGGFGAFDADCDRLYLNLIPFADHGALLDPPTEDVSGRVLLCFGVTGRAEDRSALARAIEYVKRTQREDGSWWGRWGTNYIYGTWSVLAGLALAGEHCSQPYIARAIDWLCARQNADGGWGETNDSYVDPSLAGTNGGESASNFTAWALLAQMAFGEWQSESVQRGIRYLLSVQQADGFWWHRSHNAPGFPRIYYLKYHGYTAYFPLWALARYRCLSQAHVATSASPAAEARRGVVL", ], "IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte": [ - "IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte", - "IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte", + "AXA20106.1", + "AXA20106.1", set(), + "MNFDDYCQKKAAPPGSSIYYALRQAPLTSQGALIALFALRRELEEATRETSEPAIGQTKLAWWRKELAALAEGQPSHPVSQALALHASAIASDHAVLQALADGFAMDLEQTRYLDWPNLRRYVERVGGGFAGLVARATTGPAIPADIAPAWAASLGEGLQLAQIVEDVGDDLRHGRVYLPFDELQRYEVTSVDLMHRRYSPAFRKLMRFQVTRARETLHAALEAIPAAELPAQRSLRALAALALATLDEIEGEDYQVLHQRILLTPIRKLWVAWRAAQRRY", ], "ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x": [ - "ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x", - "ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x", + "AXA20107.1", + "AXA20107.1", set(), + "MASMGIISGNTNGLRVSEVRHTVDDADANGDLGRLRGDVTRPETVAGEGLEPIHRILGKRSPVVATFLLPFSTTVTGNCINRAIMPRRTGHIRWPMNDTLAWRNRRNSTACSNGRMAWLGVVGTITANDIDLFFAWNLVEQLGQSITVSHILIRH", ], } assert {k: v.sequence for k, v in sg_object.proteins.items()} == PROTEIN_DICT diff --git a/tests/python/classes/test_mibig_gbk_parser.py b/tests/python/classes/test_mibig_gbk_parser.py index 15cac39b..5c1b7c9e 100644 --- a/tests/python/classes/test_mibig_gbk_parser.py +++ b/tests/python/classes/test_mibig_gbk_parser.py @@ -77,8 +77,8 @@ def test_loci_parent(): sg_object = SocialGene() gbk_path = os.path.join(FIXTURE_DIR, "lagriamide_mibig_bgc0001946.gbk") sg_object.parse(gbk_path) - assert isinstance(sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent_object, molbio.Assembly) - assert sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent_object.uid == "lagriamide_mibig_bgc0001946" + assert isinstance(sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent, molbio.Assembly) + assert sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent.uid == "lagriamide_mibig_bgc0001946" def test_loci_features_sorted_by_midpoint(): @@ -117,8 +117,11 @@ def test_parse_features(): ) # fmt: off temp = {f"{i.uid}_{i.start}": i.all_attributes() for i in sg_object.assemblies['lagriamide_mibig_bgc0001946'].loci['BGC0001946.1'].features} - assert str(type(temp['5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP_92083']['parent_object'])) == "" + assert str(type(temp['5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP_92083']['parent'])) == "" for k in temp.keys(): - del temp[k]['parent_object'] + assert isinstance(temp[k]['parent'], molbio.Locus) + del temp[k]['parent'] + assert isinstance(temp[k]['protein'], molbio.Protein) + del temp[k]['protein'] assert temp == {'-l7xLyFZbiZENPLq_GML8JyTRF1Srawr_2205': {'description': 'transposase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '-l7xLyFZbiZENPLq_GML8JyTRF1Srawr', 'external_id': 'AXA20088.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm_87147': {'description': 'MATE family efflux transporter LgaH', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm', 'external_id': 'AXA20098.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, '5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP_92083': {'description': 'LgaM', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP', 'external_id': 'AXA20102.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn_58398': {'description': 'enoylreductase LgaF', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn', 'external_id': 'AXA20095.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl_1190': {'description': 'sigma-70 RpoE', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl', 'external_id': 'AXA20086.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn_1915': {'description': 'competence protein ComEC', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn', 'external_id': 'AXA20087.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte_98026': {'description': 'all-trans-phytoene synthase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte', 'external_id': 'AXA20106.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'WbViYzQw8y-XfCQMgQXkedGduNMJPa14_93904': {'description': 'phosphoenolpyruvate-protein phosphotransferase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'WbViYzQw8y-XfCQMgQXkedGduNMJPa14', 'external_id': 'AXA20104.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, '9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6_86717': {'description': 'nuclear transport factor 2 (NTF2)-like protein LgaL', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6', 'external_id': 'AXA20097.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn_88580': {'description': 'acylhydrolase LgaI', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn', 'external_id': 'AXA20099.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ_13790': {'description': 'hybrid trans-AT PKS/NRPS LgaB', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ', 'external_id': 'AXA20091.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0_2828': {'description': 'hypothetical protein', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0', 'external_id': 'AXA20089.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L_92859': {'description': 'ketoreductase LgaK', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L', 'external_id': 'AXA20103.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'AStsOnOU5ZWxURs9PrTiWjddkuQXfanl_3132': {'description': 'hybrid trans-AT PKS/NRPS LgaA', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'AStsOnOU5ZWxURs9PrTiWjddkuQXfanl', 'external_id': 'AXA20090.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'DTee9G4M8sEfnM4HaPfI37rT74pq7M_G_57289': {'description': 'acyltransferase LgaE', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'DTee9G4M8sEfnM4HaPfI37rT74pq7M_G', 'external_id': 'AXA20094.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL_90836': {'description': 'cytochrome P450 LgaJ', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL', 'external_id': 'AXA20101.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'nk3UJUyLaWr8LochtohJ9L6eugdChZL9_90318': {'description': 'transposase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'nk3UJUyLaWr8LochtohJ9L6eugdChZL9', 'external_id': 'AXA20100.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh_59909': {'description': 'trans-AT PKS LgaG', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh', 'external_id': 'AXA20096.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5_33004': {'description': 'trans-AT PKS LgaC', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5', 'external_id': 'AXA20092.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, '4_8182J88axMDpFJBZI6kLNJAu8Ittm3_95865': {'description': 'squalene--hopene cyclase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '4_8182J88axMDpFJBZI6kLNJAu8Ittm3', 'external_id': 'AXA20105.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'RyDIaUZc_b21_kQalx7J3yNO4l5f-439_53400': {'description': 'trans-AT PKS LgaD', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'RyDIaUZc_b21_kQalx7J3yNO4l5f-439', 'external_id': 'AXA20093.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x_98927': {'description': 'hypothetical protein', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x', 'external_id': 'AXA20107.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}} diff --git a/tests/python/classes/test_socialgene.py b/tests/python/classes/test_socialgene.py index 4d7fb32d..70d72fd3 100644 --- a/tests/python/classes/test_socialgene.py +++ b/tests/python/classes/test_socialgene.py @@ -13,7 +13,7 @@ ) -temp.add_assembly("myassembly") +temp.add_assembly(uid="myassembly", parent=temp) temp.assemblies["myassembly"].add_locus( external_id="my_locus", ) diff --git a/tests/python/cli/test_export_protein_loci_assembly_tables.py b/tests/python/cli/test_export_protein_loci_assembly_tables.py index 47a70abd..48764c30 100644 --- a/tests/python/cli/test_export_protein_loci_assembly_tables.py +++ b/tests/python/cli/test_export_protein_loci_assembly_tables.py @@ -35,9 +35,8 @@ "loci", "locus_to_protein", "protein_ids", - # "protein_info", ] -hash_algo = ["crc64", "sha512t24u"] # , "sha256"] +hash_algo = ["crc64", "sha512t24u"] include_sequences = ["True", "False"] @@ -72,7 +71,7 @@ def test_gbk_parsing_and_flatfile_for_neo4j_creation_collect_tables_in_memory_fa # DO NOT REMOVE, used to create truth files def create_files( outdir="/tmp/tempsg", - tg="/home/chase/Documents/github/kwan_lab/socialgene/sgpy/tests/python/data/test_genomes", + tg="./tests/python/data/test_genomes", ): from pathlib import Path diff --git a/tests/python/compare_proteins/compare_proteins.py b/tests/python/compare_proteins/compare_proteins.py new file mode 100644 index 00000000..5a6d0097 --- /dev/null +++ b/tests/python/compare_proteins/compare_proteins.py @@ -0,0 +1,48 @@ +import os +from pathlib import Path + +import pandas as pd + +from socialgene.base.socialgene import SocialGene +from socialgene.compare_proteins.base import BlastTab_COLUMNS +from socialgene.compare_proteins.diamond import DiamondBlastp +from socialgene.compare_proteins.hmmer import CompareDomains +from socialgene.compare_proteins.mmseqs import MMseqsEasySearch + +FIXTURE_DIR = os.path.dirname(os.path.realpath(__file__)) +FIXTURE_DIR = os.path.dirname(FIXTURE_DIR) +FIXTURE_DIR = os.path.join(FIXTURE_DIR, "data", "compare_proteins") + +sg_1848 = os.path.join(FIXTURE_DIR, "BGC0001848.pickle") +sg_1850 = os.path.join(FIXTURE_DIR, "BGC0001850.pickle") + + +def test_DiamondBlastp_compare_proteins_dataframe(): + a1 = SocialGene().eat_pickle(sg_1848) + a2 = SocialGene().eat_pickle(sg_1850) + z1 = DiamondBlastp() + z1 = z1.compare_proteins(a1.protein_list, a2.protein_list) + z2 = pd.read_csv( + Path(FIXTURE_DIR, "test_DiamondBlastp.csv"), dtype=BlastTab_COLUMNS + ) + pd.testing.assert_frame_equal(z1, z2, check_names=False) + + +def test_MMseqsEasySearch_compare_proteins_dataframe(): + a1 = SocialGene().eat_pickle(sg_1848) + a2 = SocialGene().eat_pickle(sg_1850) + z1 = MMseqsEasySearch() + z1 = z1.compare_proteins(a1.protein_list, a2.protein_list) + z2 = pd.read_csv( + Path(FIXTURE_DIR, "test_MMseqsEasySearch.csv"), dtype=BlastTab_COLUMNS + ) + pd.testing.assert_frame_equal(z1, z2, check_names=False) + + +def test_CompareDomains_compare_proteins_dataframe(): + a1 = SocialGene().eat_pickle(sg_1848) + a2 = SocialGene().eat_pickle(sg_1850) + z1 = CompareDomains() + z1 = z1.compare_proteins(a1.protein_list, a2.protein_list) + z2 = pd.read_csv(Path(FIXTURE_DIR, "test_CompareDomains.csv")) + pd.testing.assert_frame_equal(z1, z2, check_names=False) diff --git a/tests/python/compare_proteins/hmm/hmmer.py b/tests/python/compare_proteins/hmm/hmmer.py index 3d8bedb0..e482450a 100644 --- a/tests/python/compare_proteins/hmm/hmmer.py +++ b/tests/python/compare_proteins/hmm/hmmer.py @@ -3,7 +3,7 @@ import pandas as pd from socialgene.base.socialgene import SocialGene -from socialgene.compare_proteins.hmm.hmmer import CompareDomains +from socialgene.compare_proteins.hmmer import CompareDomains FIXTURE_DIR = os.path.dirname(os.path.realpath(__file__)) FIXTURE_DIR = os.path.dirname(FIXTURE_DIR) @@ -49,7 +49,7 @@ def test_CompareDomains_compare_one_to_many(): a = CompareDomains() p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = list(sg_object.proteins.values()) - a.compare_one_to_many(p1, p2) + df = a.compare_one_to_many(p1, p2) expected = pd.DataFrame( { "query": { @@ -71,7 +71,7 @@ def test_CompareDomains_compare_one_to_many(): "score": {0: 1.5, 1: 1.14, 2: 0.78, 3: 0.6, 4: 0.52, 5: 0.18}, } ) - pd.testing.assert_frame_equal(a.df, expected) + pd.testing.assert_frame_equal(df, expected) a.compare_one_to_many(p1, p2, filter=False) expected = pd.DataFrame( { @@ -161,7 +161,7 @@ def test_CompareDomains_compare_many_to_many(): a = CompareDomains() p1 = list(sg_object.proteins.values()) p2 = list(sg_object.proteins.values()) - a.compare_many_to_many(p1, p2) + df = a.compare_many_to_many(p1, p2) expected = pd.DataFrame( { "query": { @@ -335,14 +335,14 @@ def test_CompareDomains_compare_many_to_many(): } ) pd.testing.assert_frame_equal( - a.df.sort_values(["query", "target", "score"], ignore_index=True), + df.sort_values(["query", "target", "score"], ignore_index=True), expected.sort_values(["query", "target", "score"], ignore_index=True), ) - a.compare_many_to_many(p1, p2, filter=False) + df = a.compare_many_to_many(p1, p2, filter=False) # 22 inputs * 22 inputs = 484 - assert len(a.df) == 484 + assert len(df) == 484 pd.testing.assert_frame_equal( - pd.DataFrame(a.df["score"].value_counts()).reset_index(), + pd.DataFrame(df["score"].value_counts()).reset_index(), pd.DataFrame( { "score": { diff --git a/tests/python/compare_proteins/hmm/test_scoring.py b/tests/python/compare_proteins/hmm/test_scoring.py index 0203dfd7..65db32d5 100644 --- a/tests/python/compare_proteins/hmm/test_scoring.py +++ b/tests/python/compare_proteins/hmm/test_scoring.py @@ -3,7 +3,7 @@ import pytest from socialgene.base.socialgene import SocialGene -from socialgene.compare_proteins.hmm.scoring import _mod_score_tupler, mod_score +from socialgene.compare_proteins.hmm_scoring import _mod_score_tupler, mod_score FIXTURE_DIR = os.path.dirname(os.path.realpath(__file__)) FIXTURE_DIR = os.path.dirname(FIXTURE_DIR) @@ -33,7 +33,7 @@ def test_create_tuple_type(): a = _mod_score_tupler(*[i for i in range(7)]) assert ( str(type(a)) - == "" + == "" ) @@ -74,15 +74,13 @@ def test_mod_score(): p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = sg_object.proteins["iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh"] res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", - "query_n_domains": 31, - "target_n_domains": 40, - "levenshtein": 0.72, - "jaccard": 1, - "mod_score": 1.23, - } + assert res.jaccard == 1 + assert res.levenshtein == 0.72 + assert res.mod_score == 1.23 + assert res.query_n_domains == 31 + assert res.target_n_domains == 40 + assert res.query == p1 + assert res.target == p2 def test_mod_score_no_domains(): @@ -91,15 +89,13 @@ def test_mod_score_no_domains(): p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = sg_object.proteins["iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh"] res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", - "query_n_domains": 0, - "target_n_domains": 0, - "levenshtein": 100, - "jaccard": 0, - "mod_score": 0, - } + assert res.jaccard == 0 + assert res.levenshtein == 100 + assert res.mod_score == 0 + assert res.query_n_domains == 0 + assert res.target_n_domains == 0 + assert res.query == p1 + assert res.target == p2 def test_mod_score_same_hash_with_domains(): @@ -119,15 +115,13 @@ def test_mod_score_same_hash_with_domains(): cpus=1, ) res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "query_n_domains": 31, - "target_n_domains": 31, - "levenshtein": 0, - "jaccard": 1, - "mod_score": 1.5, - } + assert res.jaccard == 1 + assert res.levenshtein == 0 + assert res.mod_score == 1.5 + assert res.query_n_domains == 31 + assert res.target_n_domains == 31 + assert res.query == p1 + assert res.target == p2 def test_mod_score_same_hash_no_domains(): @@ -136,15 +130,13 @@ def test_mod_score_same_hash_no_domains(): p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "query_n_domains": 0, - "target_n_domains": 0, - "levenshtein": 0, - "jaccard": 1, - "mod_score": 1.5, - } + assert res.jaccard == 1 + assert res.levenshtein == 0 + assert res.mod_score == 1.5 + assert res.query_n_domains == 0 + assert res.target_n_domains == 0 + assert res.query == p1 + assert res.target == p2 def test_mod_score_input_error(): diff --git a/tests/python/data/compare_proteins/BGC0001848.pickle b/tests/python/data/compare_proteins/BGC0001848.pickle new file mode 100644 index 0000000000000000000000000000000000000000..066378f0f9b62a1a10264386eb1af1c60a738b3a GIT binary patch literal 48364 zcmb@v34AR3Ro{uX=V$vFV;f^0i+x#aY+~7J?K0T!mb$8|O0}rEs#L8KW8`kBrS6v0 zEwx%J39(rN+=PIy12~DZm^c9(vp6&45g?ewg9FTr!-T{P2e&=`2|J2)_^Uja`@TaIh_de^Wx7n`m?KHd1*Ei}%&DUT4 zeE;6_6ZN9ai}&Ag|Fv(w|5xt6=H5#V`n^H3-QC#i?H||{UD;;af>F=El!h`O~()`v+a{t~7nrDM%zgyqikY9Md zv9>ZjJ2g4?`qKS-&mNo~H1FSgR@vlz&qK)Lqm4m*=l;FNg7)Up<_}=Ei<|YiqB$(c9jrv%TFxdxzh8$?#Lc4UwbWWzEN)EssC@*ml3Z-#luy zw+GEF-t|R;UT@>5)$0#7hF?{091jK}WuY+V8#{Fo?)mMmXp~$MKRriZLy*4f-m{wB zt^2{StykT9#a^$wGqKsKx4U=LMwNTU zoul(Ei=;NE^+jRpHDT+uVe5;-)?W%+uM1lfVe9o_>kVORGHjK?)>PP<4xad#)=bd& zNNe^6kq7HXWz*ZNwz}J9bGx&Bw7I{$op-W>lh)o*)AcV!E-g;pDDvc@i2Tz1>D|N?asE#jl|zRaOV!zjFqF^sFU<7t%^U>wwIi-OPHU(=@J$e=j0MT zy>))j6K!a3*7xod=KHYA{nk%MR`caIszujK;##zFxb3xSt-b8{q#t*~)3m(d_iS&) z+Pf5VYO-{bpo>$JBIt|vn%F%LztIRL7FU*y_;|)Gm$HSU-o)_%F#H|p== zl+xeVc8@prn)Uu2brA&f}QumOJm60$yF-e`c8!Q@4^U+E2&jpfssmFd;$ z=25cMoZ8-*@AS5}mo8Vjw0N_iB_-$=fshXHZp|+7eye@Z>Kz@l#2UAndxQD}ukJVd zPs7nYvW@I_JGZU##^g%NT_~Sc4tL7+#reamIhg1)R*m^uI8~iqtBi?1f1~&di&}gY zy^Bz|)oh(_^?M)=1b)K^tK3P<_+j<@O)p3MKYIK&@sGoPxnoxw%|-vnUp(G-s6?-Lr$KmFAvXKbT85dppO;Qnq3R-Q6)+r*3xDGs;!Jw1TXSc6SSO^mMK7(|-Xu zKm5x#%2{vZ_4pubE}hP;F0R+t+Ex&nGZz!f^Baxs=~C@d&eFooa!x8aUp)%n-3g@+ zXXlM(_o63CUHrWFG!*(5Uy10uUwE5F^!nMv!rb|wls2r>nr#}ptJ94vJZR3$E;@Uq zF?*aU-Kfxu(@LRVmc-Fgx%@xr zjf$CWjO*{z&9Z{}Uv=-52Q6{N^-Zw+pb7p&dC&KocZ@rG?VYEgxZm`Ji2c2=hYV7KCd8a#fk#AL!YG$oh(s;FN^hy`?#DFaL!b1wIZ=tckOoi>Y`baX0oa@_FZY(~sBu{i)jSX-cA=j74x%9OircJ}q3!Hl=RUmutA z#syzkR15wW%A1_$AnVt?;5Jz=X6F5Q|HPb)*7x&MZ_Qfi&Q)h8c8|`F6LT%e$7Ef& zaZMK|)tXN4?49?{>SAnfXzVq6`{$d`0Z>bA$cgS|a~PxFu+>I?G$?vlCI3#7OL=YO zwo7RaR{iFId+JRa3!CX)`Cw;0o+ul&WjD$5^GT?&p?XYB-W>F%RM30z_Fny@SEQ+T zYW;@su!6qoA0X%l-*cliu1>jK%byu++S5C$%WFotSDsy*YtE&~;f0Y_T9;Cm7HE_@( zt%CC_>wQ89Y*_P7I3c0rd{3oP+X~FzA<#S=MfBZ2gxJ6HBR8(^^44nOWP5dC+Mh|o z{nfone3&0@SGS$UTzhSEJsFdE@@AQ*l+3Rk0XHE6clspR`*618-OW4AGnE8BO(x@g z--7g?@~#`DcULE}h4qtKwpA*Z)+d@Lr~Y=d;q5M`^ZV=N6SQ!o!zJNVZ{Xy}%U^X* zpnT-K@$k6ag)SOnN*{8ifAdF>_4z-3qpXwhLS`Ku*p@LJE>@1M%sSuqj@R~U{r#5P zur|kJoxC{^mQ*18ykVk8ayO!1wwlnr4P3?33E$t`Me4uTzfG-A7cOS&y`Alg`4j(Q zE^SAHsoG%bpmvs&xA*2Y#slKw&5K-8i~RWofZuNpTIYK^dk}T#`NW9+d>RojdJB^O zYW+6JX9fqW6PDR<8_}7+94$8;XS$M3PS501TVAO>=Hf~VH%l(G&8zOc5S|y>yZ7{H z!;=|=JYT#`$Yye}cRm;#+o$EdL9~6eHZeKxHr9>@C(9?5wtYUX7dK{z{yvE1G~>v`wrjfqlFTfUe*oXkV}WK87J zjS+isT1D*tRgUhj{2(&F_lItlIW7(6PrJwVMQe2~nOv&vbq8*Kxa0-F+NpEWxRklH zcr&|IQtZ|j+^oe<8;*XWj@+RGS%pdMk#;|9^!vX*h`(i$v+b3pTKPlDrcGjb8yOsC1qw3MI z>#Xi~F9j~m-xvTF7IgqjlH|@&4_r@`p@(y$0wJukgg&Y!N5|Aoo*r%c<1gR>sxP_i z0^)Xas@09x?el7*U!Lq`b0^-uH*pI+Q1zSFiE#{N-iR9YJ~zuX9pI!=AQp9lNKblu3t*MH$=g_%?e z^RhY>G%lbGP5$&#s^%LJ_Qgv#dXR~ovq|gZur*U!?96ZNoShvvH_scJTib^{XLh}F zaJkT>`5U9j!lH^MkB^|z2q8U9XcWE#Y5(p^Z(G{6*=p;wzB+Y6bUAIDm)5=da%Z7q z?3(9EeIXgccBRFeBTGp|mM=C#V?-X3qfk;i5|A`3N&HVlnM%wR{8D<`6&&aNiAMAE zpzrR?C8wLSR;_#Cl@F@z^f>FCw)>Y$K2^F|x2KhEPfkzt&U&iadP8x1n1kQ(jKp=_ zhjlW)@P4l2-9K>Kb$E^C)3eo@vDjLw?;G8V`9|-&y}5XH?9b;1Ta|IuS-27VEi5YR z_lBw0O+y{15HvCNc#I{z!lUcUc!*T}!=J--z?R-pr9HGZE9I57$T`^FFy}xgQM0~a z?q(ZCt+w6?#;{6h{zi>jSX3JI#MH!Y6DEXBd?7n=t&tAoS2jY|J`X8AY_7W)U&wf*B640hx}OE>43(<;aO z{QX{|y+^*Y*W8ziKw;HyDB{4;(*U@Q??&=}^8OqBQ82Mq4JZ8dhMiV+I+Oi_sjW)a zJ*q4=s{4zZtub6aHFdLb%qZh{&9wB2Nb)eT**@aGiTz_bG{lW=o^S5;o^~1OL%)ph z*?+uI_?5Z6#rf@{G@n>XSI@Vmo9mOiK*_;Lf5}^JHrFrLytHs**j=1dVRvR~LNaAk z>3X(pVs2ujPEVU?dwq!u*`B#ge>#o%_Uue|FFrfkEXTXq=0rMe&zmjdX#2lBaiooPi{5k(F+yOV%c`Kdh(Kp zM!OT8L-$+H({esq*rha+y{7ZGkyCoPBHem6*30#(&fk;QUfKCO{B!TQd>$=3!``n) zO!edC+wVQn+TT}=TzB00i`cJiSaZi`C#%V%x9ZJ0tBJERPcO(IcXFE3n%$F)?S5~c zhNF9r$Pa_weea(lee-Yw@ivY&A-C__ch7$BXZiDUyoXf(xOsp19!)g&9&K+7fBygE z=ND+%HX7~v(S2LD-y!m2#nZzNtLNXrpLgARZ1^n?y#G`bpOZVq5NaxphpZD(0C-^7$ z%4c-`EB}rzzVqw+q%OSk8}jG}x#Z4o$~%Tv+W9T@{)0Df%b(6~UwQ3!_{sa_&hPS1 z@O{tdd{TaVZ|C>q$Ma`&exHAp*nhM2xk?xAJ<94FH{oFKuf0>=6d75y8kx1K=|o0u zSZTRx_(ozxk)K78ZDp}tiL1V!SUj7Ww(t8@-!{vJmH4?;PNUdK6Vr38%r#?Ph~=HB z9fz48XLcMIcC})7QJhqb)T%kA;U{jCRASo++)zH$i0m}>AgJOvsMUyhV45ngJGm5FP1agPBO!`EYmk)Cr=Z<%y;@>WXnqU zu4U8=&rFRpvm+Z}5<5zaIFb)g9;fw*&cA1ahOK(9YW*fI?7wzC#&h&%UZaLbvoj~X z(*^q?%lxC$mffCkj~Dh*cX4`svft=N_q7^6ORHgLsD{0_DLvC__@pSGQbYCVIyLN_ z+^B}n)N1%;g&ICvt6^(Q4X+Wqf3aEjR=WQ3R`2j+x3X7Rnw{S`KF{mBll#j%7t4j% z&rxD~uPcn_iH~b3l=1w@NNOGw#`DC_PpdzL@yr!hsf_1|UlUL99q;NWj zh!Wa+?G@8uFaDTyU;FfDVlifvp%__a8#byHWGVI{TWn`Wndi-_U9r+6b0W_-bMz=q z0;}T3)r##K=pWk5J0r`94Ks6VNy^i<6}f3#v;8Q^(yCR9W7n{p(6rNPWLRiS>?T#$ z^ew(PHsW#|TbQTmV{lnqF%v(nnkFC1*UERKw%BbJCsvM_Wz#kiDwNn!X1ge)T`{wW z@3M_bkU5n!iA|r+aJYoHQgiZ(QHk@+b&aZNWoBASF{aSY%qX-frWHllk!4hT-%Q;Y zO^#jD%Y2`Ysl}0936LPpW5dRRxiT}sii1j&+D2{%VTuH%k(rrM^RWF)+>32T2A`JX zreVb~YZ}L4fLB2vu8+z(>*M>eZ5xp+kimD$J8dhp^4NAQ-_GMau@f=Z2tUDUV(3w9 z`^%v=ep`S>;d6x@4(?-$+?rBewjS#qw|| zab!9;m0FB1z&qkgkRCUMT(&aUP6HLWYO|efo+89<$}2Zp?%u-k+VbXUdcJ=#HRoMa z&epp7sj<4U*dTnjJrK}HSXQ4l}$E9i_xuz+3=?JMQiIG+ zu$@ep=p|>9mgOuc4yF>O=z6u5X1R?A;=(H|LnXn`6AXw=s6=KGR=f(vYGI(P9Hz-0+N@sU zV(u7*i7^^hfOEkI$ycOy6tOK>xM`WOX~$JFHX=+j(Kd4NR2%t9Z6jZCyNw*1;mPLC za(3ZPHp*v%z+5_Mu3QA?TT5qIa|>2gN4FR2FwrW)MDnCoc@-vpV`L)gkus4dKB+8Z zx(E}=`?N{vFfn{>JWQtNO_+tJPYla&nqT#sr)QNLcj;Kp)!nMafXq3=Jb56xEQ=&pO3=OXu zg;EI!pC`Mj0Vyp-kX;&P>4lqUhqX5J_imhsFId&Dt zf=YmP#kC=j-q~+>U}yh2JNu2d?Cku3Gr68t*C$Gy<(At%DId>mq*33Tm{(M zRP{Z@&Mx_Sr4BN z*U7y0V;b+?Ec1WSGQUvE{GV@?xgD^$CCIdNuRrk8jhWTG<#$^7jk<99ml39t!U&T7d;>B2XCG+UD@pb;uE9?BvZ!Y&AExR@~{MZD+aE z+4J1>Dm4;fVqk2E!-xSjf@;&CDp|AP zn#9mFj1!ko8~UM3gdi(*wVi+L0XzTC*!h3B#m?Q+*dQZmRqgJ^MsJ}V#@X^x*WR7- zyU|3gwO;JyEA?LfT(OsLjr88Az5Ms1y;P5`)4#X6diSo;zao2jzt+Dy+Gc+C>Rx_K z@8z!qMvz9X5#@2f!h_>T{SDIgIC3Ij0*jyFc&l+vx)fE&ldL3S7kxhpGsBYz377;H z0rc6bY6OS_g_vx&&+p+g`I#td@{9`rAJBDM@Ng!k}4ky$0ASGB9|eCtyo z&fn4y=N-30oQL!!&(nMP;bJdaKQZP@K0Mk>^+@gI-A~gVqwpoIC&%`3^x9Z7_I_CJ z<@5Aj{^aFeUgNVL(wDqI%lyZM%!>s+Q2O_0Bbn7B?Mr_9ip=v8W=05N_}aM4|4GaI z0xk3J+$!@!#+er>nS*Bz)2-Hj{PPz2^VA!LShDpCnxarfb9eaKxX{0*g?^C|I(Wud zwsp-)SI!z~oOds#*Ul$rw`aYhE#In|lV`_U`UWKn zT0PS0@j`jSD5Rf!#<>*#GBPrtxg|^oA*D>Z#Ia_JcSwkcGs^^5Mw0LmB;+$3B334G!;GsWVreDe z1(PJt$r30Ul19=$vIN_3Gh$kzek+1$A+f~;B_97bssP%uZ-Z{hm~A&Mhgp_Lgd+X~ z5Eo>(
    _V&+WI7uCg9oC~ zKO!3a!&{=!(Zf-Y&(M~Y4s|qt-Pm?t3*mlU-hczpurq&TOh=y>zNWVOQB7q0Z$(1! zENxoJ?MfOf*N%^~+IeDk&fVpus9DB97M$y^Fg zR?e3Dr|t9eX1`X*te4vRSRwNp3yE%aydPIfJ51dce7)|7#RopM&rp)&~k)hyw= zp+(5@YjM>|YLGr)IYi?UbHlCyYv7oCmmd+wCQ}8d#3qCcWfEaXA#DVKI@oMfsTw4I zVHHd%)FRljEF2Kri!esAWVuCF>*EUGttdI*E$~4>#Q}z3>s5RPW=$0{No+u_W7Qz4 zgjDt5s=&@kQCRrk16KW0SoM$JV%6L3DW^Bv7e~8uy<~6-K%Vt?-Np9c_#{t*+1*0b z-k`1ebB9*#|CrWyrE9-8vJdr0IdSzee-&2k-y74l(QC>+-mR^=Qdo7Qqx^rns%#(B z%JzSVvL#l;DzH?(U9CB|NyyyTuUZ~kV?n;uK<7zRAwfq!5)=r04zJBZ`Vl_etMYy` z%%ci4A%G96OXwNLVr5gq_M$Z6BtZ()tU(i_C!{1^%>{l6!w+_h(+Hjo)v?(eN<(}N z{+w6=9tHaQM!@StfjASli<0aDdVB(YkmShWXZ{!da@*gWAEg?R@p;G^< z2@rNIE63|YPzYgYmEpf*w7iDC@jF(Qs}dP>DJlXzg3_mv55~S72FJ0%9>JJLu@!i!z07uDv5Eb?j_F?2m$O+Ur!#v@w5t ze0JJvobOLJ%}(a;%p6azwfePWYoX9iO`dyySUAGf59tu1l=Huilv6#@2=tRzocF5u z`e6n>A`O*tKB1NKm4$MCsaDQEysDf}>J;ct;kB!zFDAhrm4Yh8L!n$a3_rHYER@t9 z%Lc1XxktPNwO1Cs$@cVL}t(o!D{X45DFMWLI(fh9t}mjyV8 zGlGZ5r)3qJDpTZFsUCxA3=c0xSt^Pw!8VjjXs*MRgGMZ`pt3&tzy|*^8~ls6Z1Bm} zLL>GsRxjq`lZ}mLmT$~Aa%*O()7;n!)6MxS(K+Z1-OtIt8*@K@YqZ1a(RJ?U3rS0Sdt;A569o0a)+jMTv>5TJdOEi5TU#~5su@RupE)AGe;DX&xAq@Ye! z(@e{0SfSov*}zykaVTp9u3s)ER_Y*j9z-b_yU-vuAugdxLwJLU33F)&prRnvKp=5a zwY>zbp$ZdcG!5`u=)TLMB-B^nc|zaKsAbkL4^)v1Kh3CiAR=xzWuYiOL>a2&z}7^r zX;sRuQs*M5>YU? zl8_Rn)GG-sRHVok#3SI~@v>kV`~zU0C;brZpn%HyNi_^$;dlfKR?u{8FpU``tU164 z1&AFg5Y&D=@&by!D2t)aHlj)|YCrzkMk7;A8rW)nfnsk-@!xwAk9f zMtJg2NAT+i=p7xh|9KIz%UbNeH72%{yOlC2llvYqIgh*n)-S7iR7`2-siwrx43j}! zY1aUk;aSNwpsI1pb`qy1RV1_%_X~cfy@%=_sSbV+ViR`R5LOZ01>TcGWKuzOP<7Wy zQ_wqkfe*|R>H&boo5?L`4#IcZ6&MNd9jONP#nU)+XlY`-!kMrGWy{U!LK0E|!-}YK z3msjS4k^l08TJ>Dh5S+7}$%?h9r%PX=on@m9s1thA1kxH}ca?W0l@Zkessor&YZu70AhtBYq2G3AqE z3VI!;d_Y03V9L>J+Byc9rqeR+1@)k z?Ju0J6h7=s+tptl+STUYSKqA*XumMBEA{BwK=uxWjIRr1MFZHU6n1s4?TT+2xdX8) zwK2-B{)V)XpwAJ8Le5Gd#AZ)P1^~2TGG)R6ibanfoEj)Z;uvs)gq$L&Kp)~A=@}(k zAOf945DM@E(kx*Fr2m%mTu}^=jz;==c!!i;K~I!)6A};DG{O>a#FoMlQOc&_fz}E* zIZ;Bx%qc5_J;t0Ls1>Y}Dm1}}77vIC{s|)&Ad|;V6ovFP0exs9 zsraE80RuzJ&jYNAREooSMjVtW@!C~@Iy{4iQNjo~;Axldl2QfTV$!e(Pz0^f`a%44 ztq`rJ>_vW`@-92a8)19Cv@im?a67b8Q4xn2!k0)}SSmpr7ev1bZ}B4R5)e;e%EJW& zqzo`E(%uDPkZuKB33&uf3Z%S}J7D>UK!XXOj{Y#p9wfXZxFNY80*EW1|Hm@Z3h^53 zQI&KRSdy@K(wqi9;d9w5so~M;Xw!Cy58$H%WCg&aRKNwE2-%Xju$oR)oC%9vq5LC` zCkm@@Lmr)Gq>T6oFldDm9vL|H?%ROF8a;gJ+q7wF+A}Q-RkPf~*Ob3_t|sH& zuF-=|P`t0aVs@=_oq~Q%P-FR6V4#_}Yc8H<6tn4?^tJBz13owi^BCqvCMyNQ-fM# z{wjDVeu zoJ|{x!`{B40P+<>nF|>d*7>86YFsvAjN~dBF+OmkPcC@cww77fn!GWYg{S>ptr~BK zoq$4xg`hVkS0z+XifdK!l$Ij`x#Wly8bBFVBLJQRL%J$$np|*)_%4aVcx^y?>^eYu z0FZQQLU52q(kp{gP{vALM?WmQD0#3Ql5v6`1fl3g%9infURdbG(k*(iQma=x%%SL% zd@t&iXPmRi!CExCoA00PH_jTTgjwO{jAw9{{gWRb0m>(y6ipc6;9Qw& z(SZNNM~l~#Vmw>h--%x2-`8<(XDserqZxb6>EpQGH%bSGgZ1d-xN9%A))p$q;Zi+0 zc8rpiI(VKgI|OeZ!i={T6*{FGs${PkwtwwX?2i|MYTc+UiljyJ$lVu@i~UM1_SY)0 zgU>IBjlX(3%!rnc54+wV-ans-y_J5pT-rH5aGK_z61d(>?iX_ZSpknmayyUdR7lDF z|BrN<2iKo>m{Te}eI3qtjh6eJTJBFPh z@+C!9r@i!*<-ViE{%cz77Zj*&drVEl7!A!M{dLuA?%-`n-vRoy3Ydpxb4dgo5)MMX zlm&>sIA8)ie@jG*rJpc^>6X?NpSptJC_zaQV&TKZ3s)frF?4K?5+$#|tx%NVb)koW zK2R&dM+4SrY~?LwLL~493x~GhSNYEZX$V&5As@>-f!;BFj1q|v8P6|$Y^sA1UJAMy z@Isz%)Afd@hy?N@fdpvzH1a7)pF|SsLI53(i11DDLbW6pMjO~bKOI<%&<)p&BcWvK zS|m7W=0Na61Vm+a9dyFcFCuOCs47r&tV=)^%Vh1eb7r^axhKsr*N~lk`&y zozEhg)E=lTRY`#mjE*B%5F#zdWW*ukpfjm12q{-{aO84=#bs-FbEG=t{Th9#wEuHC z7EzwaO<{l|DOoKDow5`9kyEv*c0dW)r*^>d(#}NxcH$V0W>}v2)U4_eomD;S_N;2w zTd#HS`t$3j7mJ(ojncvXxorzs!p#h1ZbQWJG`lA7H6wtxf~W1;>BlA+VTQsF1U94*oU zMmq@7n0j|!8@5a+nn7f?}vL0W9PIt zoAoNSm8op6vsXLKX3vUH`6|5&UtR1%a#wAG-i6;B?ZRch;@u~Tu%bgwTm&G!3x4s_ z_%1Z`F1$+b!dG401$A|57rr{EkdBG(5kCX{%a*XF;&n**SQIiw8&`oh0&c4Ru_5w4 zO0DDuLXPnNQ0i#{NM$WY7z zUh%KKeZ{Ai2L0x=lTK`}9EWqWGo{e=))<-Wq{nT0^C&1*{5$lDf23IPVKGz%%j=4# z4`36&*devzqu11m7aZN9Uh%)JSNtQFD}D_$vGcPsesbb`XKR0@##H;s#k7mo1QTT| zKXa;u%-^PE{;5Ld8_s0i$SXeaxF}5lOs>kT+c>;AE%Uc)nSbh1X0cVZBT7qtiDcIe z@UCW6s+tK&AMpz0X%ZvYYs;2BwnTRbejF5pz4kd2Q4Kh1fs-SVhSMe>k>!!rSmh51408P4IAZ`!N(YHK%P&hCjeN& zsJxr@1E>bPdXQ0}L#;hBq>y$UdWfo|PO%{++6t#2#dpxq6BenG`v4>kfW*tUK;p^6 zARP3qMr&`oUhkQQ_5I1I?WNQ8dV2=E-XHdVzfbSrZxuV}y=~0SukE|~+yazoJ0HGZ z+xhUdaXZ({u=hT_gTH=d2V1U2I>HA9(i!iCHD#GBY~}~jV~eLGTvOD>s!2e^_VK~8 zkA~PL&7}Bdw#F|Li*w9eC9zn0&Z!&<5>oi4jt)iw2MOcK>cSvFkFdDXH393whtW!b zbAjlo;T@E#k;(^@CEr2ch#V@lEU*cR!0|i^>?Fk&OeU10Qzf-^y^J^GK&bY|9z1O| zgJnT6T*@VQCpuavvXODa2zT z7@w7L>I|-#Yv4}?6GHPcx|g8&ru4)Bsz7?+1_iq8z@d4Cb;A4c#nej#et{1V6NH2A z7EH(YKcH9tJ9_m4x9HXJYB%Uu{^rh7-aZ|iB}d-zM)ksp7Pj4Bu|Bg_c!mP|Qxn%9 zx-+L@fDRxZ8U+yb=sJ$E^E#a?D8*7b$zQtfNT*uCXA~S`f&D-G0TBKDTD3l3REtaI zj2WF0M9`|7(I^XxoYf+r5r-1#`2|U_+=7NsCUzc_3RPc@D;%{E0^FoqL$p!IDwa}e z=%@~*O`4A4G6hL?lTHod3XAM{E{eyIEe>;0dxe*V$BqDj()U5f4o7?-@7R552_jTg z%$!m$Crz@v5+WUa0A#}!ta*aAuyI^Klu$JTlL@Q_GvuPrH4_Bx(h^kW>(CwaO3*9P z$kUwL@<4}>gFsAv$W<>uv`vkC5~M=a@=`6PfQeL{SM{gt#x$LVrLD z{U`qmgbtolSl?U!9I5}?DDxf#UoCazu`%m=;)>Mo&`bSuO6uT6MG9+=S>H8_?M*I+ zm0-u%nAja`$EQ1;YNLE{l6I}7{Z7_62n(n2Y^@-}!$@|n*X{wn;8N-b*4iGIdi0uF z>*r{x->28QD6ZbFAWgr$Gj%j`vgmQ}U?N&-Ek#CH+x7a^(#%CCZ51+Wt}*zU;ZlEP zLBuJc4@QnoJ-W`(edYI$Daci!i>?#qMSK$b^R(3RO;?I3%4$_HIJ;zn}dy*ZvzQm)6h=f+!OlWQ-a#13{ zcM8%X2wA+@4qzNia2OqBQJ72lfpgh5y>fLvQo|&x>uow zV3=}%*75x?hj^5R06_`hd7C<_0Vbfd6Vj2wDO?V&vJN0az7TK8hslA~ga$ep)nL&| z1@}M%rICuVs5E;Lx~a6%+ey5o0Z@Xz7II|bZ=zJ+RVFOJw_UjUQ-71MrAP4Or>;B4@C(0rYpLqE>P*7JPM2|4Lugk zAgq*}+^IoNkXjiYMVWwtE#(8LL75h(tH^RIU^CBz%V3ucNDHdEl#A4HW(vjZCB05$ zV-?4-pngKm!uqu5R#EJ- z$5Y8;cVqgX1i6WJu)6^6Fmt>xaHX#p&W5l{kalt-B%_c++rY~zw01~KmQDYX&~55y zGYxW;q)f<_o+T8A90OL#Mrw<;N@8`Hlkb=Va|$50$UOyGQYe9^qF93H)fDE%@~OZ{ zGa!ab8z&uHcG0H$_N%(n`dXmbSKXpDM&DjB&gK`~1*VBq=Vu4b`F^9`Y#ldS^|R@j znJZp4aEDrR`W0FWl*f9pa%!nQ~Lt(Xxfdsuo&P%>Qu4wC0-NV9!;8xyj}8 z<>vOT(K_wUH1gT{ME69>17|e(6=L^vB&G#eBg(i)M3mT=RN zC@mV|0ns1Y#QCWPHuKAFNU{fK!Ffffh>&bRxCu(U6yGQ|+X+3yOyJ=3A!?RzHiV{k zE2My0aRY;PLUcOe@UfKO7yyFqU;tH$En!Nok!At865k7*%W=5aRekMb1kx#-c@UF; zN7_)(bkYJ-&O1wUprT1=@Z-?572@vDc1qt&*|2ML!^82GC7VX|;fzTtC|T3lNSm3| zfF#Gj+mbeumtmoNw9qQj7mSi8DfMF90_ADkI?S)?vZm#P6xx+#qstg}nSf++a+{@+ zrc~k;*2s5gH~KHcSzdLc|B|P0qdzRK-0Vj8cURKm{(3j$W>pI%*Y6*1thy&#Q(<=^ zx=1F6lVyq!zCYBgi*MJOqBQGkMh2lCU1ty%_s7tZ`c?iKA1xDX6`J*^)~wwz&AP@Q z9DB~RXHPc0K_flwZcY2?>Q=Am9Y=fl1u@Sl3}R2Cr5`Bl!4cmN(o|AkR#(&`CH38B z6z|Z^-~KUul{$4FzNVaiK^QG*-`~~9^an0o|1~mSEUm3nJIQ9ptFG=%E|pGV=WsAJ zv47&(dxz#ladbO)et}Hu!?iY!#+=V(5^#y9M`1}zefOjKDs>WY1(_BNbiR^07#3G( zNxKrVlrpK1b%^p{iIM_QdZ2E=AP+3T%AphhEYJ>+1=twO0Avo@jiosR5D0vqHhYkd zA2T%z9R|-xZ@pkC(Q#@2650vZE*Jx(1;bA}3Iz;$Apl4;vSNm|)J!Fq!0!T!jGS(5 z2@H(3qN@p*#0$KaZ^hoE-7?8)(3(&iG-p8+A+}Hwv@H;DQXE0Vq#Jti2{lY)QHF`V z*JxT{){Pv7Ko}t4G&abiZILu3;aF4yV@@7X4Cf(RatR>>BHMNQ-W1i+)dG(Qp31nAFN5RGs&- z@ObwnS0db-)Z2@C>FBj($;?Kux?QeK zw$`)8aGKbU>!tRGOC9~Xc81Cg{my8q)uZc{I{MI9R)3xI^uAuZp`XwzZH=w8kibfn zls&I2dxo14x=UIDshQCxhG{vRUx4rR@|x|%ro~}17d{6Fm;_`K8MCSQBa|B2Oyt9K z31NyJg#V*upPDF@L28Bsb&g@`hWI#?^E#+M@i4(eRRNVZ@Cia3gGaFy>>Udwx!@gw zcx~Pmlwp_fI?{SXXLQ2MJt1vqS|bok+9dt`cI44IMx?e0|H#Iy6F;veHI9QqC*ln_ z3n!UO70*C3qz$J`hZf!^a15E51RIEd0K3xfh7+SBj5ab-KFDt11#gkB2s4Vr%o4-J zKt<7j4J051OgQ3B8in%Xi^;_&UHI}xHPWGGIM!nsNTF2{a z_U5uvKRSs@v$LIvlfqs6&;(D68+>ziPWR(l&JIb+ET8u*8-A$dSS1h zskMl28byU`bpIjkbywT#kgsq*sija-D;%oq^%9Yejt%KpHhjO1W%`$=HEz&SdvDUJ zzpL%_cD~}_BCz)-da1+VQU^z4LSNRIE>BvI4t;8&{l~`w@ZG{-wa^7@NVL#@rdJxr zFj{Fr(YZ#^Z&Ai7WCJ0B7)tzHYXIYA{aOCbUJ7s!hh z5pOBLXH_QlFm8fbc-WeRML;=jgp&%Dn289JI9w@2P}t$@6l4M& zjdB`_{syCB(U3^}0M>%$Ws#D>k#tG}fRP5PgUleVTA`ehlGXvbo#Aw)LWD?IGM-4K z(8O7ij54_gK82&>(m)OXr@N8!+Za1@`e@%HYo?3bG&OYYNI1O;ox40mpxu>MZVt2; zPCuQWINe>T49*S?C#UknG%r?qo3(0b!3}qY?ANQcHhsxZo3{SWnD_trk>RRG*BS2C zJ^lV`47Z?1i!(8=)7tcktJ>62*QdO{JPJT1P(0u%YUNNgEME-SuBte+=hIKh)F_@L zu|j2XJfclNaB-Ct3O_(A0W&nKFyK+*1_x|7XIBwCgeLed+#}1&WylmK>rfEUawsdhUx_YsUGf8pVpM3gm0~1q{(L)eitZ5Pps&W_!k`+^ zT`F8Sct*7WPWfeGfCpJY1fdC;Di+5y(Q7%N0E(oTKsAfA)vDVBKq#RN)i*qvdJP)= znN`P_6QioN>y1y1G*qGXA(lCA4skR-fUD=Bk_fBWh;n1Yw zy!|&8c&M!1jm~f`dm_~%rCblVs^R;!fBm?^8O7Q47im8DyR@s)JcaiSm!9gHnDB6A z{NM|U@v2|;tU~JL!eQPR6$&3Z>^^nhs|3A{UMVu1c_nr5g~e=x{8^WB3nEgxqHLn~ zENnuu5P*3gj>@J=3UkS+X4j_h3~`3au*>4cXkHN>fGi`WijKeKWokoG3!|Nd_hbMk zhV^hlodHKw&naG^;}kt52Voj1;05ZHNu|Pp;G?iE&Zh!^5;d3;U6ht#@n>*uxOTG! ziot}i3CS1u4ylrpTfv+NQ-Hn-XpxiJW$O2|j0JLn5yAj1w5u{wk?e@~a9|9V46?yl z3Lzl}mC1nQRGhd^2`hAT;Scb>yei{9F(3Zo_@I0{rVZsm4c1Sk5J*K%ioYD|;mjqy z{lGJRgXE6S!pN|F!9-MB)5rlN3WWk|A-SInz{FhSG@FggWfp>^1;^2SF4>PICoWkA zJ{`$$HpCAp53>#&$3zCi<)o&x08oo&hMZtL4qHmv22TwTq(y*Uv@%HxlG{})AXU^25vIEmzw5(;dV^%8k_ zA3yOwB;^+!@Eclqv&6*UyZq~CNQ~i zOSFAR-}07%@PFs96WA%j=L5dwlJj|hJ#k(==37RusSL63ExJi4`0}Ev&Np2_Z*`5{ zzF!^N0jmkOOeEX>_3s1lx|fa2kW~ONE>_@E_fiDhP|)F_=e; z0b1|_qJad$n0egvF2JLb`*3guQ?<)g+Qk{j!~SzHh8e>IONt8e1BT3iFw9$Ky^)9E zh|m~(Ri&E9s5Q7nhWSt&Kz%`Tu&ETjsMUzq602aoxPU);SY@gQbP3+lPG}L3bL(yp z(maO0aZyTk8>QoyL@k9SwcvnwiU^DW(qv>ap8+YjXnB!HjA$T5aI4IL

    3%Be-CO zh$FZ-D|8pN1QEasVfMUR-VpKu6he3jyODpObwqI^m$_pY9qU4334X)^nCFZrVyP%0 zhQN1%0)=t{o3MzOXeAFYR>XEK!at{>5RSG|$#MaYc~Zmu$aGhaW>*{(bqVoKp2Ouw zMEjU-1ByBD5i!;6__ zWhw|3&YH9RUN1j9TW#!bpB|*v>Ga`gtW(+GJq5%0qeB}#{LV2*{Cy)x{4sf?P|Mxd zX#`N5KKXKGqQxPDyKh$6S&C|LJrO7 zG6h*OA6m>APEI(Bm|MbT(@D!#4dz^l~!n~$Cx>~#*1ib6<2^u zpy>yzq+^i&V1{Af*f`0`r-Qu7Tlo!?o_P=%lVGqS2K<7{rEQIBIoYJtmHi6shH>t} zNQnWHf#8R@-GGZl(=g9;2*@HzJHoNzEBX8?rw7SJC9ca`xdL7!+x5AfgRBr8{2t{M zVj8Ygj_Fk8)MFB1;-Cmz5U>ilF>aPZ{3kAtWtRSX;&UQrZ9!CEE149Ai{PW~Yi4@?JMUW2-s0VW*4ey5FQ0rN>W|PAK zJUKKdBW75<8lEs<^gM`zs#m~WBN)viN2+P&W0VqgWWqM9CQ%618uef;a2~8>CS_*A z3n!Zs93vixE-bJr!>|q7@neQx$dPSE#g&NFR_zn6PIT$7tlL%^r zv+i;_RCEj51n~o30S7Rc(r0HmR20;~Q|g3}rJ-AaO2wE!hXi5fbYbU_ zMIsr8Q($3CBWDUA*FzpbW2hsxB}0+K84;xk?jd8E;0rik!LzUn0nBg45d0O1tA$XRYOab^pX)Yo90i z$w_a48>Q1QpG+1q7l%TICps&I9xF{({a}h|y|FXwNO%EqZ1nV^6}g{DLa0c;$pbt(h~ zI;j{MQKf~`W-LFXVaQk%hZ&7>`kLWf)Zo!H;Vg08(xP08%LFd$2`41kC3>Xzm)0LW zpdf#Sg8cC<3bL1V8)oBT$Jq{gwP1sz%BA(~iNodITFGs5Ps(Dk3)LoaqQ=I|Yt3L*sYAL+&Z_nalft-BnwCrRU| zH$!6t{_}^(D6qNe=N?HM&1ex5eoe;{G7ip;q9{$Y(rF~22BQ*(&cbF6t_J^7rB|rp zlGLi9oN^T12t9x+&2BKaK}2;1Mims#fQhi|mYo_w-Q@^6yfHvn+TTJt>2O?WnZgdo z@QsiTBIc;!;(UwhEa|=EvGg_3%tIxV*?WKyW^0OD13eIp^ai#c*uZ@@uye}>u9?fr zo3mTxMt5_twja{bT%VqB>I;Lwojg124+lIX+QNUS*uc2Jm&#gxdxSvMBh8yVDf>6t z!0N}xkld&qUlH}Dz@_jWcLc4KR|+umAUJYEa^^^?YSztWoD@JdJR zIj>N<&(<=#h0IHDQ$jz;^{Ea4CG*INbu2SJIi_@@*ObgfE#-Au=3=-~+Kz=Cpmh=>Ft#I6-I5K5nkC9tK`A|yE_j}j0^`Owc9S!JdkK|?SMFadoM z9K;_W9mDcyBA_=W5pRHhU_OMi9RaAE`}KF1=JBNL99dC7b*;Xi%-HJ2Tx+S58+>P{o;}#{Nu)>n-+BQ=i^<7kjv<_d)OB<&b9- zaCnQRt9m!Dwy>yQUp-!3uoK0hv4XE1yZ6R5dKYc1>|5pKetpYm@1L&PcJpFsVQXn> zGf0n3uRFY7NHHtmrwf_i`aT`ql&$>o$X3)NZ7W*QhE3^jeebxf3||}9yC2kgSM(hJ zpO<=fjm!@@OuJa;?<{04`(rX+%Zp!K=W?+x*F^AQoxe)UT&(lirOaY0-=b~hS0$&R z?!YvB%HWuS%qOO=lTpHGEpUi4wcOJN<^qBZZcR!S!n25J;K`+>lBkA`0s%1_C`r;e zX%VIq6JW*k7Sb0SX&EF(O@({XI4~zNyHgHWFhrijz@yy7-8f7t6LCO#2YpPXg_)dy zK?#=(8Irt>qcUJiJ`9X5Juj;I0aS%HNf`{vD7h*D0kOgSMD~hNl~acRGzXeompa}f zu^y3yWK>T|Tvvnc0rB>MSni#&NP*?HhAj`%CS!t-^F(sIzbS z>2-HwecX0_X{3khk8VVv0OsOwY7 z_a4_HN9kA?)(H-qKuE}=l~Lx)63%kPv6C^`vB&8nS}SO9l!-hvLb7BIWloc}Ka-S5 zx`=oq!=+>LByucVnS-Npx!)h7K_GT0r!ttIz9sq$iQQ5cfG5Mx(dx(vx-1lC%xAca z+=q{`fBYddb0h~1<2P)hOpz?jg)WC|qShe=4oZEJ!b^3u!0&Plnp{|jG|AhfMu$eg z=i@W+jm%{v5hg727$%VmmZk~x1xh(_gt8`}Lp({2haxcwSVm0&-*B*C4DdfV4mL~% zexXwkPh171N9-bDlyHsqbaHbpN~kLsK_S$oL4{u|XsK@$ExM{-e}kvcuWyo9Zq~1O z;W+5b9OkF9YjdrOiM8X>?4+4{%XVw`u(Y?kUFg@24zC|5)GIkszE->D%VR9+k#=*x zQYfGfugT-!0|0_{8N=7c-JGY@Yez$Z@4eKnYusFUa&wS43$4=8_RcE1vEOOUE+5eR zzaLghr@?+9^MBDcF=}tGC1XOXX%`R7z0e}FTw3VtqvJwBR9Kfr1oT$JeP%Y>Q4mDx{ zWToQucuMqB%!Ls}+?xU@Px*>6f>fSkQA&>Hhs=G3#-@`U7y%wpS+tA^kX%8$Xbyux zVS>9HN!|&!L5{#T@uD>4$$f}a353)bY8buss^GDq16}`+4$LZ$|L2hdRgbg- zeU50y2=uPMPrFm4;dh6xDGe{qLeP7l$dJdiw$REt?m=50R$*~XSj-@xJ%fq?o(evk zu1t78Dc0dJ#T`=9fQVVkl=-Sy-wfOkrlv=^%#*lT-ujM&*mM$UIf6>e}#!wSqM2?5Qy7#UP2$B;-b@=#{Iven`mVv4;aC#ot& zMnKr2Gm0t;9W9*cs&M82(`6tt6C8m=&=gh1qvXtY#$~`Xa2r3rR^tSgOi*JMTQ&66 z`4*a!WS#|8sZcN5aE~$&k)AmyNl}2ZRse3pT9E@zFe>O=D&3|scZqz+5F9?WRb{{2}`RfJb)MfmL76(QKzF0Z6#Cnp=u-a>V6 zp>=q1z7idjcc=H$rDkn7|E;*u_UJ|g!DC~N;IhZ@@B&McEp+(FKB$Fi&XSS z^-4bo(Uc}Bqx*6(I`M<}B8WCBH$+62@&Ua-YRo>2EujmaOr1Ong{zVChcc;CN;X7a zQd9^RDd$m1mb?)TMz9Rdw#*1^+Hsg!CvXS5<`|(wOI$2p1Fk360N6`UpgPP*1jXDa z!jrG^UJ180b8azyDd~|c@KJ)&;F0MIxg_qcLORF&SQuWy*H&ZAId|5dYP+)Z#PBb>-FAEP=i5DI z08gNzDbqOCvGpF*DjX0Me&Szx?!K4A76Bma(8v1lwQArKD@!%-7nDLXZ#GA_l?@+6xMrtg0&Lnt)!K0 z%@2+nbE~DL{i!VL=gWg;yxcvTtrpfBP|y2>n9T==pn2uJV?O8Fwf+P3hT)mY;rq4u z4qsDiUDSe(^;*ZpTJud;Fq6_IWxumh^T*n#EThZPONJqE27>5Cw~0Vpha-`y3yeZN z5D10l&16hu>df4stC8E3ND)t_*wTvwBqmpL7?CD+CtZJZ!k3_(gqO zI?W~FC*@(FE>=lq#czSq33Sx1g@+|{!upa`VVZm-d=oQH<;({1FiVb`u~lhA;Ek9m zc1E5>r%I0PiP3`np(W%tzLtEX)b05?4k?4{1$;pWV4k=fik>hTgj=vXRSdq0w@d#H zK{f!DArY_g#kgb&nv(WW-3M$^V1gVL8z#LFKO_02Ot_;yC11$bkwbFl4^l>GZSX%M z?8xfil1V>AK5FFfH!O&F7S;|)smV%P1Q*ZR)29F-u4=k4Jks~hlCWw_Uk251LlFx0 z0TppcWhtjfzcuwPxFnw7z7eL7Y_Qkr_9r_`-sV~b8NtITO=iy80IO{l^k_pgx_O&amAj2Z=W9%nVMD{b-& zho_T|JE73E7BI}ZO7{gYGpjMWC!o22T(nGDqUnt)2kOB99=AX-1N2%_<_bqbXbPhT z9y^t}80Z)@rVK97=(M%?fGWKSReIwss`QX|$I>?@58gNQwZq$isD1gjqqwFXU6-CT z{!mow+FJnYxF)S$^Kjf&CNz$O3D|1bn=(h1uqfOs6o4Q}7CE91hZP}{1OxgJ zQ#NF_1in)uM&U*75+p6pikDoeBT8^`B)eRpzy zdFNs{4d7AjKHpTxJX=Vl%oHnq>RqqZEnr54nBn_1BpSY^WG-^Y&(|^+ch`OGtuh}y z-2F85U1@^RsrvK-%B5?ce>qtGH$;5pTSoIQG-giUq2I6L)XUTlMbD`gI>@hff^jFT z5stfVsVBmtNjhcgo0W-$LAY0+?(Vz!v>(lv4|^L)qsShMrG7)P)Nd-bTP?MUPm15a zj8r39R!cp6zn1z6Wm_QqFVIV^$1ZqZd?~X;s*h-Y_^pC~sq>HpibI1b=Wq1NCrGH#Fyj_e@cHS!)ADl-vr6xE%6yb zG-wTJ3nfQ|_gcr^{QrvwM4IbH}HL!#l+lv-W?y(7Q%~(v;r);>g6+ zqw6AZU3;mb81`%8qlnAnCjPD~Zs~(p_3k^g-hGS2VS2vk*kFPmNjrN+wT2^4grYK> zEsaGGaWL;pxCT6-A>bqSsYVQ;ogRqJNJuI%+;A)7W)C$qEggY zBX(EZJ=~m}>yf5b_hwu`_a#9TR8YBU42sGiOj-(}F)f01Kon(4Bw@*Ye-Y=<+^cJ{G%i!8-bR=Ur zOk&KaYY-<%;gAm{mX-l^6ka+^3DlAdT1?d>L6qCc)`V&SoB%pGG|VzVgHmml*_?9H zL1f?-1(H?zbnbzRgusnM3ysERAcceur^<^er`%4BaoG^|(weM3Uq*A#2bdv?>95x% zy3Y3qlXJBg{k=Q|$MZgU<>orgLaVVkH(#%|%!}Gd=WM>eyB4Gy_R>sPz)uZbS@XLQ)r!>m(YvV;MWq>c%jmvXIxO#<#!cW{DuFlS@ zR&Vzpp3nOm%XY^)Z%-`ub{3Y%Kn|Sqh2eeF-d@lp|JP7%=SYW5CAUffRlvGT2Or>- zoyWDS&;jfD1+o~&GrEc%@J))E`q?qLrN*E(Mxpp$l&FOk2~OvH8oUUH8idYFuzwgw z_O%8=Ms*~bpvjnxg8K9=E7CxXX(3v1;$5^{kUdlJLa79)q9N`9Z!Q%**iHIVIX=ow zxM&99U3hNJGzwBjk;P8%^8`XrJ%5qb6OYg?K&#{m4A-D4DdPic9LkdEF>>Cs#$U|A zr`;N6Rnk7*O|69CI|k7cC8hDsRzxl_m_jxRk9YYWMV-A0!Y~wt5yc-IeF-OrzJe~& zMf3qeThbC~8_;%ibj#*leOJFDf?EkGP14+)+;jhcF7ny~m#CG}Mle)DsbGV~eumFJ zvdPX?WVH&V$s+}&5~c;F^F(GoCPzL)v=Q#TBxZ#K;=9xY;y~c~jN=~y2uM*i8L6WX zFJP#l3615FC!$71Jb)BWPc?`R0-|*Wa(fVcd_Iz6OYVDm4fTiLfTqJJk#eoL z3pi>LE307j@6;8@%EH5@$Sve&CK=PlRdMVG*TyQi(7QUc-qN~?ZR@Kc#Qtt+u~!#= zvzI+=n`Vvgz2({U%?8P{w7QSw%{TP5e3It>$7OLj#I6Xm&(n&6^{$IUs4bt4PXm6- F-3QAr3*rC( literal 0 HcmV?d00001 diff --git a/tests/python/data/compare_proteins/BGC0001850.pickle b/tests/python/data/compare_proteins/BGC0001850.pickle new file mode 100644 index 0000000000000000000000000000000000000000..123e81af9afee09436a72506859920d8c883b5ba GIT binary patch literal 19983 zcmb`P32-!NdEfWmZ%?jWHr$^%)@+P+N4G?P-PZJ}KGc`G)v~?AXlA6D(Hznox@@X! zT%qE!Do};&5FChIcCgDy>{L-svB4qW1S$|nT;;?B7bKLakcy2fp-d%Ug30gg?vX|+ zedAq-YWGvmOi$0#@AEwW=l}fQXY@07|KeTFZRyYHEBoC_quko5b!ty;mHV|PFCL$r z-XEkdYP@*%{<9~3_Uv6}Z$5qLUavc-H9DnAx4l>HHTvDo*-vtAx!@{5aRAJXo&wi?~D(;HIursM4Lj=26qT(s61l)78ZT4iunzw`92 z7G_+%X#M`vyRB|z*uQvd{YCYMPH$}2%7bAKE8ctdW=^_S%^qJm?a|XahmGpl>CM^s zT7SmdayB*(jFY9-hQHaAcHFh~fgJAI&bG^q7D77M6I6J)$BZvLcpuBT-`l4XatL>@WeraZYd4?5upjzu!dX2q7gVj2H zQ&=1LH@C~}M(gy&SF$_*k{9=v#>yC+U)bn@Hj57H-oWOO||I;hu5?MAiQ zs+GFir7|Aw3>rJU>!qWsI3;07qr5Jv6o~?eL zpW%DXw(74WEJfVl{737rIlVbJY45hys-+cg?xeV}y0q5VP`Z|O-1KIxWPK=~)n7OA z;_?Dstp7;n!1^1mxNWv@R)5p!1F3uVx_xrvtp1};{mnmlR{xt${l}d8Tb%loQ-9K_ zKjqX5PJP;`&p7p2r#@%LZ>rDRpL(LcVBUVMjdQEhmBsDWep}se8}rj`J17+jt%U`3 z>98WLv^U0VoSvR!<3eiV%T!68+G=#FEa}ZWJD<0$zqW#{&#cxLZoQ<|>h4KRT9`_9 z-Qj$>CCkTC?ZV2|Mshe8m|NY=adSmt+;T}5=f#pf8u;!?u{etV7i#dc>lpki@g#$n z3u|?$6?yyfD?MNBM00amW1uN}wY{WtvZ?eg3|?4XnqctKa=N}RDOYNP<5n68+qK>c ziG-tjG4|K*pJZ(5=s;I@ruFIVx!L&0t!#$V?vl4vIa(-G>cLU{!q~;>*$KujFJ{3| zt{k_fie2eu|F}a!4_=75-*7PZMeCEyo$JKyZEtFOv8@&QE3@Y2esOwhX}Y)8crEp#0ys4K?XocS2aSr?&tmz<=Wm&Eg@XmF+CEsW)z-Cz#?HV@ zru^`z)^4o1Yf}wvdCctT!lYH6PtAT}cB)eDZFP^1rz(y9TljyfJp?YxgWAo?ai!Je zw%RW&RGxV|ckq#wNo&8m-Y70Fc5G*MZ+S!WPC7?Mr7LeV6SY}s*4D3Tdc3 z(HT}+wQ?^jL34`LbKC>!H(MtnKa%@&qGqb+nG`&52XKH+17bgVC(o$;e4}suQcVTIA@1&)jpK;{g`XHW*_PTLtz76I! zc6x`Epy3?Pbu2jSIpru!GIn_`jp(Ol-ZDE?ZKTUOHFkV-yjAO*bVW@6V2l3!r$jK% z-g42mcb!33+b;GzZ+j)!?FakSMXh--?^he1zGRfg?_*}dqAyRU_wkBAJ}0@|BjUPO zs}3tKs2)A{2o@hdc1w#T$&K3U2bFkpknA2E`bPyNT9}(!TTutrlD<|Qvv_)blEq7@ z#g7rxuKw2jVWR_3z{Fpuy?+*C@27u!Qm{;efW|o5HVY58WQ3?~+c_uY_hIO8**7jia0*-5YjjUEamBsZBMFduEayOrS z{VhXgs73^rB~>r(Pc@DU!|qaFZ*0vk`L*I z(#8R2dSaq3r1$Vc^$%}f#n!JmW6|@z(;ET;tGMxu2K>>qOCL4*_UM&VPpH4H{!-Dj zHHX#pMzgRovv@L3ciVH7Eo&-v5>u6rOVw5R?DQd_P?RdIVV?$6G{T~>>Yv^{D1$6# zPW@~3|3)KNe@*?q(;Gb3r5`>PpME18JxD(=RTyrhM4{F%(PZAkb!pocSmFRC47tPRbhae|`ocf`8B4^MyS^ah_X ztaU22vyF#uzwI{Ll9eb7B`L8?-;gBPcg25}Bw4<0>58vMb`ZI$5?FGgTbi#JdSc3o zZ$z?bM7rnunkE%BC3J#7O;pP=B}-GoNRND7^P||%G}H2BH*s{!iflomg(@m&Id7ZLPJ z+v2Awys-LtqNn+y^P?AwGe+Ukd`Wix&;NaXKG=NerPp4@BlFJYBjVSeHy;(hUeSEH z`1Rf9WBfYz{Ce{hJRJGlF~dmsT=tDXQe4;cEm@K;d~OM!%Z~85ui>+zjmsrm9!0vs zQ<{a#4JVXRm-_)Om!r_qJzP#0DVm5WH_`RTviT%K)*?SPB-s!?4+39zEk3a*E0*U9 zpDU^#5)`uU2aytazN3Zs+`#9G8!46{tG*Z7@yO?{iqEYm^||8Ua|165pYySrruhDK zKKE0fOZZ&39Y-<(HBjY1&V2q0SA70ne11B?=k3j%YPhvH93CBZw~pJhHMtpCJ9ER? zje0K*s!L~?&mT{HZvXi0xznHhYU0do}dU}*P{poKCFXQIS>CXya`1}_#r=P!; zI{o(4>Gtbx&%FM4>UDeb_RDVnh0N_`d~dsYAi0_m=t0OYR-IpwuQ-7hx)Mni2R@#P zbX_+{7?tQyEXA=oC*%pu02snJkmMk+B-@tBZ^I|6q9BU75SOqNS8*jtQ6=Ax93`jt4lxTu=tN`aLqs(_`)=My~3N*s@*222#oB^RUmPh$Ye`E0lf9Rb)5P z0?YHQ#E=ci4mDy>N--}`C0BBM!{P*4buHUff+V!$P}BWLK$@nKFbdIX$-3ka>6YsH zYN(mAmPOL>l}LIQk#sU4lD4XqLb%XM+;FyC_70lIbz{(6oZX#Oys71rqrE(m9?8(- zr}98bD+Ym)qR02n11UYa4n5wMg3a~>^eAU>^k{}2hZkXV4dQL=tTmd3H?OSry!L^$ z(Krs0gM;o;;!lTD+d(0>`NfxQ*7DF!ZT>&!Hm66|*{uCcmMqsG-aQ%O?PfMVlG*%I z7dDGpcbta)>3eL+aYIcp4asst*|j~w2ekJj)d`3KMKVm!Qf1eZh$Sm6Tnb&HJ1vd+rW!P9hHG1>JBj-`$b~Taa}D8eJKl$ zvn#>z0fOTfCj>_;>TJndCp-GK-Hq+i+(J0LxjnVN+TEBo8f|GdU+6osh5mbaa9HCD z{jKwbPLDE_c`iRALm4X(5A)LU^mF0?k5ZJmBU|YAXA6C2w$SgtywGRaLjQ>!nLxdv z>VY3PBL5wh2-FyFj6Hv2*601=ShH+Wa&<174=wS=_A+C0?G+b2a4`!)FUk{ z1_sqfa%BhK$&O)4hLV6BzD3=ULMibK&GNj^41hv4l5JVFY||viaerPbe&>o4KZ+Co z*#sxLrM0$FTe=yPJQggZ2(PlQMW@P->S- z=BLH7jnF@}H06#LHlMF$mf)9cK7TEZjp;1Ge>SuE$!wjEE^HP^`a4;I|EXdAOsb>9Q0V0r(lYNoYds!YBy@ zXgX13O12jWLO~)HqaZM}$Vos3S5qVvC^1DoN8pF8`L0DP05`A|#fW?83g)LeY(HBv+hb^yIG%K-Sb_i?Re9Nb~|_Gw^C=?_J|ZFkB?>g ze?O1F^ys<>+{nG3WqO+SBe>2;p_ydf{8s8s+ZyA9*QB~C*-me3ur_OMAM|U(g_-y` zj`tR~WplnIOH1|K>ie%)-GAd)4F2Z1)#=f7x!%8-2HtfH1pm%wzRG4Zl_UQT^?>Y*5<+1c2BJ@mdxpXW5LXAp3iKq zhgqVC`tmQEG@A5~-aO|(naV$N$4t%*{9Tb$xj` zk%qy#*OL~i?Ulw+HAlePGX(s0?zwkX#xmr;oJU)FlttSkc}`@Wd#9E0nJn5yucgs; zZ^n`TN9MUZG6ekcWzW4etMo^y(8fY|zG-Tal0?2n!J{DhW?=Y^fpP`K6wniBPL=@H$P4wn;*1~iE zRdPKmwh;3}S%Y$h2Cl`??Kwr`>*pl+XPNt=S1( zL90TA7|){X53fYm7m2R_I3c=>#S`C(OUlCV=wLl4nCs@ez8>gni`HCYkmyMsUB8+} zm+g#5`_YHT3hduK53euX3K+JcF)%v^jWoP&Wa0JeS$O?M8eVpBGHHK)Rts~*ri1;u zJMyf!h1nm!mD!zN%d#lVipyqyEo1G!ow0V?zdbYjRd-xYj6cj0;|+IU2GmoRiEes0 zO0y5pNW_k$=rB=VP)`bfXch&PEf#&(4N<7ms;;@TmJ+^BnS}=W5pY^0>D2O~h&?VA z!U#wS0+M5*P3Clh+LiTC(L8irga61A%Xf8Qg0J~N7b26Z#7c;|qw$caRtDHa4YUa{w;F&Hzwq8!!{ZR)3b3qZ1o0a zSejdJtxnJMD#`lNq8HVt@*KV^s}LiFt^U8qQuyDVr|@UbbK#lWFY%H3|4I`e>tN4c z8|NdRyj0u&zbI_?WH$drp1Rl z*HWAF9pzuXWb=nFY!+PgBN?!Zy>c5B0CX`#2d`<+P7)e)XD1etC~{l&VW+?${M9o- z2ewB7&QJkz-sHniggPgOk>Yts4;HF>1S3Hx0rHfzEXbZ5vs0i-ybv-ZfWQYPMvia8 zmjrD!#770J}_kS8p$)xea1EGTw_ppkJX9f?ARyhJjy5A0y(ymsdbYVCqr^$DnD9MZ1JZr#&5 zrBZjP!7$3|N=MtBu{!QlT;I$i^T`YW{y6vYv!5Burt7wl(}d@rJTjksPC&*aHR883 zFF%!K)0ZxLc_-^gzJg+7BA24B!v9z&R>w6IH=%Oj$#hSL9okrsM*N9n6X~+;Aq`m+ zOSXkkim0K!BVM8+73nfG2&j~V5~Y@PKqwYe*_72-R3;CVfN2!P2-zY+Q_y@l7H*9F zB#>094CkU}`I4^tGL?#fgealL5i(RY!buGr*-%0PDfOMqzO6VHjx7@xez7Qau0zR$ z%jTPN@4n){_u#*uo8Z5>%E4fx199z<(>w0%T`RhFRFH#TM|2(H)J7W~=Iuwuh^^LiBYh|?(s>Ow7dw*rv z-Li{XtAE&AIvlBBM;Uee=@H|t(#XoY zGk|Fe<|s5!I9r4_J=b;7Gc9@^_#Lt!)P?71O7t*Bs6ih>`%whdfGDCqLc!>%5(FRA zCXKlQW{coRqUrGHVLf1nb5WG&fapLFr{HajEWlf2B_vovL2V3$VAO1l@g+y4|M3H8 znGe58n0^V+R|!goS)`CoSxVxNK29fH{7^;3V!DY*Du$Odz$vmF*PwYy{Dk&g6I+Xr zFd&b3HuIBT_DV@HMHwul;fjA>Y#KLxVY+cs}A3zA6M#12-#@Mxy^L@T z2*K6W#Gyn`TKHr$Qi|;NAbu#+3b2G-{Rnx&*Ib>QPu zdson402=I1K!c-WPwDRN`HM-fw5m0g&-a2<%1zMfUvYgYSJS^v9`n|(7gdwI-k!IJhe5c|AfNe&T>d`&tb*3b!}`H*B$ z5!Eb4MJ7u?A)x_D9w|WFqM-<2LhutIjW{E-_mS=e5zL~G6N6K{100MHd#IS4&QuCQ zO&B2Fsp#We))4J6+bsYg%@>n-kS&2CG~T{oK%$_F*$D~k3mLf(DIZhQ#lR%_Cz6qt zl@^r*77UM*HHN!@UsNH6MP(@wGcuHFgn6<|boG(t##|6Mu!SlpZVMpLFoQuHhCfli zcu^6s&148j4NB3R!?twO7o9l7ihE*bz+)NE*V*~;iU^MilZfCIjj*J|f>J0&rZ;$k z@(#9oLSq8gA#tz+#i6wZ5kXJDimPy4p+9gk4LYE!&{>-po$gr!g>0TN@`X}`E8rcn z=usmrNr^3eFbW!PW~nobC|M z5BC@IT)Qu0e4ol0U*+*J#`neZR7;Q2n7x7!Ti?j~fGlRel$YT&)p8B`gIT8K{nIBe z5=}&GF^kxr7iBjs3{)V4%0QuF?32+w-zuVMPyqaZ8VsJJe;^(}sKwS{>gyE5Yk zyQIP}?}x6zECPX~QJx%Cm33P}jf8QcFr|tpv5h)IsYV|WH_reojSb_is2qx;n9Me^ zSIXyb&#poXA!-cdK#ZbthcpdLNRov@hJ~dvh9n9qTp0DmRYf@@g9FS&))z5Jiy|a0 zA6b}>CYg#TsN|MtlvH1fVZNlFj}E0!o;4?eM2MsnJje$k9E2EAeRW7O#XiuvCI@i> zvvKHR#FiMzj6W0`uEbWYW~6z7i0T zUfnra&C@L(VtQMiZVPF)<%8K|TRxbb9%Ys7u{@=+f;?}HrQ0)?!_xHfK|c z7sJp&ZqhCLpl4p4yCPd?KUaw|qeri$M&F&)V&<$<3UPN-#>0J~5M zh&DA%R4|%ujN(YqV>5___Cg=QKQWW5*Z~nnZ`Tn=xTxgu4G;Qe#<2vhLSIDxB!}!Q z`Kn?jOhH4!MLj^|7DPpijKn%c9c(qhIR%Y{1_?~FbT$R082K2wF7`Kta-Y@GXRjpp z7Rgx(7)o}v&pU$&8 zJ-UtnR$rbK-!$m5fE|q}<`ek;=rV8pCs%^**(~TD5apEikdD#uk)WV=mMX>!#FrtM zfU_7Spc#TH0k2~BmD-sS5ZWt3iv?xxju|&fiij&Sps`|T7AF?@Dg##)P8P%xuxBhn zyqUs65#b5Z#GEe2WT4GQS1D-&?N`#z9TTFcsQ5?r?bck;vu@@tThAC(CiENNE zWvoJ?$rd_T>}K`w1}cETD}rc<<&KJCDus5N_ee3)5(JkOhL^aJ7t_|u>_~--u$Ts4 zumMNJE)rFba+t-ubtUFqV&0w*^KNykxTr5EM|=@ubE@S;hk;Qj7B%y5&sS##qey^G=? z_@k8u{OKogMMz*dswoF1Je>b%JuF5o*^dsH4PY~uFP|kux>zU_JtYRTia@9rLzEB_ z*x!~S7?0wh0+0tl2YXv=6-qHPo%eRyZ`37kJ6*-IKKNUDHdgW zQRn=BoZ}y5KKf8r3LhRrrE6Hnw5F{1UO8Sa*8^{+VnOw`_mAe5{G{ADnAw(ct3RGu z&7fxX9X36u5orZ~KH5bFpx0TQPno?tv-;zi)$f>Wb#1}5j~%DE60OY6SK?iHDmiNO zm&2px`b>4t@p7wQm8HX8ZnT;QdusHz&zJhvL+ay3XUC%okn=k^Gy2u3(dM(4)8S(o z{_BEtz|jFCu%BQiUa9SZPSYnafJvANr4me~`vu&AEW(`rISQHdffc~)g_b3f0?NS* zn>YxgxaiQ9GsXjB23*(yhq6=21?>PE>2+wnc#}n$0ic+y1J3v!lb|8g67&Pn(lZea z6A@@F`XOMNxgLJQ0+gf(>a!b6p<&*K{ck`GO(3mQ>_VgTbA2s#xEHR#V~LFA`=KdS zp#Kq40FZ4VUb24!$5AN{I;fb|H&|D|CJygHw7@1&jl^&t)?f+M2{Hz2;fin}(HkOM zrrchtrmX;jlUY!KRdm*C{C&M_&%`sLuvQ* zhg