diff --git a/Take an input BGC.md b/Take an input BGC.md index a57c1ab4..ad7781bc 100644 --- a/Take an input BGC.md +++ b/Take an input BGC.md @@ -11,7 +11,7 @@ - max_outdegree (int): HMM model annotations with an outdegree higher than this will be dropped - scatter (bool, optional): Choose a random subset of proteins to search that are spread across the length of the input BGC. Defaults to False. - bypass (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None. - - bypass_eid (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. + - protein_id_bypass_list (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. 7. Search the database for all proteins that have the same HMM model annotations as the input BGC proteins - Output from database is a data frame with columns: ['assembly_uid', 'nucleotide_uid', 'target', 'n_start', 'n_end', 'query'] 8. The initial hits output is filtered based on the following criteria: diff --git a/environment.yml b/environment.yml index cd2fdaea..b2c42bbc 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,7 @@ channels: - defaults dependencies: - - conda-forge::python==3.10 + - conda-forge::python==3.12 - conda-forge::pip>=23.1.2 - conda-forge::biopython>=1.79 - conda-forge::numpy diff --git a/new_search.py b/new_search.py deleted file mode 100644 index 82ed7502..00000000 --- a/new_search.py +++ /dev/null @@ -1,88 +0,0 @@ -# The code is performing a series of operations using the SocialGene package and the SearchDomains -# class from the socialgene.search.hmmer module. Here is a breakdown of what the code is doing: - -from socialgene.base.socialgene import SocialGene -from socialgene.clustermap.serialize import SerializeToClustermap -from socialgene.search.hmmer import SearchDomains - -input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001646.gbk" -# input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001848.gbk" -hmm_dir = None -# hmm_dir = "/home/chase/Downloads/meh/socialgene_per_run/hmm_cache" -hmm_dir = "/home/chase/Downloads/para/socialgene_per_run/hmm_cache" - -a = SocialGene() -a.parse(input_gbk_path) -len(a.proteins) - -search_object = SearchDomains( - gbk_path=input_gbk_path, - hmm_dir=hmm_dir, - use_neo4j_precalc=True, - assemblies_must_have_x_matches=0.4, - nucleotide_sequences_must_have_x_matches=0.4, - gene_clusters_must_have_x_matches=0.2, - break_bgc_on_gap_of=10000, - target_bgc_padding=15000, -) -self = search_object - -search_object.outdegree_table - - -search_object.prioritize_input_proteins( - max_domains_per_protein=None, - max_outdegree=None, - max_query_proteins=None, - scatter=False, - # loci=["MicB006_2899"] - # bypass_eid=["AXA20096.1"], -) - -search_object.outdegree_table - -# TODO frac is redundant with the outdegree table -search_object.search(only_culture_collection=False, frac=0.75) - - -search_object.filter() -search_object.label_clusters() - - -search_object._bgc_regions_to_sg_object(self._primary_bgc_regions()) - -_ = self.sg_object.annotate_proteins_with_neo4j( - protein_uids=None, annotate_all=True, progress=False -) - - -df = self._primary_bgc_regions() - - -for i, row in df.iterrows(): - self.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid( - row["nucleotide_uid"] - ).add_bgcs_by_start_end( - start=row["n_start"], - end=row["n_end"], - uid=row["cluster"], - ) - - -temp = self.input_assembly.loci[self.input_bgc_id] - -temp.add_bgcs_by_feature(features=temp.features) - - -self._create_links() -self._choose_group() - - -z = SerializeToClustermap( - sg_object=self.sg_object, - bgc_order=list(self.sg_object.assemblies.keys()), - link_df=self.link_df, - group_df=self.group_df, -) - -z.write("/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json") diff --git a/new_search2.py b/new_search2.py deleted file mode 100644 index da1a1ee9..00000000 --- a/new_search2.py +++ /dev/null @@ -1,88 +0,0 @@ -import time - -from socialgene.clustermap.serialize import SerializeToClustermap -from socialgene.search.hmmer import SearchDomains - -start_time = time.time() -# input_gbk_path = "/home/chase/Downloads/para/a/GCF_002362315.1_ASM236231v1_genomic.gbff.gz" -input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001646.gbk" -hmm_dir = None -# hmm_dir = "/home/chase/Downloads/meh/socialgene_per_run/hmm_cache" -hmm_dir = "/home/chase/Downloads/para/socialgene_per_run/hmm_cache" - - -search_object = SearchDomains( - gbk_path=input_gbk_path, - hmm_dir=hmm_dir, - use_neo4j_precalc=True, - assemblies_must_have_x_matches=0.6, - nucleotide_sequences_must_have_x_matches=0.6, - gene_clusters_must_have_x_matches=0.6, - break_bgc_on_gap_of=10000, - target_bgc_padding=15000, -) - - -# search_object.outdegree_table - - -search_object.prioritize_input_proteins( - max_domains_per_protein=2, - max_outdegree=100000, - max_query_proteins=5, - scatter=False, - # bypass_locus=["MicB006_2899"] - # bypass_pid=["AXA20096.1"], -) - -# search_object.outdegree_table - -# TODO frac is redundant with the outdegree table -search_object.search(only_culture_collection=False, frac=0.75) - - -search_object.filter() -search_object.label_clusters() - - -search_object._bgc_regions_to_sg_object(search_object._primary_bgc_regions()) - -_ = search_object.sg_object.annotate_proteins_with_neo4j( - protein_uids=None, annotate_all=True, progress=False -) - - -df = search_object._primary_bgc_regions() - - -for i, row in df.iterrows(): - search_object.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid( - row["nucleotide_uid"] - ).add_bgcs_by_start_end( - start=row["n_start"], - end=row["n_end"], - uid=row["cluster"], - ) - - -temp = search_object.input_assembly.loci[search_object.input_bgc_id] -temp.add_bgcs_by_feature(features=temp.features) - -search_object._create_links() -search_object._choose_group() - - -z = SerializeToClustermap( - sg_object=search_object.sg_object, - bgc_order=list(search_object.sg_object.assemblies.keys()), - link_df=search_object.link_df, - group_df=search_object.group_df, -) - -z.write("/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json") -print("--- %s seconds ---" % (time.time() - start_time)) - -# search_object.rich_table(search_object.user_friendly_hit_df()) - - -# self = search_object diff --git a/pyproject.toml b/pyproject.toml index aa4f907f..280bf8b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ sg_get_goterms = "socialgene.utils.goterms:main" # search sg_mm_create = "socialgene.mmseqs.create_database:main" sg_mm_search = "socialgene.mmseqs.search:main" +sg_search_gc = "socialgene.cli.search.gene_cluster:main" # Modify database sgdb_import_classyfire = "socialgene.dbmodifiers.classyfire.import:main" diff --git a/socialgene/base/compare_protein.py b/socialgene/base/compare_protein.py index 3b981974..a9fc9f85 100644 --- a/socialgene/base/compare_protein.py +++ b/socialgene/base/compare_protein.py @@ -3,7 +3,7 @@ import pandas as pd -from socialgene.compare_proteins.hmm.scoring import mod_score +from socialgene.compare_proteins.hmm_scoring import mod_score from socialgene.neo4j.neo4j import Neo4jQuery from socialgene.utils.logging import log diff --git a/socialgene/base/molbio.py b/socialgene/base/molbio.py index 089d8db8..2fa97509 100644 --- a/socialgene/base/molbio.py +++ b/socialgene/base/molbio.py @@ -1,6 +1,11 @@ from collections import OrderedDict from uuid import uuid4 +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqFeature import FeatureLocation, SeqFeature +from Bio.SeqRecord import SeqRecord + import socialgene.hashing.hashing as hasher from socialgene.config import env_vars from socialgene.neo4j.neo4j import GraphDriver @@ -406,6 +411,14 @@ def __init__( self.external_id = external_id self.domains = domains if domains is not None else set() + def __eq__(self, other): # pragma: no cover + if not isinstance(other, type(self)): + return NotImplemented + return self.uid == other.uid + + def __hash__(self): + return hash(self.uid) + def all_attributes(self): return { s: getattr(self, s) for s in sorted(self.__slots__) if hasattr(self, s) @@ -438,6 +451,21 @@ def add_domains_from_neo4j(self): except Exception: log.debug(f"Error trying to retrieve domains for {self.uid}") + def add_sequences_from_neo4j(self): + try: + with GraphDriver() as db: + for i in db.run( + """ + MATCH (p1:protein {uid: $uid}) + RETURN p1.sequence as sequence + """, + uid=str(self.uid), + ): + if x := i.value(): + self.sequence = x + except Exception: + log.debug(f"Error trying to retrieve sequences for {self.uid}") + @property def domain_list_sorted_by_mean_envelope_position(self): # (ie can't sort a set()) @@ -488,7 +516,7 @@ class Feature(Location): """Container class for describing a feature on a locus""" __slots__ = [ - "parent_object", + "parent", "uid", "external_id", "type", @@ -506,11 +534,12 @@ class Feature(Location): "frameshifted", "too_short_partial_abutting_assembly_gap", "incomplete", + "protein", ] def __init__( self, - parent_object=None, + parent=None, uid: str = None, external_id: str = None, type: str = None, @@ -570,7 +599,7 @@ def __init__( incomplete: A boolean flag indicating whether the feature is incomplete. """ super().__init__(**kwargs) - self.parent_object = parent_object + self.parent = parent self.uid = uid self.external_id = external_id self.type = type @@ -590,6 +619,21 @@ def __init__( too_short_partial_abutting_assembly_gap ) self.incomplete = incomplete + if self.feature_is_protein(): + sg = None + current_object = self + for i in range(1, 100): + if ( + f"{current_object.__class__.__module__}.{current_object.__class__.__name__}" + == "socialgene.base.socialgene.SocialGene" + ): + sg = current_object + break + else: + if hasattr(current_object, "parent"): + current_object = current_object.parent + if sg: + self.protein = Protein(uid=self.uid) def all_attributes(self): return {s: getattr(self, s) for s in sorted(self.__slots__) if hasattr(self, s)} @@ -630,13 +674,13 @@ def __lt__(self, other): class FeatureCollection: __slots__ = [ - "parent_object", + "parent", "features", ] def add_feature(self, **kwargs): """Add a feature to a locus""" - self.features.add(Feature(parent_object=self, **kwargs)) + self.features.add(Feature(parent=self, **kwargs)) def all_attributes(self): return {s: getattr(self, s) for s in sorted(self.__slots__) if hasattr(self, s)} @@ -659,12 +703,46 @@ def get_feature_by_uid(self, uid): if v.uid == uid: return v + @property + def proteins(self): + sg = None + current_object = self + for i in range(1, 100): + if ( + str(type(current_object)) + == "" + ): + sg = current_object + break + else: + current_object = current_object.parent + return { + i: sg.proteins.get(i, None) + for i in {i.uid for i in self.features} + if i in sg.proteins + } + + @property + def protein_iter(self): + for i in self.proteins.values(): + yield i + + @property + def fasta_string_defline_uid(self): + for v in self.proteins.values(): + yield f">{v.uid}\n{v.sequence}\n" + + @property + def fasta_string_defline_external_id(self): + for v in self.proteins.values(): + yield f">{v.external_id}\n{v.sequence}\n" + class Locus(FeatureCollection): """Container holding a set() of genomic features""" __slots__ = [ - "parent_object", + "parent", "features", "metadata", "external_id", @@ -672,9 +750,9 @@ class Locus(FeatureCollection): "gene_clusters", ] - def __init__(self, parent_object=None, external_id=None): + def __init__(self, parent=None, external_id=None): super().__init__() - self.parent_object = parent_object + self.parent = parent self.features = set() self.metadata = LocusAssemblyMetadata() self.external_id = external_id @@ -682,25 +760,63 @@ def __init__(self, parent_object=None, external_id=None): self.gene_clusters = list() def calc_uid(self): - return hasher.hasher(f"{self.parent_object.uid}___{self.external_id}") + return hasher.hasher(f"{self.parent.uid}___{self.external_id}") def add_bgcs_by_feature(self, features, **kwargs): if not all([isinstance(i, Feature) for i in features]): raise ValueError( f"All features must be of type Feature, not {[type(i) for i in features if not isinstance(i, Feature)]}" ) - self.gene_clusters.append(GeneCluster(features, parent_object=self, **kwargs)) + self.gene_clusters.append(GeneCluster(features, parent=self, **kwargs)) def add_bgcs_by_start_end(self, start, end, **kwargs): features = {i for i in self.features if i.start >= start and i.end <= end} if features: self.add_bgcs_by_feature(features=features, **kwargs) + def write_genbank(self, outpath, start=None, end=None): + record = SeqRecord( + Seq(""), + id=self.external_id, + name=self.external_id, + description="A GenBank file generated by SocialGene.", + dbxrefs=[f"Assembly:{self.parent.uid}"], + ) + # Add annotation + for feature in self.features_sorted_by_midpoint: + if start: + if int(feature.start) < int(start): + continue + if end: + if int(feature.end) > int(end): + continue + biofeat = SeqFeature( + FeatureLocation( + start=feature.start, + end=feature.end, + strand=feature.strand, + ), + type=feature.type, + qualifiers={ + k: v + for k, v in feature.all_attributes().items() + if v and k != "parent" + } + | {"translation": self.parent.parent.proteins[feature.uid].sequence}, + ) + record.features.append(biofeat) + record.annotations["molecule_type"] = "DNA" + SeqIO.write( + record, + outpath, + "genbank", + ) + class GeneCluster(FeatureCollection): - def __init__(self, features, parent_object=None, uid=None, tool=None, **kwargs): + def __init__(self, features, parent=None, uid=None, tool=None, **kwargs): super().__init__() - self.parent_object = parent_object + self.parent = parent self.features = features self.uid = uid # self.tool; e.g.antismash, gecco, etc @@ -708,6 +824,38 @@ def __init__(self, features, parent_object=None, uid=None, tool=None, **kwargs): # flexible attributes self.__dict__.update(kwargs) + def write_genbank(self, outpath, sg): + record = SeqRecord( + Seq(""), + id=self.parent.external_id, + name=self.parent.external_id, + description="A GenBank file generated by SocialGene.", + dbxrefs=[f"Assembly:{self.parent.uid}"], + ) + # Add annotation + for feature in self.features: + biofeat = SeqFeature( + FeatureLocation( + start=feature.start, + end=feature.end, + strand=feature.strand, + ), + type=feature.type, + qualifiers={ + k: v + for k, v in feature.all_attributes().items() + if v and k != "parent" + } + | {"translation": sg.proteins[feature.uid].sequence}, + ) + record.features.append(biofeat) + record.annotations["molecule_type"] = "DNA" + SeqIO.write( + record, + outpath, + "genbank", + ) + class Taxonomy: "Class is a reserved word so just underscore all ranks to be consistent" @@ -747,10 +895,19 @@ def __init__( class Assembly: """Container class holding a dictionary of loci (ie genes/proteins)""" - __slots__ = ["loci", "taxid", "metadata", "uid", "taxonomy", "name"] + __slots__ = [ + "parent", + "loci", + "taxid", + "metadata", + "uid", + "taxonomy", + "name", + ] - def __init__(self, uid): + def __init__(self, uid, parent=None): super().__init__() + self.parent = parent self.uid = uid self.loci = {} self.taxid = None @@ -774,7 +931,7 @@ def add_locus(self, external_id: str = None): if external_id is None: external_id = str(uuid4()) if external_id not in self.loci: - self.loci[external_id] = Locus(parent_object=self, external_id=external_id) + self.loci[external_id] = Locus(parent=self, external_id=external_id) else: log.debug(f"{external_id} already present") @@ -814,6 +971,22 @@ def get_locus_by_uid(self, uid): if v.uid == uid: return v + @property + def gene_clusters(self): + for locus in self.loci.values(): + for gene_cluster in locus.gene_clusters: + yield gene_cluster + + @property + def fasta_string_defline_uid(self): + for v in self.loci.values(): + yield v.fasta_string_defline_uid + + @property + def fasta_string_defline_external_id(self): + for v in self.loci.values(): + yield v.fasta_string_defline_external_id + class Molbio: """Class for inheriting by SocialGene()""" @@ -853,7 +1026,7 @@ def add_protein( if return_uid: return temp_protein.uid - def add_assembly(self, uid: str = None): + def add_assembly(self, uid: str = None, parent=None): """Add an assembly to a SocialGene object Args: @@ -862,6 +1035,6 @@ def add_assembly(self, uid: str = None): if uid is None: uid = str(uuid4()) if uid not in self.assemblies: - self.assemblies[uid] = Assembly(uid) + self.assemblies[uid] = Assembly(uid=uid, parent=parent) else: log.debug(f"{uid} already present") diff --git a/socialgene/base/socialgene.py b/socialgene/base/socialgene.py index 45274728..90ae4779 100644 --- a/socialgene/base/socialgene.py +++ b/socialgene/base/socialgene.py @@ -16,7 +16,6 @@ from socialgene.base.molbio import Molbio from socialgene.clustermap.serialize import SerializeToClustermap from socialgene.hmm.hmmer import HMMER -from socialgene.mmseqs.search import search as mmseqs_search from socialgene.neo4j.neo4j import GraphDriver, Neo4jQuery from socialgene.neo4j.search.basic import search_protein_hash from socialgene.parsers.hmmer_parser import HmmerParser @@ -57,6 +56,11 @@ def get_all_gene_clusters(self): for k in j.gene_clusters: yield k + @property + def protein_iter(self): + for i in self.proteins.values(): + yield i + ######################################## # Filter ######################################## @@ -148,27 +152,6 @@ def annotate_proteins_with_neo4j( self.get_protein_domains_from_db(protein_id_list=protein_id_list) return search_result - def search_with_mmseqs(self, target_database, argstring): - with tempfile.NamedTemporaryFile() as temp_path: - self.write_fasta(temp_path.name) - self.mmseqs_results = mmseqs_search( - fasta_path=temp_path.name, - target_database=target_database, - argstring=argstring, - ) - - def compare_to_another_sg_object(sg1, sg2, argstring=""): - with tempfile.TemporaryDirectory() as tmpdirname: - sg1fa_path = Path(tmpdirname, "sg1.faa") - sg2fa_path = Path(tmpdirname, "sg2.faa") - sg1.write_fasta(sg1fa_path) - sg2.write_fasta(sg2fa_path) - return mmseqs_search( - fasta_path=str(sg1fa_path), - target_database=str(sg2fa_path), - argstring=argstring, - ) - def annotate( self, use_neo4j_precalc: bool = False, neo4j_chunk_size: int = 1000, **kwargs ): @@ -217,7 +200,7 @@ def annotate_proteins_with_hmmscan( cpus = 1 # create a list of proteins as FASTA temp1 = [ - self.single_protein_to_fasta(i) + self.proteins[i].fasta_string_defline_uid for i in protein_id_list if self.proteins[i].sequence ] @@ -231,18 +214,35 @@ def annotate_proteins_with_hmmscan( input="\n".join(temp1).encode(), domtblout_path=temp_path.name, overwrite=True, + cpus=cpus, **kwargs, ) # parse the resulting domtblout file, saving results to the class proteins/domains self.parse_hmmout(temp_path.name, hmmsearch_or_hmmscan="hmmscan") + def add_sequences_from_neo4j(self): + try: + with GraphDriver() as db: + for i in db.run( + """ + MATCH (p1:protein) + WHERE p1.uid in $uid + RETURN p1.uid as uid, p1.sequence as sequence + """, + uid=[i.uid for i in self.protein_iter], + ): + if i["uid"] in self.proteins: + self.proteins[i["uid"]].sequence = i["sequence"] + except Exception: + log.debug(f"Error trying to retrieve domains for {self.uid}") + def hydrate_from_proteins(self): """Given a SocialGene object with proteins, retrieve from a running Neo4j database all locus and asssembly info for those proteins""" for result in Neo4jQuery.query_neo4j( cypher_name="retrieve_protein_locations", param=list(self.proteins.keys()), ): - self.add_assembly(result["assembly"]) + self.add_assembly(uid=result["assembly"], parent=self) for locus in result["loci"]: _ = self.assemblies[result["assembly"]].add_locus( external_id=locus["locus"] @@ -275,7 +275,7 @@ def fill_given_locus_range(self, locus_uid, start, end): raise ValueError("No assembly found in database") else: assembly_uid = assembly_uid.value() - self.add_assembly(assembly_uid) + self.add_assembly(uid=assembly_uid, parent=self) with GraphDriver() as db: res = db.run( """ @@ -401,49 +401,42 @@ def filter_proteins(self, hash_list: List): """ return ((k, v) for k, v in self.proteins.items() if k in hash_list) + @property + def fasta_string_defline_uid(self): + for v in self.proteins.values(): + yield f">{v.uid}\n{v.sequence}\n" + + @property + def fasta_string_defline_external_id(self): + for v in self.proteins.values(): + yield f">{v.external_id}\n{v.sequence}\n" + def write_fasta( self, outpath, - hash_list: List = None, external_id: bool = False, **kwargs, ): - """Write proteins to a FASTA file + """Write all proteins to a FASTA file Args: outpath (str): path of file that FASTA entries will be appended to - hash_list (List, optional): hash id of the protein(s) to export. Defaults to None. external_id (bool, optional): Write protein identifiers as the hash (True) or the original identifier (False). Defaults to False. **kwargs: see print(open_write.__doc__) """ with open_write(filepath=outpath, **kwargs) as handle: counter = 0 - if hash_list: - temp_iter = self.filter_proteins(hash_list) + if external_id: + fasta_gen = self.fasta_string_defline_external_id else: - temp_iter = self.proteins.items() - for k, v in temp_iter: - if v.sequence is not None: - counter += 1 - if external_id: - handle.writelines(v.fasta_string_defline_external_id) - else: - handle.writelines(v.fasta_string_defline_uid) + fasta_gen = self.fasta_string_defline_uid + for i in fasta_gen: + counter += 1 + handle.writelines(i) log.info(f"Wrote {str(counter)} proteins to {outpath}") - def single_protein_to_fasta(self, uid): - """Create FASTA strings - - Args: - uid (str): hash id of protein to export - - Returns: - str: fasta string - """ - return f">{self.proteins[uid].uid}\n{self.proteins[uid].sequence}" - def write_n_fasta(self, outdir, n_splits=1, **kwargs): """ The function `write_n_fasta` exports protein sequences split into multiple fasta files. @@ -457,6 +450,7 @@ def write_n_fasta(self, outdir, n_splits=1, **kwargs): than. Defaults to 1 """ + # this can be done with itertools.batched in python 3.12 def split(a, n): # https://stackoverflow.com/a/2135920 k, m = divmod(len(a), n) @@ -464,16 +458,17 @@ def split(a, n): a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n) ) - protein_list = split(list(self.proteins.keys()), n_splits) + protein_list = split( + [value for key, value in sorted(self.proteins.items(), reverse=False)], + n_splits, + ) counter = 1 for protein_group in protein_list: with open_write( Path(outdir, f"fasta_split_{counter}.faa"), **kwargs ) as handle: - for k, v in { - key: self.proteins.get(key) for key in sorted(protein_group) - }.items(): - handle.writelines(f">{k}\n{v.sequence}\n") + for i in protein_group: + handle.writelines(f">{i.uid}\n{i.sequence}\n") counter += 1 def _merge_proteins(self, sg_object): @@ -534,7 +529,7 @@ def write_clustermap_json( ): raise NotImplementedError("write_clustermap_json needs to be updated") # TODO: Add link_df to clustermap - cmap = SerializeToClustermap(bgc_order=gene_cluster_order, sg_object=self) + cmap = SerializeToClustermap(sorted_bgcs=gene_cluster_order, sg_object=self) cmap.write(outpath=outpath) def write_genbank(self, outpath): @@ -544,8 +539,8 @@ def write_genbank(self, outpath): Seq(""), id=locus.external_id, name=locus.external_id, - description="An example GenBank file generated by BioPython", - dbxrefs=[f"Assembly:{locus.parent_object.uid}"], + description="A GenBank file generated by SocialGene.", + dbxrefs=[f"Assembly:{locus.parent.uid}"], ) # Add annotation for feature in locus.features: @@ -559,7 +554,7 @@ def write_genbank(self, outpath): qualifiers={ k: v for k, v in feature.all_attributes().items() - if v and k != "parent_object" + if v and k != "parent" } | {"translation": self.proteins[feature.uid].sequence}, ) @@ -646,7 +641,7 @@ def table_locus_to_protein(self, **kwargs): for feature in temp_list: if feature.feature_is_protein(): yield ( - feature.parent_object.uid, + feature.parent.uid, feature.uid, feature.external_id, feature.locus_tag, diff --git a/socialgene/cli/search/gene_cluster.py b/socialgene/cli/search/gene_cluster.py new file mode 100644 index 00000000..3b74b08a --- /dev/null +++ b/socialgene/cli/search/gene_cluster.py @@ -0,0 +1,255 @@ +import argparse +from pathlib import Path +from typing import List + +from socialgene.clustermap.serialize import SerializeToClustermap +from socialgene.config import env_vars +from socialgene.search.hmmer import SearchDomains +from socialgene.utils.logging import log + +env_vars["NEO4J_URI"] = "bolt://localhost:7687" + +parser = argparse.ArgumentParser( + description="Search a SocialGene database for input gene clusters similar to an input gene cluster" +) + + +parser.add_argument( + "--gbk_path", + metavar="input_filepath", + type=Path, + help="Path to the query gene cluster GenBank-format file", + required=True, +) +parser.add_argument( + "--hmm_dir", + metavar="hmm_directory", + type=str, + help="Path to the directory containing HMM files created by SocialGene's Nextflow workflow", + required=True, +) +parser.add_argument( + "--assemblies_must_have_x_matches", + metavar="fraction", + type=float, + default=0.6, + help="Minimum query proteins an assembly must have (<1 == fraction of query proteins)", +) +parser.add_argument( + "--nucleotide_sequences_must_have_x_matches", + metavar="fraction", + type=float, + default=0.6, + help="Minimum query proteins a nucleotide sequence must have (<1 == fraction of query proteins)", +) +parser.add_argument( + "--gene_clusters_must_have_x_matches", + metavar="fraction", + type=float, + default=0.6, + help="Minimum query proteins a target gene cluster sequence must have (<1 == fraction of query proteins)", +) +parser.add_argument( + "--break_bgc_on_gap_of", + metavar="value", + type=int, + default=10000, + help="Split gene clusters if gap between matched proteins is greater than this value", +) +parser.add_argument( + "--target_bgc_padding", + metavar="value", + type=int, + default=2000, + help="Pull proteins x-nucleotides on either side of target gene cluster", +) +parser.add_argument( + "--max_domains_per_protein", + metavar="value", + type=int, + default=3, + help="Maximum number of domains per protein used in search; 0 for all", +) +parser.add_argument( + "--max_outdegree", + metavar="value", + type=int, + help="Maximum outdegree a query domain can have; 0 for all", +) +parser.add_argument( + "--max_query_proteins", + metavar="value", + type=int, + default=10, + help="Maximum number of query proteins to use in search; 0 for all", +) +parser.add_argument( + "--scatter", + metavar="value", + help="Pull proteins from across the width of the input gene cluster", + action=argparse.BooleanOptionalAction, + default=False, +) +parser.add_argument( + "--locus_tag_bypass_list", + metavar="value", + nargs="+", + help="List of locus tags to bypass search filter", +) +parser.add_argument( + "--protein_id_bypass_list", + metavar="value", + nargs="+", + help="List of protein IDs to bypass search filter", +) +parser.add_argument( + "--only_culture_collection", + metavar="value", + help="Only search genomes from culture collections", + action=argparse.BooleanOptionalAction, + default=False, +) +parser.add_argument( + "--frac", + metavar="fraction", + type=float, + default=0.75, + help="Fraction of domains that equals a match in the initial database scan", +) +parser.add_argument( + "--run_async", + metavar="value", + help="Run the initial database scan using asynchronously", + action=argparse.BooleanOptionalAction, + default=True, +) +parser.add_argument( + "--analyze_with", + metavar="value", + type=str, + default="hmmer", + help="Tool to use for reciprocal best hit comparison; one of 'hmmer', 'blastp', 'mmseqs2'", +) +args = parser.parse_args() + + +def search_bgc( + gbk_path: str, + hmm_dir: str = None, + use_neo4j_precalc: bool = True, + assemblies_must_have_x_matches: float = 0.9, + nucleotide_sequences_must_have_x_matches: float = 0.9, + gene_clusters_must_have_x_matches: float = 0.9, + break_bgc_on_gap_of: int = 10000, + target_bgc_padding: int = 20000, + max_domains_per_protein: float = None, + max_outdegree: int = None, + max_query_proteins: int = 5, + scatter: bool = False, + locus_tag_bypass_list: List[str] = None, + protein_id_bypass_list: List[str] = None, + only_culture_collection: bool = False, + frac: float = 0.75, + run_async: bool = False, + analyze_with: str = "hmmer", +): + log.info(f"Running search with args: {locals()}") + search_object = SearchDomains( + gbk_path=gbk_path, + hmm_dir=hmm_dir, + use_neo4j_precalc=use_neo4j_precalc, + assemblies_must_have_x_matches=assemblies_must_have_x_matches, + nucleotide_sequences_must_have_x_matches=nucleotide_sequences_must_have_x_matches, + gene_clusters_must_have_x_matches=gene_clusters_must_have_x_matches, + break_bgc_on_gap_of=break_bgc_on_gap_of, + target_bgc_padding=target_bgc_padding, + ) + search_object.outdegree_table + search_object.prioritize_input_proteins( + max_domains_per_protein=max_domains_per_protein, + max_outdegree=max_outdegree, + max_query_proteins=max_query_proteins, + scatter=scatter, + locus_tag_bypass_list=locus_tag_bypass_list, + protein_id_bypass_list=protein_id_bypass_list, + ) + search_object.outdegree_table + search_object.search( + only_culture_collection=only_culture_collection, frac=frac, run_async=run_async + ) + # filters assemblies and nucleotide seqs that fall below threshold % of query proteins + search_object.filter() + # labels clusters with a unique id based on break_bgc_on_gap_of + search_object.label_clusters() + df = search_object._primary_bgc_regions() + df["n_start"] = df["n_start"] - search_object.target_bgc_padding + df["n_end"] = df["n_end"] + search_object.target_bgc_padding + search_object._bgc_regions_to_sg_object(df) + _ = search_object.sg_object.annotate_proteins_with_neo4j( + protein_uids=None, annotate_all=True, progress=False + ) + ######## + # add bgcs as gene_clusters to the locus objects + for i, row in df.iterrows(): + search_object.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid( + row["nucleotide_uid"] + ).add_bgcs_by_start_end( + start=row["n_start"] - search_object.target_bgc_padding, + end=row["n_end"] + search_object.target_bgc_padding, + uid=row["cluster"], + ) + # add input bgc as gene_cluster to the locus objects + if analyze_with == "hmmer": + search_object.sg_object.annotate_proteins_with_neo4j(annotate_all=True) + else: + search_object.sg_object.add_sequences_from_neo4j() + search_object._create_links( + tool=analyze_with, argstring="--fast --max-hsps 1", cpus=1 + ) + search_object._choose_group() + search_object._rank_order_bgcs() + assemblies = list( + dict.fromkeys([i.parent.parent for i in search_object.sorted_bgcs]) + ) + assemblies = [search_object.input_assembly] + assemblies + z = SerializeToClustermap( + sg_object=search_object.sg_object, + sorted_bgcs=[search_object.input_bgc] + search_object.sorted_bgcs, + link_df=search_object.link_df, + group_df=search_object.group_df, + ) + z.write("/Users/chase/Documents/test/cmap/clinker/clinker/plot/data.json") + + +def main(): + args = parser.parse_args() + + if args.max_domains_per_protein == 0: + args.max_domains_per_protein = None + if args.max_outdegree == 0: + args.max_outdegree = None + if args.max_query_proteins == 0: + args.max_query_proteins = None + _ = search_bgc( + gbk_path=args.gbk_path, + hmm_dir=args.hmm_dir, + use_neo4j_precalc=True, + assemblies_must_have_x_matches=args.assemblies_must_have_x_matches, + nucleotide_sequences_must_have_x_matches=args.nucleotide_sequences_must_have_x_matches, + gene_clusters_must_have_x_matches=args.gene_clusters_must_have_x_matches, + break_bgc_on_gap_of=args.break_bgc_on_gap_of, + target_bgc_padding=args.target_bgc_padding, + max_domains_per_protein=args.max_domains_per_protein, + max_outdegree=args.max_outdegree, + max_query_proteins=args.max_query_proteins, + scatter=args.scatter, + locus_tag_bypass_list=args.locus_tag_bypass_list, + protein_id_bypass_list=args.protein_id_bypass_list, + only_culture_collection=args.only_culture_collection, + frac=args.frac, + run_async=args.run_async, + ) + + +if __name__ == "__main__": + main() diff --git a/socialgene/clustermap/serialize.py b/socialgene/clustermap/serialize.py index ca7d8f32..467527fd 100644 --- a/socialgene/clustermap/serialize.py +++ b/socialgene/clustermap/serialize.py @@ -8,10 +8,9 @@ class SerializeToClustermap: Take a sg_object and serialize all BGC object to clustermap.js format """ - def __init__(self, sg_object, bgc_order, link_df, group_df): + def __init__(self, sg_object, sorted_bgcs, link_df, group_df): self._sg_object = sg_object - self._bgc_order = bgc_order - self._input_assembly = self._sg_object.assemblies[bgc_order[0]] + self.sorted_bgcs = sorted_bgcs self._link_df = link_df self._group_df = group_df self._reset() @@ -26,9 +25,12 @@ def _flatten_list(x): def _get_uid(self, obj=None): # increment then return a uid as a string - self._uid += 1 - self._uid_dict[str(self._uid)] = obj - return str(self._uid) + if obj in self._uid_dict: + return str(self._uid_dict[obj]) + else: + self._uid += 1 + self._uid_dict[str(self._uid)] = obj + return str(self._uid) def _build(self): self._reset() @@ -39,11 +41,13 @@ def _clusters(self): return { "clusters": [ { - "uid": self._get_uid(obj=self._sg_object.assemblies[k]), - "name": self._sg_object.assemblies[k].name, - "loci": self._loci(self._sg_object.assemblies[k]), + "uid": self._get_uid(obj=assembly), + "name": assembly.uid, + "loci": self._loci([i for i in assembly.gene_clusters]), } - for k in self._bgc_order + for assembly in list( + dict.fromkeys([i.parent.parent for i in self.sorted_bgcs]) + ) ] } @@ -73,14 +77,12 @@ def _locus(self, locus_name, locus_obj): "end": max((i.end for i in locus_obj.features)), } - def _loci(self, assembly_obj): + def _loci(self, loci): # return [ # self._locus(locus_name=k, locus_obj=v) for k, v in assembly_obj.loci.items() # ] return [ - self._locus(locus_name=k, locus_obj=gc) - for k, v in assembly_obj.loci.items() - for gc in v.gene_clusters + self._locus(locus_name=gc.parent.external_id, locus_obj=gc) for gc in loci ] def _create_groups_dict(self): @@ -92,14 +94,9 @@ def _create_groups_dict(self): with keys "uid", "label", and "genes". e.g. {"groups": [{"uid": "1", "label": "group1", "genes": ["1", "2", "3"]}] """ # Look in the clustermap uid dict (feature_to_cmap_uid_dict) for the cmap uid of each feature (query & target) in each group - self._group_df["query_cmap_uid"] = self._group_df["query"].apply( - lambda x: self.feature_to_cmap_uid_dict.get(x) - ) - self._group_df["target_cmap_uid"] = self._group_df["target"].apply( - lambda x: self.feature_to_cmap_uid_dict.get(x) - ) + log.info("Creating clustermap.js links") groups = ( - self._group_df.groupby(["target_cmap_uid", "target"])["query_cmap_uid"] + self._group_df.groupby(["query_feature"])["target_feature"] .apply(list) .reset_index() ) @@ -107,10 +104,11 @@ def _create_groups_dict(self): "groups": [ { "uid": self._get_uid( - obj=f"{i.target.external_id} {i.target.description}" + obj=f"{i['query_feature'].external_id} {i['query_feature'].description}" ), - "label": f"{i.target.external_id} {i.target.description}", - "genes": i.query_cmap_uid, + "label": f"{i['query_feature'].external_id} {i['query_feature'].description}", + "genes": [self.feature_to_cmap_uid_dict[i["query_feature"]]] + + [self.feature_to_cmap_uid_dict[i] for i in i.target_feature], } for x, i in groups.iterrows() ] @@ -124,18 +122,18 @@ def _create_links_dict(self): { "uid": self._get_uid(obj=None), "target": { - "uid": self.feature_to_cmap_uid_dict[i["target"]], - "name": self.feature_to_cmap_uid_dict[i["target"]], + "uid": self.feature_to_cmap_uid_dict[i["target_feature"]], + "name": self.feature_to_cmap_uid_dict[i["target_feature"]], }, "query": { - "uid": self.feature_to_cmap_uid_dict[i["query"]], - "name": self.feature_to_cmap_uid_dict[i["query"]], + "uid": self.feature_to_cmap_uid_dict[i["query_feature"]], + "name": self.feature_to_cmap_uid_dict[i["query_feature"]], }, - "identity": i.score, + "identity": i.pident if "pident" in i else i.score, } for x, i in self._link_df.iterrows() - if i["query"] in self.feature_to_cmap_uid_dict - and i["target"] in self.feature_to_cmap_uid_dict + if i["query_feature"] in self.feature_to_cmap_uid_dict + and i["target_feature"] in self.feature_to_cmap_uid_dict ] } @@ -145,4 +143,5 @@ def write(self, outpath): json.dump( self._build(), outfile, + indent=4, ) diff --git a/socialgene/compare_proteins/hmm/__init__.py b/socialgene/compare_gene_clusters/__init__.py similarity index 100% rename from socialgene/compare_proteins/hmm/__init__.py rename to socialgene/compare_gene_clusters/__init__.py diff --git a/socialgene/compare_gene_clusters/compare_gene_clusters.py b/socialgene/compare_gene_clusters/compare_gene_clusters.py new file mode 100644 index 00000000..98763b08 --- /dev/null +++ b/socialgene/compare_gene_clusters/compare_gene_clusters.py @@ -0,0 +1,30 @@ +from types import GeneratorType + +from socialgene.compare_proteins.diamond import DiamondBlastp +from socialgene.compare_proteins.hmmer import CompareDomains +from socialgene.compare_proteins.mmseqs import MMseqsEasySearch + + +class BGCComparison: + def __init__(self, tool): + match tool: + case "blastp": + self.tool = DiamondBlastp() + case "mmseqs2": + self.tool = MMseqsEasySearch() + case "hmmer": + self.tool = CompareDomains() + case _: + raise ValueError( + f"Tool {tool} not recognised, must be one of 'blastp', 'mmseqs2', 'hmmer'" + ) + + def compare(self, bgc1, bgc2, **kwargs): + if isinstance(bgc1, GeneratorType): + bgc1 = list(bgc1) + if isinstance(bgc2, GeneratorType): + bgc2 = list(bgc2) + return self.tool.reciprocal_hits( + self.tool.compare_proteins(bgc1, bgc2, **kwargs), + self.tool.compare_proteins(bgc2, bgc1, **kwargs), + ) diff --git a/socialgene/compare_proteins/base.py b/socialgene/compare_proteins/base.py new file mode 100644 index 00000000..35b6896b --- /dev/null +++ b/socialgene/compare_proteins/base.py @@ -0,0 +1,106 @@ +from abc import ABC, abstractmethod +from typing import List + +import pandas as pd + +from socialgene.base.molbio import Protein + +BlastTab_COLUMNS = { + "query": str, + "target": str, + "pident": "Float64", + "length": "Int64", + "mismatch": "Int64", + "gapopen": "Int64", + "qstart": "Int64", + "qend": "Int64", + "sstart": "Int64", + "send": "Int64", + "evalue": "Float64", + "score": "Float64", + "qlen": "Int64", + "slen": "Int64", +} + + +class ProteinComparison(ABC): + # TODO: force subclasses to implement self.score_column and self.score_scale and self.name + + @abstractmethod + def compare_proteins(self, p1: List[Protein], p2: List[Protein]): + ... + + def reciprocal_hits( + self, + q_vs_t_df: pd.DataFrame, + t_vs_q_df: pd.DataFrame, + ): + """This function takes a dataframe and returns a dataframe with the reciprocal best hits""" + q_vs_t_df = self.best_hit_to_query(q_vs_t_df) + t_vs_q_df = self.best_hit_to_query(t_vs_q_df) + t_vs_q_df.rename(columns={"query": "target", "target": "query"}, inplace=True) + df = pd.concat([q_vs_t_df, t_vs_q_df]) + del q_vs_t_df, t_vs_q_df + df.sort_values(by=self.score_column, ascending=False, inplace=True) + # remove non reciprocal + df = df[df.duplicated(["query", "target"], keep=False)] + df = df.drop_duplicates(subset=["query", "target", self.score_column]) + # keep the reciprocal hit with the highest bitscore + # but if multiple hits have the same highest bitscore, keep them all + return df[ + df.groupby(["query", "target"])[self.score_column].transform("max") + == df[self.score_column] + ] + + def best_hit_to_query( + self, + df: pd.DataFrame, + ): + return ( + df.reset_index(inplace=False, drop=True) + .sort_values(self.score_column, ascending=False) + .drop_duplicates("query", keep="first") + .reset_index(inplace=False, drop=True) + ) + + +class HMMDataFrame(ProteinComparison): + def __init__(self, score_column="score"): + self.score_column = score_column + + +class BlastTab(ProteinComparison): + def __init__(self, score_column="score"): + self.score_column = score_column + self.score_scale = float("inf") + + # def reciprocal_hits(self, q_vs_t_df: pd.DataFrame, t_vs_q_df: pd.DataFrame): + # # Create a new column in both dataframes: normalised bitscore + # q_vs_t_df["norm_bitscore"] = q_vs_t_df.bitscore / q_vs_t_df.length + # t_vs_q_df["norm_bitscore"] = t_vs_q_df.bitscore / t_vs_q_df.length + # # Create query and subject coverage columns in both dataframes + # q_vs_t_df["qcov"] = (q_vs_t_df.length / q_vs_t_df.qlen).clip(upper=1) + # t_vs_q_df["qcov"] = (t_vs_q_df.length / t_vs_q_df.qlen).clip(upper=1) + # q_vs_t_df["scov"] = (q_vs_t_df.length / q_vs_t_df.slen).clip(upper=1) + # t_vs_q_df["scov"] = (t_vs_q_df.length / t_vs_q_df.slen).clip(upper=1) + # q_vs_t_df = self.best_hit_to_query(q_vs_t_df) + # t_vs_q_df = self.best_hit_to_query(t_vs_q_df) + # t_vs_q_df.rename(columns={"query": "target", "target": "query"}, inplace=True) + # df = pd.concat([q_vs_t_df, t_vs_q_df]) + # del q_vs_t_df, t_vs_q_df + # df.sort_values(by="bitscore", ascending=False, inplace=True) + # # remove non reciprocal + # df = df[df.duplicated(["query", "target"], keep=False)] + # # keep the reciprocal hit with the highest bitscore + # # but if multiple hits have the same highest bitscore, keep them all + # return df[ + # df.groupby(["query", "target"])["bitscore"].transform(max) == df["bitscore"] + # ] + + # def best_hit_to_query(self, df: pd.DataFrame): + # return ( + # df.reset_index(inplace=False, drop=True) + # .sort_values("bitscore", ascending=False) + # .drop_duplicates("query", keep="first") + # .reset_index(inplace=False, drop=True) + # ) diff --git a/socialgene/compare_proteins/base_class.py b/socialgene/compare_proteins/base_class.py deleted file mode 100644 index 9cb4a020..00000000 --- a/socialgene/compare_proteins/base_class.py +++ /dev/null @@ -1,42 +0,0 @@ -from abc import ABC, abstractmethod -from collections import namedtuple - - -class CompareProteinsBaseClass(ABC): - _create_tuple = namedtuple( - "standard_protein_comparison", - ( - "query", - "target", - "score", - ), - ) - - def __init__(self): - self.protein_comparisons = [] - - @property - @abstractmethod - def df(self): - ... - # return ( - # pd.DataFrame(self.protein_comparisons) - # .sort_values(by=["mod_score"], ascending=False) - # .reset_index(inplace=False, drop=True) - # ) - - @abstractmethod - def compare_one_to_one(self, p1, p2): - ... - - @abstractmethod - def compare_one_to_many(self, p1_obj, p2_obj_list): - ... - - @abstractmethod - def compare_many_to_many(self, p1_obj_list, p2_obj_list): - ... - - @abstractmethod - def compare_all_to_all(self): - ... diff --git a/socialgene/compare_proteins/diamond.py b/socialgene/compare_proteins/diamond.py new file mode 100644 index 00000000..6024151e --- /dev/null +++ b/socialgene/compare_proteins/diamond.py @@ -0,0 +1,140 @@ +import os +import tempfile +from pathlib import Path + +import pandas as pd + +from socialgene.base.molbio import Protein +from socialgene.compare_proteins.base import BlastTab, BlastTab_COLUMNS +from socialgene.utils.logging import log +from socialgene.utils.run_subprocess import run_subprocess + + +class DiamondBlastp(BlastTab): + def __init__(self): + super().__init__() + self.temp_db_path = None + self.df = None + self.name = "Diamond BLASTp" + + def make_db( + self, + fasta_string: str, + db_path=None, + threads=1, + **kwargs, + ): + db_path = Path(db_path).with_suffix(".dmnd") + # run diamond makedb on fasta_file allow passing kwargs to diamond + command_list = [ + "diamond", + "makedb", + "--db", + str(db_path), + "--threads", + str(threads), + "--verbose", + ] + mes = run_subprocess( + command_list=command_list, + input=fasta_string, + text=True, + capture_output=True, + status=False, + **kwargs, + ) + log.debug(mes) + if not Path(db_path).exists(): + raise FileNotFoundError( + f"No database from DiamondBlastp make_db found at {db_path}" + ) + + @staticmethod + def run(fasta_path, db_path=None, threads=1, argstring="", **kwargs): + # run diamond blastp on query_fasta_string against db_path + if not Path(db_path).exists(): + raise FileNotFoundError( + f"No database from Diamond makedb found at {db_path}" + ) + with tempfile.TemporaryDirectory() as tmpdirname: + outpath = os.path.join(tmpdirname, "result.m8") + command_list = [ + "diamond", + "blastp", + "--query", + str(fasta_path), + "--db", + str(db_path), + "--out", + str(outpath), + "--threads", + str(threads), + "--verbose", + "--outfmt", + "6", + "qseqid", + "sseqid", + "pident", + "length", + "mismatch", + "gapopen", + "qstart", + "qend", + "sstart", + "send", + "evalue", + "bitscore", + "qlen", + "slen", + ] + if argstring: + argstring = [str(i) for i in argstring.split()] + command_list.extend(argstring) + mes = run_subprocess( + command_list=command_list, + check=False, + shell=False, + capture_output=True, + status=False, + ) + log.debug(mes) + if not os.path.exists(outpath): + log.warning("No output from MMseqs2 search") + else: + # sorted for reproducibility + return pd.read_csv( + outpath, + sep="\t", + header=None, + names=BlastTab_COLUMNS, + dtype=BlastTab_COLUMNS, + ).sort_values(["score", "query"], ascending=False) + + def compare_proteins( + self, queries, targets, cpus=1, argstring="--ultra-sensitive --max-hsps 1" + ): + # loop through protein list and write to temporary fasta file + if isinstance(queries, Protein): + queries = [queries] + if isinstance(targets, Protein): + targets = [targets] + if not all([isinstance(i, Protein) for i in queries]): + raise TypeError("queries must be a list of Protein objects") + if not all([isinstance(i, Protein) for i in targets]): + raise TypeError("targets must be a list of Protein objects") + with tempfile.TemporaryDirectory() as tmpdirname: + query_fasta_path = Path(tmpdirname, "queries.faa") + target_dmnd_path = Path(tmpdirname, "targets.dmnd") + with open(query_fasta_path, "w") as h: + for protein in queries: + h.write(protein.fasta_string_defline_uid) + self.make_db( + fasta_string="".join([i.fasta_string_defline_uid for i in targets]), + db_path=target_dmnd_path, + ) + return self.run( + fasta_path=query_fasta_path, + db_path=target_dmnd_path, + threads=cpus, + argstring=argstring, + ) diff --git a/socialgene/compare_proteins/hmm/hmmer.py b/socialgene/compare_proteins/hmm/hmmer.py deleted file mode 100644 index 678d5117..00000000 --- a/socialgene/compare_proteins/hmm/hmmer.py +++ /dev/null @@ -1,62 +0,0 @@ -from itertools import combinations, product -from multiprocessing import Pool - -import pandas as pd - -from socialgene.compare_proteins.base_class import CompareProteinsBaseClass -from socialgene.compare_proteins.hmm.scoring import _mod_score_tupler, mod_score - - -def picklable_modscore(p1, p2): - # named tuple doesn't work in multiprocessing - return mod_score(p1, p2)._asdict() - - -class CompareDomains(CompareProteinsBaseClass): - def __init__(self): - self.protein_comparisons = set() - - @property - def mod_score_df(self): - return ( - pd.DataFrame(self.protein_comparisons) - .sort_values(by=["mod_score"], ascending=False) - .reset_index(inplace=False, drop=True) - ) - - @property - def df(self): - return self.mod_score_df.filter(["query", "target", "mod_score"]).rename( - columns={"mod_score": "score"} - ) - - def compare_one_to_one(self, p1, p2): - return mod_score(p1, p2) - - def compare_one_to_many(self, p1_obj, p2_obj_list, filter=True): - for i in p2_obj_list: - temp = mod_score(p1_obj, i) - if not filter or temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_many_to_many(self, p1_obj_list, p2_obj_list, filter=True): - for i1, i2 in product(p1_obj_list, p2_obj_list): - temp = mod_score(i1, i2) - if not filter or temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_all_to_all(self, p1_obj_list, filter=True): - for i1, i2 in combinations(p1_obj_list, 2): - temp = mod_score(i1, i2) - if not filter or temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_all_to_all_parallel(self, p1_obj_list, cpus=1, only_hits=True): - # have to use _calculate_mod_score_not_named because named tuple can't pickle "protein_comparison_modscore" - with Pool(cpus) as p: - for i in p.starmap( - picklable_modscore, - combinations(p1_obj_list, 2), - ): - if not only_hits or i["jaccard"] > 0.001: - self.protein_comparisons.add(_mod_score_tupler(**i)) diff --git a/socialgene/compare_proteins/hmm/scoring.py b/socialgene/compare_proteins/hmm_scoring.py similarity index 91% rename from socialgene/compare_proteins/hmm/scoring.py rename to socialgene/compare_proteins/hmm_scoring.py index 228c9c46..1b221a5c 100644 --- a/socialgene/compare_proteins/hmm/scoring.py +++ b/socialgene/compare_proteins/hmm_scoring.py @@ -27,7 +27,7 @@ def mod_score(p1, p2): p1 (Protein): SocialGene Protein Class p1 (Protein): SocialGene Protein Class Returns: - dict: {l1, l2, levenshtein, jaccard, mod_score}; mod_score -> 2 = Perfectly similar; otherwise (1/Levenshtein + Jaccard) + dict: {ProteinClass, ProteinClass, levenshtein, jaccard, mod_score}; mod_score -> 2 = Perfectly similar; otherwise (1/Levenshtein + Jaccard) """ # If either protein contains no HMM annotations, # return a mod score with the worst scores possible @@ -35,10 +35,10 @@ def mod_score(p1, p2): raise TypeError(f"p1 type: {type(p1)}; p2 type: {type(p2)}") length_input_list_1 = len(p1.domains) length_input_list_2 = len(p2.domains) - if p1.uid == p2.uid: + if p1 == p2: return _mod_score_tupler( - p1.uid, - p2.uid, + p1, + p2, length_input_list_1, length_input_list_2, round(0, 2), @@ -49,8 +49,8 @@ def mod_score(p1, p2): # If either protein contains no HMM annotations, # return a mod score with the worst scores possible return _mod_score_tupler( - p1.uid, - p2.uid, + p1, + p2, length_input_list_1, length_input_list_2, round(100, 2), @@ -81,8 +81,8 @@ def mod_score(p1, p2): mod_score_value = (jaccard_score * 0.5) + mod_levenshtein_score return _mod_score_tupler( - p1.uid, - p2.uid, + p1, + p2, length_input_list_1, length_input_list_2, round(mod_levenshtein_score, 2), diff --git a/socialgene/compare_proteins/hmmer.py b/socialgene/compare_proteins/hmmer.py new file mode 100644 index 00000000..c9cfd8de --- /dev/null +++ b/socialgene/compare_proteins/hmmer.py @@ -0,0 +1,68 @@ +from itertools import combinations, product +from multiprocessing import Pool + +import pandas as pd + +from socialgene.compare_proteins.base import HMMDataFrame +from socialgene.compare_proteins.hmm_scoring import _mod_score_tupler, mod_score + + +def picklable_modscore(p1, p2): + # named tuple doesn't work in multiprocessing + return mod_score(p1, p2)._asdict() + + +class CompareDomains(HMMDataFrame): + def __init__(self): + self.name = "HMMER annotation comparison with SocialGene" + self.score_column = "score" + + def compare_proteins(self, queries, targets): + return pd.DataFrame( + ( + { + "query": i.query.uid, + "target": i.target.uid, + "query_n_domains": i.query_n_domains, + "target_n_domains": i.target_n_domains, + "jaccard": i.jaccard, + "score": i.mod_score, + } + for i in self.compare_many_to_many( + queries, targets, filter_non_hits=True + ) + ), + ).drop_duplicates(subset=["query", "target"]) + + def compare_one_to_one(self, p1, p2): + protein_comparisons = set() + return self.protein_comparisons_df(protein_comparisons) + + def compare_one_to_many(self, p1_obj, p2_obj_list, filter_non_hits=True): + for i in p2_obj_list: + temp = mod_score(p1_obj, i) + if not filter_non_hits or temp.jaccard > 0: + yield temp + + def compare_many_to_many(self, p1_obj_list, p2_obj_list, filter_non_hits=True): + for i1, i2 in product(p1_obj_list, p2_obj_list): + temp = mod_score(i1, i2) + if not filter_non_hits or temp.jaccard > 0: + yield temp + + def compare_all_to_all(self, p1_obj_list, filter_non_hits=True): + for i1, i2 in combinations(p1_obj_list, 2): + temp = mod_score(i1, i2) + if not filter_non_hits or temp.jaccard > 0: + yield temp + + def compare_all_to_all_parallel(self, p1_obj_list, cpus=1, only_hits=True): + # have to use _calculate_mod_score_not_named because named tuple can't pickle "protein_comparison_modscore" + with Pool(cpus) as p: + for i in p.starmap( + picklable_modscore, + combinations(p1_obj_list, 2), + ): + if not only_hits or i["jaccard"] > 0.001: + temp = _mod_score_tupler(**i) + yield temp diff --git a/socialgene/compare_proteins/mmseqs.py b/socialgene/compare_proteins/mmseqs.py new file mode 100644 index 00000000..2cc3c60f --- /dev/null +++ b/socialgene/compare_proteins/mmseqs.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +import logging +import os +import tempfile +from pathlib import Path + +import pandas as pd + +from socialgene.base.molbio import Protein +from socialgene.compare_proteins.base import BlastTab, BlastTab_COLUMNS +from socialgene.utils.logging import log +from socialgene.utils.run_subprocess import run_subprocess + +# mmseqs must be present on PATH + + +class MMseqsEasySearch(BlastTab): + def __init__(self): + super().__init__() + self.temp_db_path = None + self.df = None + self.name = "MMseqs2 EasySearch" + + @staticmethod + def make_db( + fasta_string: str, + db_path=None, + **kwargs, + ): + """Run MMseqs2 createdb on an input fasta file + + Args: + fasta_path (str): Path to fasta file + target_database (str): Path to MMseqs2 database + """ + + command_list = [ + "mmseqs", + "createdb", + "stdin", + str(db_path), + ] + mes = run_subprocess( + command_list=command_list, + input=fasta_string, + text=True, + capture_output=True, + status=False, + **kwargs, + ) + log.debug(mes) + if not Path(db_path).exists(): + raise FileNotFoundError( + f"No database from mmseqs make_db found at {db_path}" + ) + + @staticmethod + def index_database(database_path): + """Run MMseqs2 createindex on an MMseqs2 database + + Args: + target_database (str): Path to MMseqs2 database + """ + with tempfile.TemporaryDirectory() as tmpdirname: + command_list = [ + "mmseqs", + "createindex", + str(database_path), + str(tmpdirname), + ] + mes = run_subprocess( + command_list=command_list, + check=False, + shell=False, + capture_output=True, + status=False, + ) + logging.debug(mes) + if not os.path.exists(f"{database_path}.idx"): + logging.warning("No output from MMseqs2 createindex") + + def run( + self, + fasta_path, + target_database, + cpus=1, + argstring="", + ): + """Search an input fasta file against a target database with external MMseqs2 program + + Args: + fasta_path (str): Path to fasta file + target_database (str): Path to MMseqs2 database + argstring (str): additonal arguments to pass to mmseqs search, as a string + = + Returns: + pandas_df: dataframe of the mmseqs search result + """ + with tempfile.TemporaryDirectory() as tmpdirname: + outpath = os.path.join(tmpdirname, "result.m8") + command_list = [ + "mmseqs", + "easy-search", + str(fasta_path), + str(target_database), + str(outpath), + str(tmpdirname), + "--format-mode", + "0", + "--format-output", + "query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen", + "--threads", + str(cpus), + ] + if argstring: + argstring = [str(i) for i in argstring.split()] + command_list.extend(argstring) + mes = run_subprocess( + command_list=command_list, + check=False, + shell=False, + capture_output=True, + status=False, + ) + logging.debug(mes) + if not os.path.exists(outpath): + logging.warning("No output from MMseqs2 search") + else: + # sorted for reproducibility + return pd.read_csv( + outpath, + sep="\t", + names=BlastTab_COLUMNS, + dtype=BlastTab_COLUMNS, + ).sort_values(["score", "query"], ascending=False) + + def compare_proteins(self, queries, targets, cpus=1, argstring="", index=False): + # loop through protein list and write to temporary fasta file + if isinstance(queries, Protein): + queries = [queries] + if isinstance(targets, Protein): + targets = [targets] + if not all([isinstance(i, Protein) for i in queries]): + raise TypeError("queries must be a list of Protein objects") + if not all([isinstance(i, Protein) for i in targets]): + raise TypeError("targets must be a list of Protein objects") + + with tempfile.TemporaryDirectory() as tmpdirname: + query_fasta_path = Path(tmpdirname, "queries.faa") + target_dmnd_path = Path(tmpdirname, "targets") + with open(query_fasta_path, "w") as h: + for protein in queries: + h.write(protein.fasta_string_defline_uid) + self.make_db( + fasta_string="".join([i.fasta_string_defline_uid for i in targets]), + db_path=target_dmnd_path, + ) + if index: + self.index_database(target_dmnd_path) + return self.run( + fasta_path=query_fasta_path, + target_database=target_dmnd_path, + cpus=cpus, + argstring=argstring, + ) diff --git a/socialgene/compare_proteins/mmseqs_WIP.py b/socialgene/compare_proteins/mmseqs_WIP.py deleted file mode 100644 index 42ff592c..00000000 --- a/socialgene/compare_proteins/mmseqs_WIP.py +++ /dev/null @@ -1,98 +0,0 @@ -# flake8: noqa -from itertools import combinations, product -from multiprocessing import Pool -from pathlib import Path - -import pandas as pd - -from socialgene.compare_proteins.base_class import CompareProteinsBaseClass -from socialgene.mmseqs.create_database import create_database -from socialgene.mmseqs.search import search -from socialgene.mmseqs.subset_database import createsubdb - -target_proteins = list(sg.proteins.values()) - -# with tempfile.TemporaryDirectory() as tmpdirname: - -tmpdirname = "/home/chase/Downloads/work" -fasta_path = Path(tmpdirname, "target_fasta.fa") -mmseqs_path = "/home/chase/Downloads/work/mmseqsdb" - -with open(fasta_path, "w") as handle: - handle.writelines( - (i.fasta_string_defline_uid for i in target_proteins if i.sequence) - ) - - -create_database(fasta_path, mmseqs_path) - -mmseqs_lookup = "/home/chase/Downloads/work/mmseqsdb.lookup" -mmseqs_output_subset_ids = "/home/chase/Downloads/work/subset_ids" -mmseqs_subset_db = "/home/chase/Downloads/work/subset_db" - -# create a subdb using protein hash ids -protids = [i.uid for i in target_proteins][0:4] -with open(mmseqs_lookup, "r") as mml: - with open(mmseqs_output_subset_ids, "w") as h: - for i in mml: - temp = i.split("\t") - if temp[1] in protids: - z = "\t" - h.writelines(f"{i.split(z)[0]}\n") - -createsubdb(olddb=mmseqs_path, newdb=mmseqs_subset_db, idfile=mmseqs_output_subset_ids) - - -a = search(fasta_path, mmseqs_subset_db) - - -class CompareDiamond(CompareProteinsBaseClass): - def __init__(self): - self.protein_comparisons = set() - - @property - def mod_score_df(self): - return ( - pd.DataFrame(self.protein_comparisons) - .sort_values(by=["mod_score"], ascending=False) - .reset_index(inplace=False, drop=True) - ) - - @property - def df(self): - return self.mod_score_df.filter(["query", "target", "mod_score"]).rename( - columns={"mod_score": "score"} - ) - - def compare_one_to_one(self, p1, p2): - return self.calculate_mod_score(p1, p2) - - def compare_one_to_many(self, p1_obj, p2_obj_list): - for i in p2_obj_list: - temp = self.calculate_mod_score(p1_obj, i) - if temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_many_to_many(self, p1_obj_list, p2_obj_list): - for i1, i2 in product(p1_obj_list, p2_obj_list): - temp = self.calculate_mod_score(i1, i2) - if temp.jaccard > 0: - self.protein_comparisons.add(temp) - - def compare_all_to_all(self, p1_obj_list): - for i1, i2 in combinations(p1_obj_list, 2): - temp = self.calculate_mod_score(i1, i2) - if temp.jaccard > 0: - temp - self.protein_comparisons.add(temp) - - def compare_all_to_all_parallel(self, p1_obj_list, cpus=1): - # have to use _calculate_mod_score_not_named because named tuple can't pickle "protein_comparison_modscore" - with Pool(cpus) as p: - for i in p.starmap( - _calculate_mod_score_not_named, - combinations(p1_obj_list, 2), - ): - # i[5] == jaccard - if i[5] > 0.001: - self.protein_comparisons.add(_create_tuple(*i)) diff --git a/socialgene/hmm/hmmer.py b/socialgene/hmm/hmmer.py index 2e402a58..2e8d784b 100644 --- a/socialgene/hmm/hmmer.py +++ b/socialgene/hmm/hmmer.py @@ -20,7 +20,7 @@ class HMMER: def __init__(self, hmm_filepath=None): self._check_hmmer_exists() - self.input_path = hmm_filepath + self.input_path = Path(hmm_filepath) self.decompressed_hmm_path = None @staticmethod diff --git a/socialgene/mmseqs/__init__.py b/socialgene/mmseqs/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/socialgene/mmseqs/create_database.py b/socialgene/mmseqs/create_database.py deleted file mode 100644 index 89032fc7..00000000 --- a/socialgene/mmseqs/create_database.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 createdb on an input fasta file" -) - -parser.add_argument( - "input_fasta_path", - help="Path to fasta file", -) - -parser.add_argument( - "target_database_path", - help="Path to fasta file", -) - - -def create_database(fasta_path, database_path): - """Run MMseqs2 createdb on an input fasta file - - Args: - fasta_path (str): Path to fasta file - target_database (str): Path to MMseqs2 database - """ - - command_list = [ - "mmseqs", - "createdb", - str(fasta_path), - str(database_path), - ] - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.debug(mes) - if not os.path.exists(database_path): - logging.warning("No output from MMseqs2 create_database") - - -def main(): - args = parser.parse_args() - - fasta_path = args.input_fasta_path - target_database = args.target_database_path - - if not os.path.exists(fasta_path): - raise FileExistsError(fasta_path) - - create_database(fasta_path, target_database) - - -if __name__ == "__main__": - main() diff --git a/socialgene/mmseqs/index_database.py b/socialgene/mmseqs/index_database.py deleted file mode 100644 index 39a745c0..00000000 --- a/socialgene/mmseqs/index_database.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os -import tempfile - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 createindex on an MMseqs2 database" -) - -parser.add_argument( - "target_database_path", - help="Path to fasta file", -) - - -def index_database(database_path): - """Run MMseqs2 createindex on an MMseqs2 database - - Args: - target_database (str): Path to MMseqs2 database - """ - with tempfile.TemporaryDirectory() as tmpdirname: - command_list = ["mmseqs", "createindex", database_path, tmpdirname] - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.debug(mes) - if not os.path.exists(f"{database_path}.idx"): - logging.warning("No output from MMseqs2 createindex") - - -def main(): - args = parser.parse_args() - target_database = args.target_database_path - - if not os.path.exists(target_database): - raise FileExistsError(target_database) - - index_database(target_database) - - -if __name__ == "__main__": - main() diff --git a/socialgene/mmseqs/search.py b/socialgene/mmseqs/search.py deleted file mode 100644 index ea3bd8a9..00000000 --- a/socialgene/mmseqs/search.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os -import tempfile - -import pandas as pd - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 search on an input amino acid sequence" -) - -parser.add_argument( - "input_fasta_path", - help="Path to fasta file", -) - -parser.add_argument( - "target_database_path", - help="Path to fasta file", -) - -MMSEQS_OUT_COLUMNS = { - "query": str, - "target": str, - "pident": "Float32", - "alnlen": "Int32", - "mismatch": "Int32", - "gapopen": "Int32", - "qstart": "Int32", - "qend": "Int32", - "tstart": "Int32", - "tend": "Int32", - "evalue": "Float32", - "bits": "Int32", -} - - -def search(fasta_path, target_database, argstring=""): - """Search an input fasta file against a target database with external MMseqs2 program - - Args: - fasta_path (str): Path to fasta file - target_database (str): Path to MMseqs2 database - argstring (str): additonal arguments to pass to mmseqs search, as a string - = - Returns: - pandas_df: dataframe of the mmseqs search result - """ - with tempfile.TemporaryDirectory() as tmpdirname: - outpath = os.path.join(tmpdirname, "result.m8") - command_list = [ - "mmseqs", - "easy-search", - str(fasta_path), - str(target_database), - str(outpath), - str(tmpdirname), - "--format-mode", - "0", - ] - if argstring: - argstring = [str(i) for i in argstring.split()] - command_list.extend(argstring) - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.info(mes) - if not os.path.exists(outpath): - logging.warning("No output from MMseqs2 search") - else: - return ( - pd.read_csv( - outpath, - sep="\t", - names=MMSEQS_OUT_COLUMNS, - dtype=MMSEQS_OUT_COLUMNS, - ) - .sort_values(["pident", "bits"], ascending=False) - .reset_index(drop=True) - ) - - -def main(): - args = parser.parse_args() - - fasta_path = args.input_fasta_path - target_database = args.target_database_path - - if not os.path.exists(fasta_path): - raise FileExistsError(fasta_path) - if not os.path.exists(target_database): - raise FileExistsError(target_database) - - print(search(fasta_path, target_database)) - - -if __name__ == "__main__": - main() diff --git a/socialgene/mmseqs/subset_database.py b/socialgene/mmseqs/subset_database.py deleted file mode 100644 index 33c56546..00000000 --- a/socialgene/mmseqs/subset_database.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - - -import argparse -import logging -import os - -from socialgene.utils.run_subprocess import run_subprocess - -# mmseqs must be present on PATH - - -parser = argparse.ArgumentParser( - description="Run MMseqs2 subsetdb on an input fasta file" -) - -parser.add_argument( - "idfile", - help="Path to idfile file", -) - -parser.add_argument( - "olddb", - help="Path to olddb", -) - -parser.add_argument( - "newdb", - help="Path to newdb", -) - - -def createsubdb(olddb, newdb, idfile): - command_list = [ - "mmseqs", - "createsubdb", - str(idfile), - str(olddb), - str(newdb), - ] - mes = run_subprocess( - command_list=command_list, check=False, shell=False, capture_output=True - ) - logging.debug(mes) - if not os.path.exists(newdb): - logging.warning("No output from MMseqs2 createsubdb") - - -def main(): - args = parser.parse_args() - createsubdb(args.olddb, args.newdb, args.idfile) - - -if __name__ == "__main__": - main() diff --git a/socialgene/parsers/fasta.py b/socialgene/parsers/fasta.py index d0f24730..b27da477 100644 --- a/socialgene/parsers/fasta.py +++ b/socialgene/parsers/fasta.py @@ -77,7 +77,7 @@ def parse_fasta_file( _open = fh.open_read(input) input_from = input with _open as handle: - self.add_assembly(uid=assembly_id) + self.add_assembly(uid=assembly_id, parent=self) self.assemblies[assembly_id].add_locus(external_id=assembly_id) record_counter = 0 for seq_record in SeqIO.parse(handle, "fasta"): diff --git a/socialgene/parsers/genbank.py b/socialgene/parsers/genbank.py index 0d301ebe..ed4a5436 100644 --- a/socialgene/parsers/genbank.py +++ b/socialgene/parsers/genbank.py @@ -51,7 +51,7 @@ def _add_assembly(self, input_path, seq_record): if not assembly_id: # use random unique id as assembly id assembly_id = str(uuid4()) - self.add_assembly(uid=assembly_id) + self.add_assembly(uid=assembly_id, parent=self) return assembly_id def _add_locus(self, seq_record, assembly_id): diff --git a/socialgene/parsers/hmmmodel.py b/socialgene/parsers/hmmmodel.py index cf70f315..da8dc6cb 100644 --- a/socialgene/parsers/hmmmodel.py +++ b/socialgene/parsers/hmmmodel.py @@ -342,7 +342,7 @@ def read(self, filepath, base_dir=None): for i, model in enumerate( self.read_model_generator(filepath=filepath, base_dir=base_dir) ): - log.debug(f"Reading model {str(i+1)} from {filepath}") + log.debug(f"Reading model {str(i + 1)} from {filepath}") model._n = self.dict_key_index self.models[self.dict_key_index] = model self.dict_key_index += 1 @@ -366,7 +366,6 @@ def read_model_generator(self, filepath, base_dir=None): # switch to model addition _add_line_contents = self.temp_model.add_model if line.startswith("//"): - log.warning(Path(base_dir).name) # yield model self.temp_model.add_model_hash() self.temp_model.find_pfam_accessions() diff --git a/socialgene/scoring/__init__.py b/socialgene/scoring/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/socialgene/search/base.py b/socialgene/search/base.py index f505f4b7..f6fe5019 100644 --- a/socialgene/search/base.py +++ b/socialgene/search/base.py @@ -12,11 +12,11 @@ TimeElapsedColumn, ) from rich.table import Table -from textdistance import levenshtein +from textdistance import jaccard, levenshtein from socialgene.base.socialgene import SocialGene from socialgene.clustermap.serialize import SerializeToClustermap -from socialgene.compare_proteins.hmm.hmmer import CompareDomains +from socialgene.compare_gene_clusters.compare_gene_clusters import BGCComparison from socialgene.neo4j.neo4j import GraphDriver from socialgene.utils.logging import log @@ -63,7 +63,7 @@ class SearchBase(ABC): break_bgc_on_gap_of (int): Number of base pairs to break a BGC on if a gap is greater than this value. sg_object (SocialGene): SocialGene object containing the database. raw_search_results_df (pd.DataFrame): Initial search results as a pandas DataFrame. - bgc_order (pd.DataFrame): Order of the BGCs in the search results as a pandas DataFrame. + sorted_bgcs (pd.DataFrame): Order of the BGCs in the search results as a pandas DataFrame. link_df (pd.DataFrame): Linkage information between BGCs in the search results as a pandas DataFrame. group_df (pd.DataFrame): Grouping information for BGCs in the search results as a pandas DataFrame. n_searched_proteins (int): Number of proteins searched in the database. @@ -100,7 +100,7 @@ def __init__( self.break_bgc_on_gap_of = break_bgc_on_gap_of self.sg_object = SocialGene() self.raw_search_results_df = pd.DataFrame() - self.bgc_order = None + self.sorted_bgcs = None self.link_df = pd.DataFrame() self.group_df = pd.DataFrame() self.n_searched_proteins = None @@ -149,21 +149,28 @@ def _modify_input_bgc_name(self): ].uid = self.modified_input_bgc_name self.input_assembly = self.sg_object.assemblies[self.modified_input_bgc_name] + def create_input_bgcs(self): + for locus in self.input_assembly.loci.values(): + locus.add_bgcs_by_feature(features=locus.features) + self.input_bgc = list(self.input_assembly.gene_clusters)[0] + def _input_bgc_info(self): log.info( - f"Input BGC has {len(self.sg_object.proteins)} proteins and/or psuedogenes" + f"Input BGC has {len(self.sg_object.proteins)} proteins and/or pseudogenes" ) def read_sg_object(self, sg_object: SocialGene): self.sg_object = sg_object self._modify_input_bgc_name() self._input_bgc_info() + self.create_input_bgcs() def read_input_bgc(self, gbk_path: str): # parse input BGC self.sg_object.parse(gbk_path) self._modify_input_bgc_name() self._input_bgc_info() + self.create_input_bgcs() def filter(self, drop_raw_search_results_df=False): if self.raw_search_results_df.empty: @@ -185,9 +192,9 @@ def filter(self, drop_raw_search_results_df=False): if self.working_search_results_df.empty: raise ValueError("No hits found after filtering at the assembly level") - def cluster(self): - # assign clusters of proteins that aren't interrupted by a gap greater than break_bgc_on_gap_of - self._label_clusters() + # def cluster(self): + # # assign clusters of proteins that aren't interrupted by a gap greater than break_bgc_on_gap_of + # self._label_clusters() def _primary_bgc_regions(self): return self._collapse_cluster( @@ -205,27 +212,49 @@ def annotate(self): def _rank_order_bgcs(self): # TODO: should be based off of create_links() output not bgc_df """Sorts assembly IDs by comparing the levenshtein distance of ordered query proteins (forward and reverse)""" - log.info("Start: Ranking BGCs") - - input_bgc_nuc_id = list(self.input_assembly.loci.keys())[0] - input_protein_order = [ - i.uid - for i in self.input_assembly.loci[ - input_bgc_nuc_id - ].features_sorted_by_midpoint - ] - temp_dict = {} - for i in self.bgc_df["assembly_uid"].unique(): - temp_list = self.bgc_df[self.bgc_df["assembly_uid"] == i]["query"].to_list() - forward = levenshtein(input_protein_order, temp_list) - temp_list.reverse() - reverse = levenshtein(input_protein_order, temp_list) - temp_dict[i] = min(forward, reverse) - self.bgc_order = [self.modified_input_bgc_name] - self.bgc_order.extend( - [k for k, v in sorted(temp_dict.items(), key=lambda item: item[1])] + + def hl(group): + a = levenshtein( + list(group.sort_values(["t_start"])["query"]), + list(group.sort_values(["q_start"], ascending=True)["query"]), + ) / len(group) + b = levenshtein( + list(group.sort_values(["t_start"])["query"]), + list(group.sort_values(["q_start"], ascending=False)["query"]), + ) / len(group) + lev = a if a < b else b + lev = 1 - lev + jac = jaccard( + list(group.sort_values(["t_start"])["query"]), + list(group.sort_values(["q_start"], ascending=False)["query"]), + ) + return (jac * 0.5) + lev + + q_obj = pd.DataFrame( + [ + { + "query_feature": i, + "q_start": i.start, + "strand": i.strand, + } + for i in list(self.input_bgc.features_sorted_by_midpoint) + ] ) - log.info("Finish: Ranking BGCs") + temp = { + k[1]: hl( + pd.merge( + q_obj, + group, + on="query_feature", + how="outer", + suffixes=("", "_delme"), + ) + ) + for k, group in self.link_df.groupby( + ["query_cluster", "target_cluster"], sort=False + ) + } + self.sorted_bgcs = sorted(temp, key=temp.get, reverse=True) def _bgc_regions_to_sg_object(self, collapsed_df): now = time.time() @@ -281,7 +310,9 @@ def _prune_links(self, df: pd.DataFrame) -> pd.DataFrame: df_to_return = pd.concat([df_to_return, temp]) return df_to_return - def _create_link_df(self, query_gene_cluster, target_gene_cluster): + def _create_link_df( + self, query_gene_cluster, target_gene_cluster, tool="blastp", **kwargs + ): """read the args, things are backward b/c context is searching the found bgc against the input bgc Args: @@ -291,38 +322,52 @@ def _create_link_df(self, query_gene_cluster, target_gene_cluster): Returns: _type_: _description_ """ - compare_domains = CompareDomains() - compare_domains.compare_many_to_many( - (self.sg_object.proteins[gc.uid] for gc in query_gene_cluster.features), - (self.sg_object.proteins[gc.uid] for gc in target_gene_cluster.features), + + comparator = BGCComparison( + tool=tool, ) - df_to_return = compare_domains.df[compare_domains.df["score"] > 0.75] + protein_comparisons_df = comparator.compare( + query_gene_cluster.protein_iter, target_gene_cluster.protein_iter, **kwargs + ) + protein_comparisons_df["query_protein"] = protein_comparisons_df["query"].apply( + lambda x: query_gene_cluster.proteins[x] + ) + protein_comparisons_df["target_protein"] = protein_comparisons_df[ + "target" + ].apply(lambda x: target_gene_cluster.proteins[x]) + # protein_comparisons_df.drop(columns=["query", "target"], inplace=True) q_obj = pd.DataFrame( [ - {"obj": i, "query": i.uid, "q_start": i.start, "strand": i.strand} + { + "query_cluster": query_gene_cluster, + "query_feature": i, + "query_protein": i.protein, + "q_start": i.start, + "strand": i.strand, + } for i in list(query_gene_cluster.features_sorted_by_midpoint) ] ) t_obj = pd.DataFrame( [ - {"obj": i, "target": i.uid, "t_start": i.start, "strand": i.strand} + { + "target_cluster": target_gene_cluster, + "target_feature": i, + "target_protein": i.protein, + "t_start": i.start, + "strand": i.strand, + } for i in list(target_gene_cluster.features_sorted_by_midpoint) ] ) - q_temp = pd.merge(df_to_return, q_obj, on="query", how="left") - temp = pd.merge(q_temp, t_obj, on="target", how="left").sort_values("t_start") - temp = temp[["obj_x", "obj_y", "score"]] - temp = temp.rename( - columns={ - "obj_x": "query", - "obj_y": "target", - } + q_temp = pd.merge(protein_comparisons_df, q_obj, on="query_protein", how="left") + return ( + pd.merge(q_temp, t_obj, on="target_protein", how="left") + .sort_values("t_start") + .rename(columns={comparator.tool.score_column: "score"}, inplace=False) ) - return temp - def _create_links( - self, - ) -> pd.DataFrame: + def _create_links(self, tool="hmmer", cutoff=None, **kwargs) -> pd.DataFrame: """ Loop through gene_cluster compare the proteins to the input BGC @@ -338,19 +383,43 @@ def _create_links( """ # Initialize an empty DataFrame to store the links log.info("Start: Creating links") - link_df = pd.DataFrame({"query": [], "target": [], "score": []}) - for k_assembly, v_assembly in self.sg_object.assemblies.items(): - for k_locus, v_locus in v_assembly.loci.items(): - for gene_cluster in v_locus.gene_clusters: + link_df = pd.DataFrame() + progress_bar = Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TextColumn("• Time elapsed "), + TimeElapsedColumn(), + transient=True, + ) + log.info( + f"Finding reciprocal best hits; protein similarity via {BGCComparison(tool=tool).tool.name}" + ) + with progress_bar as pg: + task = pg.add_task( + description="a", + total=len(list(self.sg_object.get_all_gene_clusters())), + ) + for target_gene_cluster in self.sg_object.get_all_gene_clusters(): + if target_gene_cluster != self.input_bgc: + pg.update( + task, + description=( + f"\nComparing {self.input_bgc_id} to {target_gene_cluster.parent.external_id}:{min([i.start for i in target_gene_cluster.features])}-{max([i.start for i in target_gene_cluster.features])}" + ), + ) link_df = pd.concat( [ link_df, self._create_link_df( - gene_cluster, - self.input_assembly.loci[self.input_bgc_id], + query_gene_cluster=self.input_bgc, + target_gene_cluster=target_gene_cluster, + tool=tool, ), ] ) + + pg.update(task, advance=1, description="[blue]Complete Task") self.link_df = link_df log.info(f"Finish: Creating links; {len(link_df)} links produced") if self.link_df.empty: @@ -364,6 +433,9 @@ def _choose_group( log.info("Finish: Assigning target BGC proteins to input BGC groups") log.warning("No links to group by, no groups produced") return + # df = self.link_df + # df["query"] = df["query"].apply(lambda x: x.uid) + # df["target"] = df["target"].apply(lambda x: x.uid) self.group_df = ( self.link_df.sort_values(by="score", ascending=False) .groupby(["query"], observed=False) @@ -446,13 +518,13 @@ def _sort_genes_by_start(self): def label_clusters( self, ): - """Walks through the df and for each nucleotide sequence, labels proteins within gap tolerance + """Walks through the df and for each nucleotide sequence, assigns genes/protein to a cluster, breaking on gap tolerance Args: break_bgc_on_gap_of (int, optional): Breaks a "cluster" when the diff of two proteins' starts is greater than this. Defaults to 20000. Returns: - pd.DataFrame: input df + cluster column (integers) + pd.DataFrame: input df + cluster column (integer) """ log.info( f"Grouping protein hits if less than {str(self.break_bgc_on_gap_of)} bp apart" @@ -507,7 +579,7 @@ def _filter_clusters(self, df, threshold) -> pd.Index: def write_clustermap_json(self, outpath): cmap = SerializeToClustermap( - bgc_order=self.bgc_order, sg_object=self.sg_object, link_df=self.link_df + sorted_bgcs=self.sorted_bgcs, sg_object=self.sg_object, link_df=self.link_df ) cmap.write(outpath=outpath) @@ -537,7 +609,7 @@ def user_friendly_hit_df(self, truncate_description_to_n_chars=20): "q_description": i.description, "q_external_id": i.external_id, } - for i in self.input_assembly.loci[self.input_bgc_id].features + for i in self.input_bgc.features ] ) db_res = pd.merge(db_res, query_df, on="query", how="left") diff --git a/socialgene/search/hmmer.py b/socialgene/search/hmmer.py index ee64a3c0..69bffedc 100644 --- a/socialgene/search/hmmer.py +++ b/socialgene/search/hmmer.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import re from math import ceil from pathlib import Path @@ -18,6 +19,7 @@ from rich.table import Table from socialgene.base.socialgene import SocialGene +from socialgene.compare_proteins.hmmer import CompareDomains from socialgene.config import env_vars from socialgene.neo4j.neo4j import GraphDriver from socialgene.search.base import SearchBase @@ -33,11 +35,51 @@ ) -async def _find_similar_protein( +def _find_similar_proteins( domain_list, frac: float = 0.75, only_culture_collection: bool = False ): """ - The function `_find_sim_protein` is an asynchronous function that queries a Neo4j graph database to + The function `_find_similar_proteins` is a synchronous function that queries a Neo4j graph database to + find similar proteins based on protein domain information. + + Args: + protein_domain_dict (Dict[List[str]]): The `protein_domain_dict` parameter is a dictionary where + the keys are protein uids and the values are lists of domain uids. + frac (float): The `frac` parameter is a float value that represents the fraction of protein + domains that need to match in order for a protein to be considered similar. By default, it is set to + 0.75, meaning that at least 75% of the protein domains need to match + + Returns: + The function `_find_sim_protein` returns a Pandas DataFrame containing the results of the query + executed in the Neo4j database. The DataFrame has columns `assembly_uid`, `nucleotide_uid`, + `target`, `n_start`, and `n_end` + """ + # TODO: move async driver to reg driver class module + # with GraphDriver() as driver: + with GraphDriver().driver.session() as driver: + res = driver.run( + f""" + WITH $domain_list AS input_protein_domains + MATCH (prot1:protein)<-[a1:ANNOTATES]-(h0:hmm) + WHERE h0.uid IN input_protein_domains + WITH input_protein_domains, prot1, count(DISTINCT(h0)) as initial_count + WHERE initial_count > size(input_protein_domains) * $frac + MATCH (n1:nucleotide)-[e1:ENCODES]->(prot1) + {'WHERE (n1)-[:ASSEMBLES_TO]->(:assembly)-[:FOUND_IN]->(:culture_collection)' if only_culture_collection else ''} + MATCH (a1:assembly)<-[:ASSEMBLES_TO]-(n1) + RETURN a1.uid as assembly_uid, n1.uid as nucleotide_uid, prot1.uid as target, e1.start as n_start, e1.end as n_end + """, + domain_list=list(domain_list), + frac=frac, + ).to_df() + return res + + +async def _find_similar_proteins_async( + domain_list, frac: float = 0.75, only_culture_collection: bool = False +): + """ + The function `_find_similar_proteins_async` is an asynchronous function that queries a Neo4j graph database to find similar proteins based on protein domain information. Args: @@ -68,7 +110,7 @@ async def _find_similar_protein( WITH input_protein_domains, prot1, count(DISTINCT(h0)) as initial_count WHERE initial_count > size(input_protein_domains) * $frac MATCH (n1:nucleotide)-[e1:ENCODES]->(prot1) - {'WHERE (n1)-[:ASSEMBLES_TO]->(:assembly)-[:FOUND_IN]->(:culture_collection)' if only_culture_collection else '' } + {'WHERE (n1)-[:ASSEMBLES_TO]->(:assembly)-[:FOUND_IN]->(:culture_collection)' if only_culture_collection else ''} MATCH (a1:assembly)<-[:ASSEMBLES_TO]-(n1) RETURN a1.uid as assembly_uid, n1.uid as nucleotide_uid, prot1.uid as target, e1.start as n_start, e1.end as n_end """, @@ -82,8 +124,10 @@ async def _find_similar_protein( sema = asyncio.BoundedSemaphore(5) -async def _find_similar_protein_multiple( - dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False +async def _find_similar_proteins_async_multiple( + dict_of_domain_lists, + frac: float = 0.75, + only_culture_collection: bool = False, ): # create task group # TODO: if webserver in future this could be used to control max time of search @@ -92,7 +136,7 @@ async def _find_similar_protein_multiple( # create and issue tasks tasks = { k: group.create_task( - _find_similar_protein( + _find_similar_proteins_async( domain_list=v, frac=frac, only_culture_collection=only_culture_collection, @@ -106,11 +150,47 @@ async def _find_similar_protein_multiple( return pd.concat([v.result().assign(query=k) for k, v in tasks.items()]) +def _find_similar_proteins_sync_multiple( + dict_of_domain_lists, + frac: float = 0.75, + only_culture_collection: bool = False, +): + results = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=20, + ) as executor: + futures = {} + for k, v in dict_of_domain_lists.items(): + future = executor.submit( + _find_similar_proteins, + domain_list=v, + frac=frac, + only_culture_collection=only_culture_collection, + ) + futures[future] = k + for f in concurrent.futures.as_completed(futures): + result = f.result() + result["query"] = futures[f] + results.append(result) + return pd.concat(results) + + +def run_search( + dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False +): + for k, v in dict_of_domain_lists.items(): + _find_similar_proteins( + domain_list=v, + frac=frac, + only_culture_collection=only_culture_collection, + ) + + def run_async_search( dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False ): return asyncio.run( - _find_similar_protein_multiple( + _find_similar_proteins_async_multiple( dict_of_domain_lists=dict_of_domain_lists, frac=frac, only_culture_collection=only_culture_collection, @@ -118,7 +198,17 @@ def run_async_search( ) -class SearchDomains(SearchBase): +def run_sync_search( + dict_of_domain_lists, frac: float = 0.75, only_culture_collection: bool = False +): + return _find_similar_proteins_sync_multiple( + dict_of_domain_lists=dict_of_domain_lists, + frac=frac, + only_culture_collection=only_culture_collection, + ) + + +class SearchDomains(SearchBase, CompareDomains): """ Class search for similar BGCs in a SocialGene database, using domains Args: @@ -158,7 +248,7 @@ def __init__( self._set_hmm_outdegree() self._get_outdegree_per_hmm_per_protein() - def search(self, **kwargs): + def search(self, run_async=True, **kwargs): dict_of_domain_lists = ( self.outdegree_df.groupby("protein_uid", observed=True)["hmm_uid"] .apply(list) @@ -172,13 +262,22 @@ def search(self, **kwargs): ) as progress: task = progress.add_task("Progress...", total=2) progress.update(task, advance=1) - self.raw_search_results_df = run_async_search( - dict_of_domain_lists, **kwargs - ) - self.raw_search_results_df["query"] = self.raw_search_results_df[ - "query" - ].astype("category") - progress.update(task, advance=1) + if run_async: + self.raw_search_results_df = run_async_search( + dict_of_domain_lists, **kwargs + ) + self.raw_search_results_df["query"] = self.raw_search_results_df[ + "query" + ].astype("category") + progress.update(task, advance=1) + else: + self.raw_search_results_df = run_sync_search( + dict_of_domain_lists, **kwargs + ) + self.raw_search_results_df["query"] = self.raw_search_results_df[ + "query" + ].astype("category") + progress.update(task, advance=1) log.info( f"Initial search returned {len(self.raw_search_results_df):,} proteins, found in {self.raw_search_results_df.assembly_uid.nunique():,} genomes" ) @@ -270,8 +369,8 @@ def prioritize_input_proteins( max_domains_per_protein: int = None, max_outdegree: int = None, scatter: bool = False, - bypass_locus: List[str] = None, - bypass_pid: List[str] = None, + locus_tag_bypass_list: List[str] = None, + protein_id_bypass_list: List[str] = None, ): """Rank input proteins by how many (:hmm)-[:ANNOTATES]->(:protein) relationships will have to be traversed @@ -281,27 +380,27 @@ def prioritize_input_proteins( max_domains_per_protein (int): Max domains to retain for each individual protein (highest outdegree dropped first) max_outdegree (int): HMM model annotations with an outdegree higher than this will be dropped scatter (bool, optional): Choose a random subset of proteins to search that are spread across the length of the input BGC. Defaults to False. - bypass_locus (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None. - bypass_eid (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. + locus_tag_bypass_list (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None. + protein_id_bypass_list (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None. Returns: pd.DataFrame: pd.DataFrame({"external_id":[], "hmm_uid":[], "outdegree":[]}) """ log.info("Prioritizing input proteins by outdegree") loci_protein_ids = set() - if bypass_locus: + if locus_tag_bypass_list: # bypass using locus tags loci_protein_ids = { i.uid - for i in self.input_assembly.loci[self.input_bgc_id].features - if i.locus_tag in list(bypass_locus) + for i in self.input_bgc.features + if i.locus_tag in list(locus_tag_bypass_list) } - if bypass_pid: + if protein_id_bypass_list: # bypass using an external protein ids loci_protein_ids.update( { i.uid - for i in self.input_assembly.loci[self.input_bgc_id].features - if i.external_id in list(bypass_pid) + for i in self.input_bgc.features + if i.external_id in list(protein_id_bypass_list) } ) len_start = self.outdegree_df["outdegree"].sum() @@ -357,6 +456,7 @@ def prioritize_input_proteins( threshold = ceil(n_input_proteins * max_query_proteins) else: threshold = max_query_proteins + m_start = self.outdegree_df["outdegree"].sum() if scatter: temp = list( self.input_assembly.loci[ @@ -372,7 +472,6 @@ def prioritize_input_proteins( | self.outdegree_df["protein_uid"].isin(loci_protein_ids) ] else: - m_start = self.outdegree_df["outdegree"].sum() log.info( f"'max_query_proteins' is set to {threshold}, will limit search to {threshold} of {n_input_proteins} input proteins" ) @@ -449,9 +548,9 @@ def _outdegree_table_stats(self): d = [ { "protein_uid": i.uid, - "desc": f"{i.external_id} | {i.locus_tag if i.locus_tag else 'no-locus-tag'} | {i.description}", + "desc": f"{i.external_id} | {i.locus_tag if i.locus_tag else 'no-locus-tag'} | {i.description}", } - for i in self.input_assembly.loci[self.input_bgc_id].features + for i in self.input_bgc.features ] d = pd.DataFrame(d) temp = pd.merge(temp, d, on="protein_uid", how="left") diff --git a/socialgene/utils/logging.py b/socialgene/utils/logging.py index d5006066..c73840f3 100644 --- a/socialgene/utils/logging.py +++ b/socialgene/utils/logging.py @@ -1,8 +1,6 @@ import logging import sys -from socialgene.config import env_vars - # If {rich} is installed use it, otherwise.... don't try: from rich.console import Console @@ -11,7 +9,7 @@ c = Console(width=150) # https://rich.readthedocs.io/en/stable/logging.html logging.basicConfig( - level="NOTSET", + level=logging.INFO, # format="%(filename)s/%(module)s/%(funcName)s\::: %(message)s", format="%(message)s", datefmt="%Y-%m-%d %H:%M:%S", @@ -25,7 +23,7 @@ except ImportError: log = logging.getLogger(__name__) -log.setLevel(env_vars["SOCIALGENE_LOGLEVEL"]) +# log.setLevel(env_vars["SOCIALGENE_LOGLEVEL"]) # handler = logging.StreamHandler(stream=sys.stdout) # log.addHandler(handler) diff --git a/socialgene/utils/ncbi_ftp.py b/socialgene/utils/ncbi_ftp.py index 56f66d67..70487619 100644 --- a/socialgene/utils/ncbi_ftp.py +++ b/socialgene/utils/ncbi_ftp.py @@ -9,7 +9,7 @@ class NcbiFtp: def __init__(self): log.debug( - f'Connected to "{self.ftp.host}" on {self.ftp.port};\n\n NCBI FTP info: {self.ftp.welcome.replace("220","")}' + f'Connected to "{self.ftp.host}" on {self.ftp.port};\n\n NCBI FTP info: {self.ftp.welcome.replace("220", "")}' ) self.ftp.set_pasv(True) self.assembly_paths = [] diff --git a/socialgene/utils/run_subprocess.py b/socialgene/utils/run_subprocess.py index 06f43566..55b90de6 100644 --- a/socialgene/utils/run_subprocess.py +++ b/socialgene/utils/run_subprocess.py @@ -12,6 +12,7 @@ def run_subprocess( check=True, shell=False, capture_output=True, + status=False, **kwargs, ): """Run something in a separate process @@ -31,11 +32,21 @@ def run_subprocess( command_list_string = " ".join(command_list) else: command_list_string = command_list - log.info(f"Executing external program:\n{command_list_string}") - with console.status( - "", - spinner="bouncingBar", - ) as status: + if status: + log.info(f"Executing external program:\n{command_list_string}") + with console.status( + "", + spinner="bouncingBar", + ) as status: + result = subprocess.run( + command_list, + check=check, + shell=shell, + capture_output=capture_output, + **kwargs, + ) + else: + log.debug(f"Executing external program:\n{command_list_string}") result = subprocess.run( command_list, check=check, @@ -43,8 +54,7 @@ def run_subprocess( capture_output=capture_output, **kwargs, ) - if result.stderr: - log.error(f"Error code: {result.returncode}") - log.error(f"Error stdout: {result.stderr.decode('utf-8')}") - raise SystemExit - _ = status + if result.returncode != 0: + log.error(f"Error code: {result.returncode}") + log.error(f"Error stdout: {result.stderr.decode('utf-8')}") + raise SystemExit diff --git a/tests/python/classes/test_mibig_fasta_parser.py b/tests/python/classes/test_mibig_fasta_parser.py index 7b35d7d0..ca2fbbce 100644 --- a/tests/python/classes/test_mibig_fasta_parser.py +++ b/tests/python/classes/test_mibig_fasta_parser.py @@ -17,124 +17,146 @@ def test_fasta_file_parse(): sg_object = SocialGene() gbk_path = os.path.join(FIXTURE_DIR, "lagriamide_mibig_bgc0001946.gbk") sg_object.parse(gbk_path) - sg_object.write_fasta(outpath=fp.name) + sg_object.write_fasta(outpath=fp.name, external_id=True) fasta_object = SocialGene() fasta_object.parse(fp.name) protein_parse_results = { - k: [v.description, v.external_id, v.domains] + k: [v.description, v.external_id, v.domains, v.sequence] for k, v in fasta_object.proteins.items() } assert protein_parse_results == { "Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl": [ - "Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl", - "Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl", + "AXA20086.1", + "AXA20086.1", set(), + "MQEYCRLRRKLTLELSPEDAADVAQEAFERTLRYMRKHDGRVASPVGLLVRIALNLQIDRGRRRKHLPTALDESWEHPRWDITPEDEVVGRQSVTQLVETLDKLAPRRREAFVLCRLHGLTYQDAAKKMGIRPSVVREYLVDAVRACRDSVDWAVSRVKCNTPLVNGLELSRSG", ], "ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn": [ - "ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn", - "ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn", + "AXA20087.1", + "AXA20087.1", set(), + "MAPTGIILGDTNGLRISLVCNPFVSPKIMPVGAIASSRSIYATIPLAMDGPVYWILWTDNPSNR", ], "-l7xLyFZbiZENPLq_GML8JyTRF1Srawr": [ - "-l7xLyFZbiZENPLq_GML8JyTRF1Srawr", - "-l7xLyFZbiZENPLq_GML8JyTRF1Srawr", + "AXA20088.1", + "AXA20088.1", set(), + "MSEYIHKSHNVTVLTYHVVLWRGIEKQSLMTGLAKY", ], "T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0": [ - "T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0", - "T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0", + "AXA20089.1", + "AXA20089.1", set(), + "MSDLPPRFAKSAAVERRFSQTKQGCNAGFARAMPGLVSFFTPQFADMNKLIDYWRATVACGRMGEGMSERGRHTEFSVVLLRNRDSAPVFGVVLRA", ], "AStsOnOU5ZWxURs9PrTiWjddkuQXfanl": [ - "AStsOnOU5ZWxURs9PrTiWjddkuQXfanl", - "AStsOnOU5ZWxURs9PrTiWjddkuQXfanl", + "AXA20090.1", + "AXA20090.1", set(), + "MQSVNPIGETKKALRLLQELVGRELSLPADEVPVDTDYLNMGFSSRSLIGLIQQLSSLLRTKLNPSVLFDYRTLTEFADYLAEHHAQWLTAIEREQPAERNQEAGDETSVAVPLSEAQRGLWFLQKNHPWMAAYNVPLCLRLSPRVQRERMRQACAWLPRRWPVLGASVQRADGRLVMQTQPARKLTWQEHHAEDWSEAQRLAWLGDRLAEPFDVDNGPLLRAYWLGGEPDGASRLLLVIHHLVIDAVSVGVLLAGLRKTYADLEGWRDLSGAVDDSAYGAFVAAEAERLAGAEGFARLAYWREQLADVPGSLGLPLDRARGVTPSFKGCTLRRELQAALGESLRAYTERHRVYPSTLLLAVFQGLLSRHAGRDDVVVGMAMDERDAASAGLVGMFVNMLPMRARGLGRRGFVEDMQALQRQLVDAMAQAYPFPALVRELGLSGSDASPLFQAAFLYQDTLDIDVLNGVDDWVWEEALYQEGEYELVLEVRRRLKGYALYFKYDPTLWDKSTIERWLAHYLRLLEGVLAAPQKRLGEHELRGEHERARLAALQGEVRDWALTPVSILFERQAIANAQAWAVSDDQQRWCYAELAAHSAAIAQRLHVQGIGTGSIVGVCQGRSPWLLASLLGIWQAGAAYIPLDPAYPVERLRYMLEDSGASAVLSDTSHLVLVQALAGMLPVWAADAAAPSTSAPSFPPPQPEDLAYVLYTSGSTGRPKGVRISHGALSNFLQSMAEKPGLTAGDRLLAITTISFDIAGLELYLPLIGGGECVLCPEEIVRDARRLKAEVEHVRPTLMQATPATWSMLFHAGWRNAEELKVLCGGEALPARLKQRFDEIGTTVWNLYGPTETTIWSTLAKLDAEDTSIHIGQPIANTQAYVLDQEGREQPIGIEGELYLGGAGLAQGYHGQPERTAQAFIDHPLGRLYKTGDLARWRADGQLEYRGRSDQQVKVNGHRVEPGEIEAVLEQSGLVKQAAIVLREGAHGSQLAAWCVPTKVTQGDTWLDPVQIQVLQAQLRDRVPAYMQPSIWLGTAALPKTLNGKIDRQVLSARALPEQREEKAPPSAVTRSAARRALESRLQALWAQVLERSTVSRDERFMEAGGNSVAAVLLAERIQAEFGRAFGVAQVFAYTSVAAQAAYLDASDLTFEALANEPIATQVIVAEPERDASGVAEDALAIIGIACHFPGAEDHRAFWNNLRAGHDSGKLFSPEALRAAGVPERLIADPHYVPIRYGIEGKAEFDADFFNLSPRAASLMDPQYRLLLQQAWAAIEDAGYTPEVIPDTGVFMSASFSAYQARLQDPSAVEAGDRYVAWLMSQGGSLPTLISYHLGLTGPSLFIQTNCSSFLSALAAARSSLLARESRLALVGAATLFTEDSKGYLHEPGLNFSSDGHCKTFDASADGMVSGEGCGVILLKQAREAVADGDHIYALLRGIAVNNDGADKAGFYAPSVRGQSEVIEQALRQARIDPGQIGYVEAHGTGTRLGDPIEVTALSETYRRHTQATQYCAIGSVKSNLGHLDTAAGLAGCIKLALSLQHGEIPPTLHYQQPNPAIDFAASPFYVAEHLQAWPPGPRLAGLSAFGIGGTNTHAILEAYLEESVAARVDGAQLIVLSARTQERLHAVERQLLDHLDSSASLPSLRDLAYTLQVGRRGMTHRLAFVVEDVSALRRQLSDHLAGRKVGHEGDCDRGHAVLHAFENDGDSARLFAQWAAAGKLDKLAALWVQGLSLDWALLHGGQRPRRVSLPTYPFERTRHWLEVPLAASAVSPESPELATERSWDGMSYLPRWVATPATAPDVEEVPAPASLLIVAPEASARADELFEWCTRRWPSAALRLIRLGRENLWLGEAERVCDSFDDGQALTEALREGTWPDSVVFLAEEAGSTLNWPGHPESAEAQLLRVQQALSQSQPAQRIEFYLVTIEANASDALTGGGLSGLAYALAQGDHRLRLRHLSLDADVWSASSWWQLWAEPASDRGERVRFSGGERWRQRFDRLNWGSLRDGGLKQGGVYLIAGGSGTLGIAISRYLIERYRARVIWLGRSRADAPELVARRQVLEAGALLPGYVQADLTDATAVHEAVARARQIHGTIDGAIFSAMYRGADAPAERWSPTALAGAVAVKALGARHFYQALLGESLDFLCYYSSVQSFAFLSARNSAAYAVGVAAADRYVQLIRAQSPFRVGLIHWGYWQDTLAGTALEQDLARHFTGIHADEACVFFERFVAALGQGMLDQAICLKASDTVRDVMPSTVDDVVVLTTAGTPSFLDGFDAAALRSTVMPPDWTELDRRLRGLLCAQLRYLGLFDQAGVSWESEQLRRKLGVIDDYRRWWDECCTEMLEKEGWIRTRKGRVELLQALSLEQADALWQDWEHDKADYLKDPQLRAALLLVEDCLRHLPAVLLGRTSATSVIFADGSMSKVAGLYQGTAWTDSFNHQVADTVEVYVRHRLAADPLARLRVLEVGAGTGGTTAKVLPRLAAIGAPMLEYCYTDLSEAFLAQARERHLANYPYLRTCRCDIERPLAEQGIVPGSYDIVIATNCLHATHDIRATLRHVQAALRCHGVLIANEGVSKSLLGTLTFGLLEGWWLYDDPQLRIPGSPLLDSAHWRALLDEAGMRPVCLDGPGRELQQVWVAQSRGLIRPGGFASASATRLPAPSSVPAKAGVAKPVVSASPVSSSEVVPYAAIAAEIRACLAETLKREAAGLADETAFSDYGIDSILSVALVKRLNQRLGVQLGQAVIYDYSTIAALSRHVLERRDGQAAMVPRQFGVVAASESTVVEVPVLAPVTVSEAVPPAAEAARPTPIAVIGMALQVPDAEDADTFWANLLSGHDSIRELPEAYGRPAHGASPRGGALQGRDYFDAAFFGLSNEEAAGMSPAQRLVLEEGWKALEDAGYDPRSLRGSRTAVFVGAEPSGFFQGSFSGSSEAIIASRLSYLLDLKGPALVVNTGCSSGAMAIHLACESLRRGEIQLALSGGVASALSVEGLRHLADAGMLSPQGHCLSFEATGDGMVLSEAVGMLVLKRLDQAIADGDAIHGVIAASGSNQDGTSNGITAPNGRAQEDLLGEIWARHGIAPEHISHFEVHGTGTSLGDAVEGNAISRAFGRVTARRGFCVIGTSKTHIGHTGAASGVVGLIKLLLSLRHRQLPGLLHFERANPLIDWNASALRLPEATTAWEGEPDLPRYAGLNAFGHSGTNVHMVVREYGPGEGDQRGPQLLECAQEVLLPLSAASALQLARSARRLLDFLDEEAGHGQSRPSLAELAYTCQTGRSALVERAVLKAGDREQLRALLLALAEQRPMAGLWRGSVDPETTLVAAANRDGLDELAESWIRGAEVDWQKLYGPVRPRRCHLPAYPFDRRHFPWHLAASVPSPVSRHRVPVATEPIATLTTVTAMPAWRFVLAATAEAGAEPRAQATQWLCRWLAMRLQRPVQALNPQLSYRELGLTSLGLVALSEELSRLLGVVVLPSLLFEYPSIASLAAHLAEQHAALLSRVQSIPLTGADSPSVGTPAEASVPDRVLSVLQAYRNGALNHAQARTLLAETTP", ], "IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ": [ - "IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ", - "IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ", + "AXA20091.1", + "AXA20091.1", set(), + "MNAFMTLDQMIALYKAGKVSVDQVKEAFARLPDATADTGIVTDVELSETQRGLWSLQKAYPWLAAYNVPLCLRLPADLDRERFLRACAGLLERWPVLGASVEQQDGPLRLRMASVSGLSLEQDDGLAWSESERLDWLRERIEQPFDLAKGPLLRAHWLEGSGGGESLFLLVVHHLVVDGASVGLLLAGLHEAYRALGNGEAMPSASGSDGYLGFVQAERVRLQGDQAARRLAYWREQMADAPSSLGLPLDRPRSATPSFKGRTLRRELLSALGDSLNAYTEYHGVYPSTLLLAVFQGLLSRHAGCDDVVVGMAIDERDAASAGLVGMFVNMLPMRARGLGQRGFGEDVQALQRQLVDAMAQAYPFPALVRELGLSGGEASPLFQAAFLYQDMHDMLDTEALAEVSKWTWEEALYQEGEYELVLEIRRRAQGYVLYFKYDPMLWDESTIERWLAHYLHLLEGVLADPRKRLGEHELRGEHERAWLAAWQGEVRDWALTPVPALFEHQVAANAQAWAVSDDQQRWCYAELAAYGTAIAQRLHAQGIGSGSIVGVCQGRSPWLLASLLGIWRAGAAYVPLDPAYPVERLRYMLEDSGASAVLSDTLHLAQVQALAGILPVWAADAIGPLTSVPLPTLQPEDLAYVLYTSGSTGRPKGVRISHGALSNFLQSMAEQPGLMAGDRLLAITTISFDIAGLELYLPLIGGGECVLCPAEIARDGRRLKAEVERVRPTLLQATPATWSMLFHAGWHNAERLKVLCGGEALPVRLKQRFDEIGTTVWNLYGPTETTIWSTLAKLDAEDTSIHIGQPIANTQAYVLDQEGREQPIGIEGELYLGGAGLAQGYHGQPERTAQAFIDHSLGRLYKTGDLARWRADGQLEHRGRSDQQVKVNGHRVEPGEIEAVLEQSGLVKQAAIVLREGAHGSQLAAWCVPTKGTHEDTWLDPVQIQVLQAQLRDRVPAYMQPSIWLGTAVLPQTPNGKVDRRVLCARALPAQREANTQPPSATPRNAVRQELESRLQALWIEVLERPMVDRDERFMEAGGNSVAAVLLAERIQAEFGRVFGVAQVFAHTSVAAQAAYLDEVQPVSTPGETTDSSAPKPDTPETVLAEDTLAIIGIACHFPGAEDHRAFWENLRAGHDSGKLFSPEALREAGVPEPLIADPQYVPVHYGIEGKAEFDAEFFNLPPRTVTRMDPPFRLLLQHAWAAIEDAGYTPEAIPDTAVYMATGGPLRLAELTEPGSPGDSDDYVNWLLAQPGTVATMVSYQLGLRGPSYAVHANCSSSLVGMHAAAQSLRSGEVRTALVGAASLFGDDSLGYRYEPGLNFSSDGHCKPFDLRADGLVAGEGVAVVLLKRAREAVADGDHIYALLRSVAVNNDGADKAGFYAPSVRGQSEVIEQALRQACIDPDQIGYVEAHGTGTRLGDPIEVTALSETYRRHTQATQYCAIGSVKSNLGHLDTAAGLAGCIKLALSLEHGEIPPTLHYQQPNPAIDFAASPFYVAEHLQAWPPGPRLAGLSAFGIGGTNTHAILEAWPPATARLLQSEPAPGVAQVFPLSAMQADRLPVYARRLATFLRGPYAASLRLADIAYTLQTGRRSMRSRCAFVAETLDQLLAALDDCADTVDTQDAAAASPPAPADGWQLQRDAHGLAAAWQQGEPIDWNALQAVAPWSARRVSLPTYPFSPKRWPRTAARVAASTQAAVLHPLLHRNTSDFAAYRYSARFGGDEFFLADHVMGGHKVLPGVAHLEMACAAFAQAIAAPQAALELRDVVWIRPVGVDAPLDVHTVLRPQADGSADFEICSQPDAAGERIVHSQGRVMPVDTVESEAELLALDTLRQQLAQAHRDAASYYRDFEQGGASYGPAFRALEQLWLGDGQALARLVLPAARHEGAERYGLHPSLLDAAVQAAFIGINALRAAAGVASSEQGGSLPFELKRVQLLVPCEPVMWAWVRYTQGIGAGERVQRMDVDLCDEQGRVRVRLHGLARLAALVPKLAAPSLQLPQWEDAPLVYGADAPPAYAQRLLVLCGLPAHGLEAESGADLVERLEAGTGLDSVQDYALRLLARVQVFMRDKPRGRCLLQVVIPAKGEMVSLAALAGLVRCLRLEHPGVSAQLIAVDPAIAAPALAALLREEGREAAEAQIRHLHGRRQRRALVPLLPSAATTVSQPWRKGGVYLITGGAGGLGWLFAQEIAARCEGAGIILLGRSALAPGQATRIERLGREGTRVVYRQADVTDAAAVEAAVQAARTLGPLRGVLHAAGVLRDGLLLNKSEEQARAVLAPKLTGSLVLDRATRELDLDFFVLFSSISALGSVGQSDYAVANAFLDEFAHWRATAVARGDRRGRSLAIAWPLWAEGGMRPAVAALAGMPEAWRSVAMPSAQGLAYFYRALASVYAQVVVPSLAQAAPAEATSLVAGAVVASTVILPTAPAAPSSTPSTHKASMASAAVSMIDVEAFAARLRGVVLGLVTQLLGVPTEQVDLDEEFNAFGFDSISLTNLANRLNQELRLNLTPAIFFEHASVNRFVEHLLAEHADRLDFLVQTAAPARAEEAKSMPTVTSAVAATASDDIAIIGMSARLPMAVDLDEFWDNLLAGRDCIGEIPADRWDWREIYGDPLREPNKTDAKWGGFIDGVADFDPLFFGISPREAEAMDPHQRLLMSYVWQALEDAGYTRQALSGSDTGVFVGLGGSDYGQRVVAADGGVESHTLMGLLPSMAPARMSHFMNWHGPSEFIDTACSSSLVAVHRAAQAIAAGDCSLAVVGGVMAILSPTSHIGFSKAGMLSKDGRCKTFSSEADGYVRGEGVGMLVLKRLSAARADGDCIHAVIRASVVNHGGRANTLTSPNPSAQAELIESACRKAGVAVEQIGYIEAHGTGTRLGDPIEINGLKSAFKTLGHPMRGAWCGLGSVKTNIGHLELAAGIAGLLKLVLQMRHRTLVASLHCEQVNPYIELEGSPFYLVRENRPWPTGRDAFGRPAPRLAGVSSFGFGGVNAHVLVQEHLADAPPEEDEASVLIVLSARSENALRGRARDLLVYLERRGLDRSTASVYDNRRTELEQHIRLALAGLLDVDVHEISIDESFEAYGLDALARNRLAAALDESSPALLAATLRAGSVAHLADEWLASQGYAPVSVASVESSIRLGDLAYTLQVGREPMEHRLGFIAASFAQLNERLRAFLDGQDGMHELHRGQAKFGKNGLSLLGADDDMEGTIASWVVKKKYALMLKLWVNGMELDWRLLHVGRQLQRISLPAYPFDYRRCWVGETPAAARETDKAAGASEPPALDVALDETSGEVNQAESAQFLLPVWDAVHPELLADTPPQGRILLYGGDQGMVDAWRGGAVGLATLVVTPGASIEALSAQLAAAGSICEVLFVAPGEADVDPGNAQLIDAQQQGVLGLFRLVKALLATGHGLRDLAWTVVTTQAVDLGDGLPVRPAHAAIHGLAGALAREYPHWRLHLVDVAVDEADDWRNWRRLPPEENGATWCRRGGEWFRQSLLPYQPDTTNLPLPYRQGGIYLVIGGAGGLGEAWSRHMIERYQAQLIWLGRSAPNARIGERIDALAALGPVPRYIQADAGDEAALRRAAAEIAREFPRLDGIVHSALVLADQGLASMSEAEFSGALAAKVDTAVLSTRVFTNQPLDFVLFFSSMMSFGRAAGQSNYAAGCMFADAYARRLAQLARYPVKIVNWGYWGSVGIVSRADYQARMRRAGIGSIEPAEGMRALEVLLRAPVPQLGFVKTRIGVGGDSLRQYADTIPAVTARVMLDTPVPALRREDLLFADQRLESLVRGLLARQLETLGEHPVLPLYQRWLAESRRVLAEVPPPATTVEALWRAWDKHALTRRHLDGSTPELDLLRNCLQALPAILLGQRTATEVLFPQSSLSLVEAIYRDNPVADLFNGRLQQWVLAYLRERLARDPRAQLRILEIGAGTGGTSAGLLAALRPVHEHIAEYSFTDVSRAFLIQAQERFSVGFPSLVTRLFDVEQPLAAQDLPADYFDLVVAANVLHATSEIRTALRNAKAALRKGGVLLLNEIAGQSLFTHLSFGLLEGWWRYRDPAVRLPASPALSPKNWARVLREEGFGEIDFLLKERLDLGQQLVVAQSNGLVRQAEARGGGAEKGRHGSVVVSNTAFVATQARPAAPAVPPQAVAHATVEEPVAVDAVVHRVLLEELARTLKLDSGEIDTQMPFSDYGIDSILGVGLVKQFNDRLGLNLNTTILFEHSSLERLVVHILQRHRREAITALGLVVGPVLGAAEPAPPPAPAVAASCESGRASGAVDELIPAPEVELCRAETGPLEIAVIGMSAQVPGADDAETFWHNLMHGVDGVTQLPAAYLDRARQGDDKQAYYLWGGTLAARAAFDPQFFSISPREAKSMNPHQRLVLQESWKALEDAGYDPRSLAQRRVGSFVGAEPSGYFHETFTGSSDAIIASRLAYFLDLKGPALVINTGCSSSATAIYLACESLRHGESELALAGGVYAVLNETGLVSLAQLDMLSNSGRCHSFDAAADGTVFSEAVGMVVLKRMDDALRDGDPIYASIVASGINQDGASNGITAPSGEAQEQLLLETYRRFSIDPRHIGYVEAHGTGTRLGDPVEANALLRAFRAHTTDRDYCAVGSAKAHIGHTAAASGVIGLVKLLLSLKHGRTPGLLHFRELNPLIEWQDSPFFIPTANRDWPVESGRPRMAALNSFGHSGTNVHLVLKEYVDPVEAADDAQQPGWHLLPLSATNETRLRAYAGKLARYLREAGEGVNLGDVAHTLQHGRIALKRRWCLLVANAAGAIAQLQAYADGADVAELAGVHGLADGTLVTDRIEAAATPELLAALARRWASGAMVDWPLAGPTARRIHLPSYPFARDHYWMDGGALAATTVRTVTAAPRVPIMHPLLHARVDTATGPRFSAQLSGEEAFFRDHRILGRPTFPGAAYLEMACAAAARVLGTEALRLSAVVWSRPLQASAGGVALGIELQARDGDGCWRFEIAAAGQVHCQGLAAKAEARENAFERQDLGALLARMDRQGVGHDDCYAAFDAVGIHYGPSHRAIAHLHLGHDEVLARLVLDPAAALDSEATAAAYRLHPSLLDAALQATLGLSVGSARGGAYVPFALERLVLCASTGFQLWAWVRYTAGSRADEAVRKYDIDLLDDQGGLIARLLGFAFRPVEAVAPASADPVLLWCRDWKPDPVPLVGDSAPVRLSVLVSDRAAWRFDEPTDGSLLAHLQIDSIQEAPDQWYGRVLRGLFELIKAQAQKTVQACLLQVLVPAWGRAALLSGLSGLLRTATLEFPRLSTQLLAFDAPPVGLSALLEANARQPWHAQLRYLDGTRLAPAWAEVAGSTGAQMSALPWREHGVYLLTGGLGGLGRLLAEEILGQTPDATVVLCGRAAPDAEAAQWLRTTGGNGRLRYRRADVADAAQVREMVADIVVCCGTLHGVFHSAGVLRDSYLPGKTSAQIDAVLAPKVAGTVNLDEATQSLHLDLFVLFAAGAGALGNPGQADYAAANAFLDAYAGWRNEQAAQGRRRGQAVAFDWPLWAEGGMRISDSQREAIEAANGMRALERDEGIRALYRGIASGEAQLLVGVGDHARLRAWIQHLHAEPTAPAARSAQTGTVVPAGTVRALRERTRQFLIDSAASILELQPQDLDPRDELSDYGFDSITLTELTHRLNRQFELELVPTVLFEHPTIAKLTAHLLESFPEALARAFPEDVAGPPPRATQAPLATPERDMGEMRAADALRNTNTNTHAGAVAIIGMSGRFPGAPDLASFWQVLAEGRDCIGEIPADRWDWRQYYGDATQDGRHTRVKEAGFIEGVAEFDPLFFGITPREAGLMDPQQRLLLTHAWAAIEDAGYAPSSLAGGRTAIFVGTAPSGYSSLIEQAAMGLDGHSSTGSVASLGPNRLSYLLDLHGPSEPVETACSSALVAIHRAVRAIRHGDCETALAGGVNTIVLPEVHISFDKAGMLSPDGRCKTFSRHADGYGRGEGVGILLLKDLAAAERDGDAIHAVIRSTVENHGGRASSLTAPNSKAQTELIKAAVRDAGIEADSIGYIETHGTGTALGDPIEINGLKAAFEELQAEAGGAAPIPDSCALGSVKTNIGHLELAAGVAGVIKVVLQLRHRTLARSLHAEEPNPYIRLAGTPFYLLGETRPWPMPRDAGGRPLPRRAGVSSFGFGGVNAHVLLEEYVGSRAELPASEPRGPYLFVLSARQPARLVEQAAQLVAFLRDETRPALAELIYTLQVGRDAMEERAAFLVADYAELTQRLAEFVAGRAGAWLHRGRARHDSVAATLFDDQNEQREAVRVWLAQGRHDKLLGLWTQGLEVDWRVLHAGPPPRRAHLPTYAFARERYWPPRPDRPAARPETRVNPLMMQEEAANEDELDRHSALLDQLIHRQISADEAVRLARRQEQL", ], "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5": [ - "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", + "AXA20092.1", + "AXA20092.1", set(), + "MTDHVAEIYHQVASGQLSKDEALARLCQVKDEQAAQGVAKQVHKPAQPVAVSREAVTQALAALYAGQSKIAPEAIDPLEPLESYGIDSIVIAGMNRELGERFASLSKTLFFEHRTLDSLAGFLWREREPACRAWLLASGTTVRTVAASPASRIEVAASAAAASAISDEPIPGDRAGTGAPETFGPANGMAVDASAWPPIAIVGLAGRYPQAADLDAFWRNLREGRDCIVEVPVERWPLDGFYEPDPDRAIASGRSYGKWGGFLEGFADFDALFFSISPLEAMGMDPQERLFLQSAWHALEDAGYTRESLAERCGGRVGVFAGVTRNGFGLLGLEAWHAGGMVFSQPAFSSIPNRVSYALDLRGPSLPVDTMCSSSLTAIHEACAALHRGDCVMALAGGVNLCVHPASYVGLATGRMLSRDGRCRSFGAGGDGYVPGEGVGVVMLKPLAEAQASGDRIHGVIRATSVNHGGRTNGFTVPNPTAQAELIAESLRKSGIHPRAIGYVEAHGTGTALGDPIEVSGLVQAFAPYTRERGFCALGSAKSNLGHLEAAAGIAGLTKVLLQMRHGELAPSLHAAQLNPNIDFEGTPFVVQRELAPWAEPELDLDGQLRRYPRIASVSSFGAGGANAHLLVEQYLAPAVPSVSSAGPQAIVLSARTPERLCAAVQALLDHVESGGGTAAGLAANLSAWLVEELAAIVGVEATAIDPHETLANLGVEILHRTRWYERVQERLNLPWSLKNFLDQDSVQQLGNTLLREQGATVVAQFHAPVAAPVLADLAYTLQIGREAMPERLAFVAADLAELATGLRGFLDGASRLPLWHGKAARGRALPARVDQAQWQAWIDARDWSQLLPAWVAGHELPWRDMPGAPGARRIGLPLYPFAAERYWVDPQSLRPRPTASLESRLHPLVRKRVDSVEGPAFLSRFSGAESFFSDHRVGGRSILPGVAYLEMARAAASLAADGATIRSLRNVVWARPIEAGADGVAVTLRLQPHQQGSWRYEVLGADDGVHGQGLAELALPEAPPVDLDLAALRARMQGGALANEALYQAYAAMGIAYGAAHRGFVQALVGESELLAELCLPSAVQADAQAYVLHPSLMDAAFQATLGLYLLSKRADAAKAMLPFALETLELHWAPPARVWAWIRSRGERSGIEKFDIELCDEQGRICVRMLGFSSRVLEAPAVSPEVPPAVLEAPSLLLSRYAWQDAPARRAAPDPALTRRLLLVSIAPQPVVWRQLGQGELLQAAAVQPEQACASLYVQIFERVQAWLEEKGRDTCLWQLAISGQGAELLLAGLSGLLRSASQESRRLLGQLMILEGDEDLASLRARLDENAASPFDSLVRYRAGRRETWHLLELPSNDGEAEAAPLPWRQNGVYLLSGGAGELGLLFVEEIARRATGATLVLTGRSALPDARRARLDALCEQGAQYRYEPVDVTDRQAVTQLVEYVVAEYGRLDGVLHIAGVLRDSYILKKDRAAFEQVLAPKLLGTANLDHATRTLDLDFFLMFSSSAAIFGNLGQTDYAAANGFMDAYAAYRQARGGRGRSLSVNWPLWRDGGMGMEAATEEMMLANTGMVAMRTPSGFSALARALHSDLPQVAVMEGLVERMRQKLLVPSAPSMQAVPVASASAAIAPSAQTDHHAEIVARVARGLRQMVAELLKLELDQIDIEDDLSDYGFDSITFTSFSNRINKQFGLELIPTIFFEYPDIAGLAGHLAEAHGAALGASLGLLAASAQGDRAQTRSAMSAEAVATANVSAEMPVPLAPQLSDQSAAASNTPVARRGVAVIGISGSFPGADGVDALWQLLERGGDAICEVPASRWDWRRCLPPGESEAVQARVRWGGFIDGVDRFDPLFFGISPREAELMDPQQRLLLSYAWLAVEDAGYAPQSLGGTDTGLFVGTAVGSYGSLVVQAGRSRDAYSSTSSVASIGPSRVSYFLDWHGPSEPIETACSSSLVAVHRAVQAIESGRCEAVLAGGVNTISTPEAHIAFSKAGMLSVDGRCQTFSAKANGYVRGEGVGMLFLKDLVAAERDGDTIHAVIVGSAENHGGRATSLTAPNPKAQAALLKAAYAKAGFDPRLLGYIEVHGTGTELGDPIEVNALKSAFKDLYQRAGVEPPGQPHCGLGSIKTNIGHLELAAGVAGIIKVLLQLRHRTLVRSLHGEQVNPYVQLEGSPFYLVQENLPWDAPRDAQGREQPRRAGVSSFGFGGVNAHVVLEEYVAPPARATAPAACPVLVPLSARNETRLREAAARLADFAAAHADDAALDLHDLAYTLQVGRDAMEARLGLMVSDKAELARCLRAWLDDAGAGEVFQAAPGKAQKEALALFAGDEELAGVVEGWWRNGKQAKLLDLWVKGLDLDWARLRAGAGRRRISLPGYPFANERYWLKPVSAETAALGLVASTPIETDEAAVLFFEENWHPYPIREALIASSTRTLLCCLSDATHRKALREAVFRYDPKWHLVFLDRQAPEPFDRQGWTDALCLLEQGGPVIDAVLYLRPLEEAGLRLAEAAPLGLVQALGSMKTRPARLVLGGEYADESERSQLEAWIGLERSIGLALPGCRAVTVLREAGDTIDWVAWTQLLRTALGEAAPRNLLADRDSLRHLQVQPLEPRAAAVADLGTTVLITGGTGGLGLILARHLAVGRRCNLVLVGRSPFDVVRQAAVQALQAAGSEVLYLSADVADAVAMREVVAQARARFGSIDSVIHAAGIQHAVPLADKQSEDMRRVLDPKVRGALVLDQVLAGEPLRLVCYFSSSSSVLGDFGSADYALANRFLSAHALARERRRARGERAGRSLSIEWPLWREGGMGVGDDAGTALYLKSSGQRLLEQTEGLAAFERLLASGATRALVLVGERERLHRMLGLAQVPSAPATQAMAVLMPSQTQTFSTASLEEQVSAELSVLIGDQVKLAPELLDAESNLADFGLDSFGLAELARALSARYDIEVAPSIFFAYSSIARLVGYLLDKHRAEVQAHRHRTATRVDVAAIAPQPASLAPPAAISMSTPAPPQLPVANAIEPAPTVPAFDGEREPIAIIGISGRFPKARDVDQMWRILAEGIDAVDEIPVERFDWRDYYCGLEAQPGRTNSKWAGCLDGVDEFDPLFFEISPREALAMDPRQRLLLQESWNALEDAGYGPHQLRAGPVGIFVGVEEGDYQRVVPDPGVTSNHNGILASRLAYFLDLNGPVLAINTACSSGLVALHQACASLLSGESDTAIAAGANLILTPEPYIGMSQAGMLSPDGRCRAFDRSANGMVPGEAVAVVVLKRWSRAVADGDPIRGLIRASGINHDGRSNGITAPNALAQASLVRQVQRAAGVLPEQIDYVVTHGTGTRLGDPVEIQALVEAFGQPVDGRAYCALTSSKGNFGHTFAASGLLSLIGLVKALEYDTIPPSLYCDQDSDYIAWRDSAFRVNKQARSWPRASGRARLGAVSAFGMSGTNAHVLVQEAPVAVARVAVAQTDVVLALSGKTEAALRERLFGLRDWLASAAAERCELASVSRTLLDGRHHFAHRAAVVVANRASAIAALEHLATLDTADGPDYYLGKAPRGFKGEVALRERAANWPALPSVDAAAYRERLGELARLHCQGYTVAWSALDGLQPAARVHLPGYPFARESYWPKTRISPPVATSVPASPAFPSSHSTPVVISLRPMLRTELSDQARGHARACYVARFSGEEFFLRDHRVRGQAVLPGVAYLELARAALEASSGRPVPSGLQLRHVTWVQPLMVDEPGVEAYIDLHRQDNGEWRYELASGAHDESERYLHGQGFLALVAQAEPTALDLSVLHGNCRVTEFDSAACYAAYQAVGIEYGPSFRAVQRIWVGEGQALVQLRLPVEALSDSGAYTLHPSLLDGALQASIGLAMAQATQGGEPMLSLPFALDSLVLHWPCPNETWVWIRPTPGAQSSRVRKLDLDLCDAQGRVCVALRGFSARLVAQGGTAQPALIPARVEAERVSMAAPINGLASASLPAKAKGVVPILAPAAKATGASTALPATTEGNASSAIPVVKTGAAPGAWLPVGLTMLAPLWTVRRVDDGSEPPAPVHMLLIGGDATQRAIWRQAYPQLRAIDVAPSTTIDELRGQLAATGVIDELVWIAPAQHSQDPTDEALLTAQASGTLALFRLVKALLAEDYASRRLAVTVLTRATQQVHPQDLVAPAHATVHGLAGSLAKEYPHWSVRLIDLDGQNADPPPERCRALPVGSESWAWRRGEWHKPELIALDEPTRGKVPAPYREGGIYVVIGGAGGLGEVWTRHLIEHYRAQVVWIGRRAEDAGLRARLAALAAHGSAPVYLQADAGNRLALSRARETILQRHGRIDGVVHSALVLQDRSLARMDETTLQSALRPKLDVSLRIAQVFADAALDFVLFFSSMMSFSRAAGQGNYAAGCTFKDALALALARRWPGAVKVMNWGYWGSVGVVADARYQERMSRAGIGSIEAEEAMVVLERLLGGPDAQLGLIKLSRAQAVEGVRDDLRGARYGAALPALLPQLAARPLPPEHASRLAAAQAALPPQAMQALSLRLLGQALLGFSLDGRHLLPGGAAGLALAGHYRRWYDTSLRLLDAGGWLQSLPNGDYQILATAGQQDAWPEWEAARAAWLANPQQQAWAQLLEVCLRALPELLTGQRKATDVMFPNSSLRLVEGIYRGNPIADLHNHILFDALEAYVLERLAREPGTRLRLLEIGAGTGGTSAGLLQRLDRYAANIDEYCYTDLSKAFLLHAEQHYAPGRPFLRTKRFNVEEPPQAQGIAADSYDIVVAANVLHATVNIRRTLRHAKVPLRAGGLLALNELGELSLLTHLSFGLLDGWWLYEDPALRLEGSPGLSSEGWERVLAEEGYAPLWRPAEECNRYGQQVLLAQSDGRVKRDVAVPPEMAAAPVEEVSAPASAFTSAQVADSTPAATFAPVTAPIPVPDSRDLEQAVADHVRTLLRECIGKGLDLDPRRIEADRSFSEYGVDSILAVQLVNEINQRLGIVLQTTVLFDYSHLDVLAEYLEQTHQAALRASLPEVSEVPALQAASTLAPKIQAQPSGVAFPLISPTQPIGATLPFVPPSVTGSHRRALISGPGQIQDLRLVAMEVPASLQPRQVRVAVFASSLNFSDLLCVMGLYPNMPAYPFTPGIEASGLVLEVGSAVSTLCPGDEVVCLAQGCHATEIVCHETQAWAKSPQLSFEQACALPVVALTMIDAFHKADLQPGECILIQTAAGGTGLIAMQLARHYGATILATAGSQEKLDYLRDQGAQHLINYREQDFEAEVARITGGRGVDVVINTLSGEAIDKGLRSLSPGGRYIEIAMMALKSAQAVDLSVLDSNQSFFSIDLARLIAERPEKLEQYRRELASLVEQGVLLPTMSRVFALDQLHDAYRYLQDRRNIGKVVLQVPQAVPLADQASAARAVDAVAGVHKAQPVPVNYADEPIAVIGMSGRFAHSPDLDSFWSHLAKGHDLVDPVLRWDLSPSGGRCRDGSFLDEIDRFDPLFFRMSGLEATYMDPQQRLFLEEAWHTLEDAGYAGEAVKGKLCGVYVGCTRGDYAQLCKSAPPQAFWGNSGALIPARIAYYLDLQGPAVAVDTACSSSLVAVHLACQGLRSGDTELALAGGVFVQSTPGFYLAANPAGMLSATGRCHAFDESADGFVPGEGVGAILLKRLSDAIADGDHIHGVIRGGAINQDGRSNGITAPSARSQERLERQVYDRYAVHPETLQMIEAHGTGTQLGDPIEYRALRQAFGHYTQRVGFCALGSVKTNIGHLANAAGIAGILKILLALRHRQLPPSLHFRKGNPAIDFEGSPFYVNTELRLWPAGERAPRRGAVSSFGFSGTNAHLVIEEAPAVPLVARATRRELELVVLSARTAGQLREQAARLLAHCQAQPQTSLGDLAYTLLCGREHRGYRLAAVVRDLAELCEVLSAWLEQGDDSRLQLGALDESGVREQLQQRRLGQVAIETVRAGQLEKLSSVAELFAQGYKLDYAGLFGSGYRRLALPTYPFAQGRYWVDDSLQHAVVPSTPATAPVVPTPVVPAPAVTQAEARQAPSRISTVPDQVMREASVAYLKQLVATTLRVSPTEISAHEPLERYGIDSILVVQLTDSLRQHFDSVGSTLLFEVQTIDALAERLLATEAPALARQLGMDAVAPLEATGSAIEAELPESPPPQPETNSQVQPAPAVVTVAETVGASAAAESSQPKAHGDVAVIGMSGRYPKAIDLNEFWWNLRAGRDCIDEVPAQRWDWRKHFDAQRGLHGRSYSRWGGFIDGVDQFDPRFFRIPPSEAEHIDPQERLFLQTAWLAIEDAGYTPSTLSAKRRVGVFVGAANSTYTLLPSHWSIANRVSFALDFHGPSLAVNSACSSSLTALHLALDSLAHGSSEVAVVGGVSLVLHPMHFNRLSSLGMLSSDAHSRPLGEHADGFVDGEGVGALLLKPLQRAIEDGDSIHGVIKGSMVNAAGKTRSFAVPDAAAQARLVREAQARAGVEADTIGYLEAHSNGGELGDITEMQGLAEAFAGTAERGHRCAIGSVKSNIGYCESAAGIAGLTKVLLQLRYGELVPTLHARCANPRIDFAGTPFALQQELSAWPRPANHPRRAGVSAFGVGGAYAHVIVEEYVAPVETQPEATGRALPIVLSAANAERLRVLAKRLAGFLGSEAGRRTALTDLAYTLQVGREPLAERLGFIAESVEQVREVLLAVAEDREVPLPLVRASLDRGRAGWAMFAEDEDFKRTVEQWIAREKHASLLDLWCRGYPLDWRHLYAAHRPRRIGLLPGYPFAEESYWAPESLRYAGVLEDADAFDATPFEPDQPAGEQS", ], "RyDIaUZc_b21_kQalx7J3yNO4l5f-439": [ - "RyDIaUZc_b21_kQalx7J3yNO4l5f-439", - "RyDIaUZc_b21_kQalx7J3yNO4l5f-439", + "AXA20093.1", + "AXA20093.1", set(), + "MKPNLNQDFDATPSSHAADRRPQAGAGGCVRAPGHVDTAIIGISARYPKAVDWRQFWENLRAGRDCIVEIPPERWDWRAYHDSARGTPGRSYTRWGGFLDGIDRFDPRFFRIAPSEAEHLDPQERLFLESAYLLIEDAGYTPASLSASRRVAVFVGAMNSSYSLLASQWTLANRVSHVFDFHGPSLVVNSACSSSLSAIHLAIESLATGTSEVAIAGGVNLIMHPAHYARLASVGMLSAGSHCRAFGAGADGFVDGEGVGSVLLKPLQRAIEDGDLIYGVIKGSALNAGGRTHGYTVPSPVAQGRLVAEAIERAGFAPHSIGCVEAHGTGTELGDPIEVRGLAEAFGAPVGAAPWCALGSVKSNIGHGEGVAGIAGLTKLLLQMRHGQLAPSLHADTLNPRIDFNGTPFSVQRKLAPWPRPAGHPRRAGVSSFGAGGANAHLLVEEYVAPIVPAPTDADSPALIVLSAANLDRLRAVAQRLLDFLNGEFSSGITLAELAYTLQVGREALAERLGFVADSLGQVRACLAAFLEDREAGRPLLRSSVGQGRAVGSGMLDDESFAQTLRGWIKRGKHELLLKLWGQGEPLDWSLLYRGARPRRVSLPGYPFAGERYWAPAAVRYAGVVCSRRRPAIDPDLLLCQPTWRAASLPAATGRALPHRELWLLGSQARLDDAALPALPIERFRSEQAEPVARCVDLYGQFHARLRARLRDKLSEPLLLQVAILGRDDELLLSGLSSMLRSLGQESRKLSGQLLVMEGGEDRATWQARLDENAVRAHEDWVRYRQGRRETWALQELPPATVEPALPWRARGVYLLSGGAGGLGLLFAEEITRRAEGATVILASRSAPVETRRARLAALAEQGLAIRHAVLDITDAAAVQALVDEIVASYGRLDGVLHLAGVLRDAYLVDQERDRVDQVLAPKLLGALHLDLATRMLPLDCFVLFSSAAALFGNAGQADYASANGFLDAFAVYRQTQGRSGRSLSVNWPLWAEGGMSMDKATEQLLTAGTGIRPMRAATGCRALAHALAGAFPQVLALEGEPVHMRAALLGQSSPVAVATAGDATSPKTSLSCQVRELVAALLKVEPEQIEAGQDIGDYGLDSIGFTHLANQLNLRFGSALRSTDFMELEMASVERIARLLEQKLPALSTGTTPVTRRDRGPSAVPVTPTPRDAGPAAHSDEPLRAQVREAVAALLKVEIEEVDLDLDISDYGVDSIGFTHLANRLNEQQGTRLRATDLLELEQVSVIRIARLLREDPGSRTLLDAVGADASLAVVEGR", ], "DTee9G4M8sEfnM4HaPfI37rT74pq7M_G": [ - "DTee9G4M8sEfnM4HaPfI37rT74pq7M_G", - "DTee9G4M8sEfnM4HaPfI37rT74pq7M_G", + "AXA20094.1", + "AXA20094.1", set(), + "MSVYMFPGQGSQAIGMGTDLFVSFPELTEAADRILGYSIRELCLEDPKHQLGQTRYTQPALYVVNALSYRRRLHEYGAPAYVLGHSLGEYNALEAAGVIGFEDGLRLVRKRGELMSEAPPGAMAAVIGPDEVAISALLARHGLDAIDIANLNSPSQTIISGLKEDIARAAPLFDAEQAHFVPLNTSGAFHSRYMTVARQAFVAYLGEFHFNRPRIPVISNVEAQPYVLERTAELLAAQITQPVQWTRSVHYLLALGQSEFLELGPGQVLTRLLVEIRKHTPTVAPATAGSLSSTPAYDRERELSELQQRIDDWNGRYPVGTRVQVERYPQQLVTRTPAMSLFGHRAAIYLEGYNGYFDLADVHPLHGASA", ], "mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn": [ - "mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn", - "mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn", + "AXA20095.1", + "AXA20095.1", set(), + "MNAIPKDYAVPSISIERLGSVAFKQDHGLRYPYVAGSMVKGIASTAMVINMGRAGFLGYFGTGALDAVSIERAILEIQAALGDRQPYGMNLLSNASTPQAEMDTVDLFLKHGVRRVEASAYMQITVPLVKYRASGLRRDAQGAVVARNMILAKLSRPEVAALFLSPPPDKLLAELVAGGAISTAEADLARLLPMADDICVEADSGGHTDMGVLSALLPSIVRLRDELVAHHGYARTVRVGGAGGIGTPEAAATAFILGADFILTGSINQCTVEAGTSEAAKDLLQQVNVQDMDYCPSGSLFELGAKTQVLKKGVFFPARANKLYELWKNHSSWEEIDAKTREQIQNKYFMRSFESVYEETRAYFLRAEPGEIEKAEKTPKHKLALVFRWYFVHTMRLAMSGSSQQKVDYQIHCGPAMGAFNQWVKGTELESWRNRRVAEIAHRLLEETVQLLNRSFLAMSS", ], "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh": [ - "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", - "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", + "AXA20096.1", + "AXA20096.1", set(), + "MLELTKRLADALVSISLFAACRESGLGALLKDRAGLPTRAAQLTWLAPQCGIDEARLEAALQALRDAGWIEALEDGRLIPRATFERVEPWSEAVAVGLDRDWGALLREQDGRRLRHWLEQGAAARESLAGCQAEAEALDAAAMAPLLFELARLDDAAWLQGRDVTSLAPANAALLRADFLRRGWSLDEAAGLIPNVQGLAMLRDAAALGPLLFLARRPEAGARTLASTVALYRANLRWRNALDQAIAVADSSAHEAWPTGACVMAAAAIGCLPERPSPSRPAVKPGPARFALHQWTARPYRVRHPSLDDLAILRELDLASWPVGMAVPENELRRRIEQFPQGQLLIEQDGEVIASLYAQRIDTLDQLRHTPYARFAWIHRPRGALAHLMGICVAPDWQGHGLADQLIDFCLVYLASFEGIDSVAAVTRCHEYGRFGDKVTLDDYIRQRDEEGRYREPMLQFHASHGAIIHEVVPGFRPEDQANHGTGVLVEYAHYRQAPELAGPVVAVDSVGPGSAALDVAEAVRASILDVLGELHAAAYGPQVPLMEMGFSSFHLQELQRSLGERVGLKLDATFFFQHGTPAGIVEHLRERLAPIGTEQADIRSTVDTETTPSSATTDGIDVPERIAVIGVACRFPGGVGNPEQFWTLLENGVDAIGEREPGVSPTAASTRRGGFISAVDRFDAGFFRISPREAELVDPQHRLLLEVVWEALEQASIAPGRLAGSDTGVFVGVMGHDYERLLRQQGGAPPIDPYFATGNANSIAAGRIAYYYDWHGPTLAVDTACSSSLVATHLACESLLAGECSLALAGGVNLLLHEDMFAAFEQAGMLSPEARCKTFDASADGYVRGEGCAMLVLKRFSEAQRDGDPVWGVIRGSAINQDGASAGLTAPNQGAQQAVIEAALRKGGVVPHALRYLEAHGTGTRLGDPIEVLAACAALSAGRPIGQPLLLGSVKTNIGHLEAAAGMAGLIKVMLSMRHGLIPRHLHLQQPNPHLDWAALPVEVVSEARPWPVGPKLAGVSSFGFSGTNAHVVLEEYPANPANTVPMAARSSALLLLSAKREEVLQTQVRQLHEAIGALDEADLPDVAYTLQVGRDAMEYRLALAVGSLAELRQALARFLAGEAGIRQLWQGRAGQQGYLLGSFVLDEAFTASIATSLAAWFARGELGKLAELWVQGLDVDWRRLYGANPPRRISLPTYPFMKERHWLPQAVAQATAEASGAPLLHPLVHRNTSNLAEQRYSSRLDQQAFYLRDHVVQGRHVLPGVAQLEWARAALALALGDTSASLRLEQVSWVQALTVEQALEVHIGIEADEGGWLTYEIYRGSDDEVELYSLGRARLDAERKVPNLDLATLQARCTRRIDGPACYARFTRMGLGYGPAFQVLTELHVGADLAIGRLQVPVGIELGDYRWSPSLLDGALQASYGLVDETAGLQLPFAVESVEQSHALPESALVVVQRAADDSGVLRKLDISIVEESGRVALRLTGFSTRAVQAAAPADSLLMVPRWQARSAVEAPPEPGYRTHRVVLCEFEALRGGFDAALPAASVVHWQAPGSLAERYARYAGQLLCELQALAADHPADPVLLQLVVPAQGEAAVLQGLVGLLCTAQQEYPWLHSQVIALPADAPVADCLAREAAAPVPRVRYQGMQASTREVMDFVEVPPTETAMPWREDGVYWITGGLGGLGLLFAAHIARQVQTPVLVLSGRREPDTAGQAQLDALRALGANVEYHALDIADAAAVAALARNIIARHGCLNGVIHGAGVLRDGLLHSKTVDELQEVLAPKVTGMMALDHATADLELDWLLLCSSMTTVIGNTGQGDYGAANAYLDAYAVHHEQLVAQGLRRGRAISVSWPLWAEGGMRIDAEGQAYLRRSTGMQSLPSEIGLAALEQLLATPRAHSLLLYGDRTRLLARVQALYRAPEPVVVRSVLALAPAAGPVDSQEALRKTARRYLTRLLSRSLKLPPQRIDVQTPLEQYGINSILVVSLTRDLEASFGRLPATLFFEYQSIAALTEYFLAHHADSLTMLGAAPAATQLMAVTSSAPAREASIDAPALRRRRHRRSLPGSMVAGPPVSTVGAPLDIAIVGLSGRYPQARSVADYWANLLKGIDCVTEIPAERWDWRQHFDARKGQDGKSYSKWGGFIDGMDAFDPMFFGIAPREAQLMDPQERLFLQCAYHAIEDAGYTRAGLAASATEGERRGQVGVFVGVMYEEYQLYASQAQARGQGLSLFGSASSIANRVSYHCNFHGPSLAVDTMCSSSLTAIHLACQSLRQGGCSVAIAGGVNVSVHPNKYLMLSDRQFMASNGRCTSFGEGGDGYVPAEGVGAVVLKRLEKAIADGDHIHGVIKGSALNHGGKTNGYTVPNPVAQGQVISQALAESGVPARAISYVEAHGTGTLLGDPIEIAGLSQAYGASTQDKAYCAIGSAKSNIGHAESAAGMAGLTKVLLQMQHGQLVKSLHSDTLNPHIDFSQTPFVVQRELGPWTRPVLEVDGGGEHEYPRIAGLSSFGAGGANAHLIVSEAPASSRHEAVPRQGPVLVVLSARNENILRCQAEQLLTHVQSHASDLANLAYTLQVGREAMEHRLAIVAASTEQLSARLNAYLQDDTLDEAVYRGEPRRSQEAMAVFGGDEELQEVVAKWIARGKLEKLAELWVQGLLIHWEQLYGQAMPRRISLPTYPFAQERYWIDAGAATMRVAGAQILHPLVHVNSSDLQGQRYTTVLDAGTGLLRGHRLHSRPTLPALAQLEWARAALAHALGGTAGLCLEEMRWLVPLHVDAPTTLHIALDWEDETHVGYEIYREDDEGREVYAEGRAELVDALPAPRLDLPALQAQCTQHLDGDEAYSRLATAGWSCADSFQALSSLQSGEGLAIAHWRQKVDASWQDYALAPNLLDVALQACRLAWPQQDWSWPTAARQLRMVGSLPVQGMVVVRQHPGQLDVDIDFADGEGYVLASLQGLAPQQPSTQASPVAQTLLLAPCWTPQAGPPAACERPSYAAHWVVLCELDAPASLEAELVPAHCLRWQAEGNPAERYGVYAGQALAWLQKIVAGGPSGQVLLQLVLPARGEAALMQGLGGMLRSARLEYPWLLIQVIAVDSAQELAVRLNTEAIAPVPALRDGAAGREVLDFIPLAPPEGVRAMPLAWRDEGVYWITGGLGGLGRLFATAIAAEVRCPVLVLSARRPPDTAQAAFIERLREQGARVEFRAMDTGDAAAVEAVARAIVAEHDGLNGIIHSAGVLRDGLLANKREADLRQVLNAKVGGLFALDMATRDIGLDWFLLCSSVSSVLGNAGQTDYAAANGFMDAYSAYRQELVEQGRRRGRCVSLSWPLWAEGGMHIDAVAQEQMRRATGMQALPRAAGLAALHQALAATVSHVLVLHGEPQRLRDYVSTAYRVPALAEPVAKMSPGRGYRRELKGLDLADCVNWDLVEHTSALLQMPRDAVDTQANLIDYGYDSVSLTAFAARLGEHYGIALTPSLFFSHPTLEQFSVYLLDSHGDALAAFYRAASEVERAEAGPAQLGAPTAATSASIRRRRHAQLAGAVANIVEPIAIIGISGRFPGARSVDELWTILRDGREVLQSAGTERFAAWPPPQRPACDRIGLLPGVAEFDPLFFEISPREAEAMDPRQRLLLQEAWRALEDAGYGVTQLQLHTVGMFVGVEQGDYQLIGKTEADVTSNHDGVLATRLAYALNLHGPAMAINTACSSGLVAAHQACLSLRVGECDTAIAAGVNLLLTPAIVRSMEQAGMLSPEGRCHAFDRRANGMVPGEAVVALVLKRLSRAEADGDPIHAVIVGSGINYDGKTNGITAPSGAAQTRLLQSVYARHHIDPADIDYIVTHGTGTPLGDPVEINALADAFTPHERAPQSCALTSSKTNLGHTQAASGLVSLVGLVQAIRHETIPASLHCEQLSDHIAWQKSPFYVNTAARPWPAPTTRARLGAVSAFGISGTNAHMVLRGHAAPADAGRHAAVRPLLLAVSAKTAEALRQRVQDLIERLQAREHDAAELASISHTLLVGRQHFAHRCAVVVQDREDAVYALQQALSRETRANLFRGVVSREFAAQKALLDYGQELIGRIAGVQQTTKQVQQQAGAQGEASREALSALADMYCQGYALAWHELFGQTPPNRVHLPTYPFARETYWVKPTKHAEAGEAVQLHPLVHRNTSDLDEQRYSSRLVVDAFFLRDHVVRGCSVLPGVAQLEWARAAVALALGGEPSIRLGQVDWLQPLVVEQAAECHIALAPLDDGRLAFEIYGDNGQVHSQGWAEAVSPGQVPRIDLAGLRARCTYRLTGEQCYARFVRMGLNYGPSFQSLAGLRRGEGIAIGELRWPADVDQEAAFVLPPSLLDGALQSCIGLYAESTGLILPFAVETVEQWGAVPATAYAVVQPGADDNEAVRKLDIRIVDEQGQVAVCLSGLSLRSVAPASTAVGTLMLAPRWRVQPALATNMVPANVAHCVIFCEVAPVDLRETLPTASSMHWTAEGSLDERYTRYAEKLCIELQTLEASRSDRLWLQLVVPAQGEHAVLQGLDGLLRTAGQEYPWLVAQTIAVKDTSNLAARLAAEASSPAPRLRYGEAGREILDYAEVFEPRQGVRPWRDRGVYWITGGLGGLGRLFAAHIARQAQVPVLVLSGRREPDAAGQAQLDALRALGAHVEYHALDITDVAVVAALAQNIVGRHGCLNGVIHGAGVLRDGLLRGKTVDALQQVLAPKVAGMMALDQATATIELDWLLLCASAAGVLGNVGQGDYAAANAYLDAYAVYRDELVAQGHRHGRAISVSWPLWAEGGMQVDAAMQAHLQRSTGMQALPSEAGLAALDQALSESGAGQVLVLHGERARLLGHVQAVHTALALEAQEETATLVAQEAGMDFREAAQRFLTRLFSQSLRLPPQRIDAKVPLEQYGINSILVISLTRDLEASFGRLPATLFFEYQSIAALTGYFLEHHGAALHALLGWSKAEAGAVSTLPTPSPSPAPVNAAPSLPARRFSGRLLDRFSRRNALTSGAPPDTPPVPLDIAIVGLSGRYPQARSVADYWANLLQGIDCVTEIPDERWDWRGQYDPAKGKLGKIYSKWGGFIDEVDAFDPLFFNISPREAELIDPQERLFLQCAYHAIEDAGYTRAGLAASATSGERRGQVGVFVGVMYEEYQLYGAQAQAQGQALSLFGSASSIANRVSYHCNFHGPSLAVDTMCSSSLTAIHLACQSLRQAGCAVAIAGGVNVSVHPNKYLLLSDRQFMASNGRCTSFGEGGDGYVPAEGVGAVVLKPLEKALADGDHIHGVIKGSALNHGGKTNGYTVPNPGAQGQVISQALTEAGVPARAISYVEAHGTGTSLGDPIEIAGLSQAYGASTQDKAYCAIGSAKSNIGHAESAAGMAGLTKVLLQMQHGQLVKSLHSDTLNPHIDFSQTPFVVQRELGPWTRPVLKFDGGGEREYPRIAGLSSFGAGGANAHLIVSEAPASSQHEAVPRKGPVLVVLSARNENILRHQAEQLLAHVQSYAPNLENLAYTLQVGREAMEHRLAIVAASIEQLSARLNACLQEDTLAEAVYRGEPRRSQEAMAVFGDDEELQEAVAKWITRGKLEKLAELWVQGLSIHWDRLYGQALPRRISLPTYPFARDRYWVPKNLPSIDAASTQAAVLHPLVHRNTSHLGGLRFSTRLDPQSWLLREHQVQDHGLLPGAAQLEWARAALSLALEGAKVRLRQVTWLRPLLAEGEAELHIVLRVEDDGRISFRIYREQDGDTLVYSHGWAEALGEQTPAPTLDLAGLLEGCTRHWSREEGYARLEAMGLHYGKNFQVLMSWQIGDAAVVAELRAPDVERLAGYGLPPNLLDGALQASLGLAGEQVGLSLPFAVELVEQWGPVPSPAYVVVHRAVGDSAVVPKLDIDIVDAQGQVAVRLGQFSRRSVEALADAESSALVAAADAVREWTLAPAWDIADLDAHQNANQDGFQKQGTLVLGEADWIGSSGLRNLDWEPEASRECLAERLGEQGELQQLIWAVPSAEPHAALMGLRLVQALLALDYGTRSLQLVVVTRQAQAVWPQETADPRQASVHGLVGSLAKEYPHWRISLIDLPAQITQDGHQWLAQAAQAADSRGDARAWRDCRWYRQQLVPCRLPAVQASAYRQGGVYVILGGAGGVGVAFSEYLVRHYQAQVVWLGRRAEDAVIAEQRARLGGFGPAPWYLRADATDRSALERAYARIRQRHGTIHGLVHAAIVLADRSLAGMPEAVFAAALDAKAATTENFDAVFGTEPLDFQLFFSSLQSYTKSAGQSNYAAGCCHADAYAHGLRQRRPYPVKLMNWGYWGSIGIVSAEGYRERMAQAGVASIEPPEAMIALERLVAAPLHQVAFVKTTTAQMPPLLAFDPQARIELAKASPALSLPAPVALPKPDAAYAEQAAMEARLARLLWAKLTAWGWHGAQAPGLVPAYALWHQASLRLLGEQAQPVVTAADEAILSAQWQAYVHALRDDKALGAHVRLADVALQALPAILRGERAATEVLFPQASLNLVEGVYRNNPVADYFNAVLGERLQAHVQARLAQDPRARLRILEIGAGTGGTSEGLFRCLAPHSERIAEYAYTDLSAAFLRHAEQQYASLAPYLHTQRLDIERSPLAQGFEAGSYDLVVATNVLHATRDIRRTLRHAKSLLRAEGRLLLNEIEGTSLFAHLTFGLLDGWWLARDPALRIPGTPALSWNSWREVLMGEGFRPVLAPAFEAHRFGQQIVEAGSDGVIRVQAEVANAPVVASEVAVAAPSASRPATMPTRRSHAAVNAAPVAAVATATGGARGRQARVRQAIRESVLEALKMNAAQLQDDQAFMTYGVDSITGVALVNTINTRLGLRLPTTTLFDYSTIEQLSTHISMQYAVQLSDAELEPVAAVVSAPVMTEMAASPLSESALDTVPMPSLAAEVFRAAAPVPVWRPSPAEPVQVQPRLLPIVAPGPSGSGPTYHRVWLDRPGSIDEVRIVADSLSPLQPHEVRIAVHAFSLNFGDLLCIKGMYPTQPAYPFTPGFEASGEVVAIGVGVSSVAVGDAVMAIAGAELGAHATVLTCMEQQVYAMPRGLSFEAACAMPVVAVTMIECFTKARLKAGESILIQTATGGTGLVAVQLAQHAGARIYATAGSAAKLNYLAGLGIEHRINYLEQDFEAELMRLTGGRGVDVVINTLGGDAVQKGLNCLAPEGRYIEIAMTALKSAHAIDLSGLANNQTLHSIDLRKLGRTNPAALERGVREMTRLLEAGVISPVLSRIFDFEQVQDAYRWLEDRRNIGKVVVSVPLTYRYQAPDSGERIAIEPIAVIGMSGRFARAGDLQELWQALAGGEDLIEEASRWPLDALGPDQEPYCHHGGFLRDIDAFDPMFFNISGHEAAVMDPQQRIFLEEAWRALEDAGYAGASVEGRRCGVYVGCAAGDYQRLLERDAPAQAFWGNAGSLIPARIAYHLDLQGPAVAIDTACSSSLVALHQACQALRHGEAELALAGGVFVQSTEHFYLQANRAGMLSPRGRCHTFDARADGFVPGEGAGVVVLKRLSQALADGDLIHGVIKGSGINQDGATNGITAPSARSQTRLEREVYQRHGIDPQQIQVVEAHGTGTVLGDPIEYRALTEALLDGKPTGELGTRCAIGSIKSNLGHTAAAAGIAGVIKLLLALRHRRIPPSLHFEQGNTHIDFSRSPLYVPTTLEDWPASVGGQRLAAVSSFGFSGTNAHAVIGEAPATNRVLPRRPTYLVVLSARTQPQLRLQLERILAHLDGEVPPLASLSHTLLLGRRHFDCRWACLASDLKGLRAQLAEALQKETVGGRIGGADHVPPGEEQLDPLQRRMSDYTDMISHEAARDLLEALREAYLQGLPLDWSFLFQGDGWQRVGLPGYPFARERYWVPVRKTVANTEAALASEPLGQAGMSQGDAGETRTLTWVPQWRSRALVSDAQTAAADRHVVLCELDADPALVEGAVTVRQWTHEGSLDQRYAHYAEYLLTEIQTLAKHRSRRSVLLQLVVPARSEGAVLQALGGLLRTAEKEYSWLRTQLIAVDDVAHLSERLNEEATADPTSRVRYLGSSREVLSYVPAVPACGSAPVWREGGVYWITGGLGGLGLVFAEGIARQVRCPTLVLSGRRAPAPTQQARLERLGELGATVECHALDVSEAVAVAALAQSIVARHGRLSGVIHAAGVLRDGLLHNKRREDLQAVLAPKVAGLLTLERATAGLSLDWLLLCSSVAGVLGNLGQGDYAAANAFLDAYALYRQAPYDRVTRRTRLYSISWPLWEEGGMRIDTDTQAALWREAGVKAMPSEAGLQALNCVLTQDFAHALVMHGDARRLSQVVEGAMPEAVDEEKAATLQSNPADLKAKLEAELAGMIASHLQLPIEALSRDARISEFGFDSISLKAFLKLLNRRHGLALSPAVFFEEPSIRALAAYLLREHGEAFVAIESSAQATLEAPPEPPPVAATSSSTEREQQGGAKPTQREAIAVIGLDGYFPASTDLQEYWDNLWTGRDCITEVPARRWSLDEFYTEDVEVAIRQGRSYSKWGGFIRDIEALDPGFLSGVPAKARQQLNEEQKLFVGIVDRLLVTSGYTEQRLEALRCRRVGVYLGMTAERSAPTDATTSGRNDSPGTLAGMVSRMFRFNGPSVAVDAHSASSMTAVHMACNNLLHSECDAAIAGGVSLLYPDTYRDGCQISLLASHPESRSFSEDKDGVLLADGVGAVLLKRLSTAVEDGDRILAVIRSTVAQSVSSGLSDLPKPELVAASIRENFARATVDPRTISYVEAASAGFPIGDVIEMSATALAFRAYTDQRQFCALGSVKANIGHATAASGISQLAKVVLQLQNGRLAPSIKVGPEQVQAQLRKSPFYVQQQAEDWQRPRLSIDGNEEGREYPRRAMINSMGHGGFYAGAILEEYCGPVLEDQ", ], "9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6": [ - "9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6", - "9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6", + "AXA20097.1", + "AXA20097.1", set(), + "MSAEANKAIVTAMYEALNNRDAKGHFGHMADDVQVTYFGNHRFSRTFHGKQDLFENFTKHFMEYLEGPLDFRVGNIIATDDYAVIEGQGIGRTKDGQDYNNVYCIVMRLVEGKVTEIREYMDTDLAKRIFG", ], "MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm": [ - "MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm", - "MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm", + "AXA20098.1", + "AXA20098.1", set(), + "MKDLTQGAVTRHIVSMAVPIGVGAVFQSLYYLIDLYFVGCLGSDALAGVSAAGNLSLLVMALTQVLGAGTLALMAQAAGRKNEDQARGIFNQALVLSICSGLVLLLLGYALTGVYLRSTSATLVVAEQGQRYLYWFLPGMAFQFVLTAMASALRGIGVVKPTMMVQLMTVVLNIVLAPVLIVGWGTGYAMGVAGAGLASSIALAVAAPAMAWHFHQLGHYVQVRRELLRPRWADWRRLLTIGLPAGGEYTLLFLYSAATYWAIRDFGPAAQAGFGAAARLMQVLLLPALALSFATGPMVGQNLGAGMAERVRRTFGAAIWLISSIMLLACLLLLWEGETLLGRFATGGEVIEQGTHYLHIACWSFIAQGVIFTCSSVFQGLGNTRPALVSSVVRLLCFVLPVAWLSARGHFPIIAVWYLSLASIFLQALLSLWLVRQEFRLKLAPVAGLPSMQAIRR", ], "Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn": [ - "Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn", - "Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn", + "AXA20099.1", + "AXA20099.1", set(), + "MSSGYQPDAVQARGNPARGGTVFMFGGQGTQYFQMGRELYRSHPVFRERMDGCDALIRAELGYSLTEVLYEAGHTSSTPFDEILYTHPALFSIGHSLGEAIRADGIEPKAVLGYSLGEYIGLVAAGCLGWEGGLRLVIRQAQVLAQHGAAGGMLSVFAELEQFWQRPDLYTGSQLAVVNFSGSFCVAGAPEAMEAIKRRLDEEQRISTLLPIRFAFHSSGIDALEQRIRGLTADLRIKPGRVPAYSCMLRRAIGTAEFADPTAYCWRVVRDSVHFEQTARRLAAEIPVPFMVDLSATGTLATFIKYARIDGVSYSHCINQFGRDLNEYKSLMQRFAV", ], "nk3UJUyLaWr8LochtohJ9L6eugdChZL9": [ - "nk3UJUyLaWr8LochtohJ9L6eugdChZL9", - "nk3UJUyLaWr8LochtohJ9L6eugdChZL9", + "AXA20100.1", + "AXA20100.1", set(), + "MSSVATVPTTPSYTMRPLLHAVLFLRFRHARVPLIGQRIRLVRRGVMARLMQLPVTVVENRSKTVATTGGSLAENVVYRRFKTLTGNCFWGASHRLAGAQGHRSRRRNQPHGGPRSSIIRSYRLKLCP", ], "du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL": [ - "du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL", - "du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL", + "AXA20101.1", + "AXA20101.1", set(), + "MSAIPVHASPFLFELLTNALPDVYQSYTELRRLGGVVRDSSGVILLARHDDIVKTFADRRFTSVGRQSLASMSPALQEKMKGSRLFELLNFRDGDSHRQARRVLAGLFNPDTMEWIRVEIAAEAGVLMERWQTGEADDFVAAVTAQLPMRALAMLIGIEEVGLQDFFLRTRNFGAWLSASVFSEEGLSLVADEFVWARGWLRAQLVGKPLFDAVSDDTRENLLGDLLLVLVTGYDSSVALLGNGLATLVSVPALRDRLARDAGLSLRAAEELIRYDSPAQVAFRLALEDVTLGEHRIAKGEFVALVNGSGNRDETIYEHADQLNFDRPKQRALAFGSGPHACAGAALAKAQLTGFLDGVRPWLPHLTLDDATAPPSQHGLLRYRTHLMLRHTVV", ], "5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP": [ - "5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP", - "5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP", + "AXA20102.1", + "AXA20102.1", set(), + "MAYRSTIYLDWGTFFLIETPAASIQPLLPASVRPLLAASGAAILTLNVVHFLAGGEQVDLPANHEIDIGVLVELDNSEFEADLPQASVAIHVLKVASTARGYVDLCQNTGYRVIDPVALAFEINPATLTASVCDADGPILGCRQLDLDLEYDEFRRIGQDVMYDAERGIHRANYIFSGKGLSRPMTNAMELRVHAHPFFADLGIEPGVLVCLDQFALKPSSRSSLAFYRPNQVEMDETVQAEKD", ], "iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L": [ - "iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L", - "iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L", + "AXA20103.1", + "AXA20103.1", set(), + "MSTRMFITGGACGLGRALAQRYARDGASVCIGDVDDAAGRQVVEALLAEGATAHYLHCDVTREPQLHEAANWLRTQWGGVDIVINNAGVAQIGGITESSLEDWQWAVDINLLGVVRGCKAFIPLLQAQGGGKLLNVASIAGLLYMPKSGGYNATKAAVIALSETLQLELHDSGIQVSVACPAYFRTDLARNMRASNAQLRQRTHNLVEQARLGSQEVAELIHAGLARGDTHILTHPATRIAWRLKRWLPYRWYLGIARKQIAKAGTAAEDAPA", ], "WbViYzQw8y-XfCQMgQXkedGduNMJPa14": [ - "WbViYzQw8y-XfCQMgQXkedGduNMJPa14", - "WbViYzQw8y-XfCQMgQXkedGduNMJPa14", + "AXA20104.1", + "AXA20104.1", set(), + "MSFTLQGIPVSRGISIGRAYLIAPAALDVAHYLVEAQQLEAEIERLRAALLAVRGELDVLRSDLTEDTPSEVAAFIDVHSMILNDALLVQETIDLIRVRRYNAEWALTKQLDVISGHFDDIEDEYLRERKADIEQVVERVLKALAGEPSASQALGGATGGKDEMIVVAHDIAPTDMIQFKNQAFHAFVTNLGGHTSHTAIVARSLGIPALVGVQHASALIRQNDLIIVDGERGIVIVDPAPIVLEEYSYRQSEMALEQHKLQRLKFSPTQTLCGTQIDLMANIELPDDTKTAVEAGAVGVGLFRTEFLFMNDANALAEEEEQFRAYRRTVEMMNGKSVTIRTIDVGADKPREEGYETAPNPALGLRAIRWSLSAPQMFLTQVRAILRASSVGPVKILVPMLAHAQEIDQTLNLIREAKRQLDDAGLAYDPNVRVGAMIEIPAAAIALPLFLKRVDFLSIGTNDLIQYTLAIDRADNTVAHLYDPLHPAVLHLISGTLRAAKRAGVPVSVCGEMAGDPAMTRLLLGMGLTEFSMHPSQLLVVKQEILRAHLKAIEKSTADVLASYEPEDVQLALKQLAAAKPKADLAA", ], "4_8182J88axMDpFJBZI6kLNJAu8Ittm3": [ - "4_8182J88axMDpFJBZI6kLNJAu8Ittm3", - "4_8182J88axMDpFJBZI6kLNJAu8Ittm3", + "AXA20105.1", + "AXA20105.1", set(), + "MIVPMKNSDLSLPSKLDAAILRGRDALAQRQNADGSWRFELESDATITAEYILMMHFIGKIDDVRQARMARYLREIQRLAAHGGWDLYVDGAPDVSASVKAYFALKAAGDSEDAPHMARARETILHLGGAARCNVFTRILLVTFGQVPWRATPFMPIEFVLFPKWVPISIYKVAYWARTTMVPLLVLCSLKARAKNPRGISIRELFVTAPEEERNYFARGGFIRNLFLYIDRTLCSLDALIPKALRRRAIRHAESWCAERMNGEDGMGGIFPPIVYSYQMMEVLGYAEDHPLRRACEDALEKLLVERPDGSMYCQPCLSPVWDTAWATMALEQARAVPDTREQPTVSAAQLQVGITRACDWLAGKQVTELKGDWIENAPTETPAGGWAFQYENPYYPDVDDSAVVAAMLHQRGCAMARLTGTDPYTEVVSRGLDWIRGLQSRNGGFGAFDADCDRLYLNLIPFADHGALLDPPTEDVSGRVLLCFGVTGRAEDRSALARAIEYVKRTQREDGSWWGRWGTNYIYGTWSVLAGLALAGEHCSQPYIARAIDWLCARQNADGGWGETNDSYVDPSLAGTNGGESASNFTAWALLAQMAFGEWQSESVQRGIRYLLSVQQADGFWWHRSHNAPGFPRIYYLKYHGYTAYFPLWALARYRCLSQAHVATSASPAAEARRGVVL", ], "IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte": [ - "IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte", - "IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte", + "AXA20106.1", + "AXA20106.1", set(), + "MNFDDYCQKKAAPPGSSIYYALRQAPLTSQGALIALFALRRELEEATRETSEPAIGQTKLAWWRKELAALAEGQPSHPVSQALALHASAIASDHAVLQALADGFAMDLEQTRYLDWPNLRRYVERVGGGFAGLVARATTGPAIPADIAPAWAASLGEGLQLAQIVEDVGDDLRHGRVYLPFDELQRYEVTSVDLMHRRYSPAFRKLMRFQVTRARETLHAALEAIPAAELPAQRSLRALAALALATLDEIEGEDYQVLHQRILLTPIRKLWVAWRAAQRRY", ], "ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x": [ - "ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x", - "ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x", + "AXA20107.1", + "AXA20107.1", set(), + "MASMGIISGNTNGLRVSEVRHTVDDADANGDLGRLRGDVTRPETVAGEGLEPIHRILGKRSPVVATFLLPFSTTVTGNCINRAIMPRRTGHIRWPMNDTLAWRNRRNSTACSNGRMAWLGVVGTITANDIDLFFAWNLVEQLGQSITVSHILIRH", ], } assert {k: v.sequence for k, v in sg_object.proteins.items()} == PROTEIN_DICT diff --git a/tests/python/classes/test_mibig_gbk_parser.py b/tests/python/classes/test_mibig_gbk_parser.py index 15cac39b..5c1b7c9e 100644 --- a/tests/python/classes/test_mibig_gbk_parser.py +++ b/tests/python/classes/test_mibig_gbk_parser.py @@ -77,8 +77,8 @@ def test_loci_parent(): sg_object = SocialGene() gbk_path = os.path.join(FIXTURE_DIR, "lagriamide_mibig_bgc0001946.gbk") sg_object.parse(gbk_path) - assert isinstance(sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent_object, molbio.Assembly) - assert sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent_object.uid == "lagriamide_mibig_bgc0001946" + assert isinstance(sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent, molbio.Assembly) + assert sg_object.assemblies["lagriamide_mibig_bgc0001946"].loci["BGC0001946.1"].parent.uid == "lagriamide_mibig_bgc0001946" def test_loci_features_sorted_by_midpoint(): @@ -117,8 +117,11 @@ def test_parse_features(): ) # fmt: off temp = {f"{i.uid}_{i.start}": i.all_attributes() for i in sg_object.assemblies['lagriamide_mibig_bgc0001946'].loci['BGC0001946.1'].features} - assert str(type(temp['5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP_92083']['parent_object'])) == "" + assert str(type(temp['5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP_92083']['parent'])) == "" for k in temp.keys(): - del temp[k]['parent_object'] + assert isinstance(temp[k]['parent'], molbio.Locus) + del temp[k]['parent'] + assert isinstance(temp[k]['protein'], molbio.Protein) + del temp[k]['protein'] assert temp == {'-l7xLyFZbiZENPLq_GML8JyTRF1Srawr_2205': {'description': 'transposase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '-l7xLyFZbiZENPLq_GML8JyTRF1Srawr', 'external_id': 'AXA20088.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm_87147': {'description': 'MATE family efflux transporter LgaH', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'MSHRSCZfdBJP8vdJdaXfeZrThH_4EUMm', 'external_id': 'AXA20098.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, '5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP_92083': {'description': 'LgaM', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '5IYMhENey2WCMrKPUz3AqBIZuFSv6DPP', 'external_id': 'AXA20102.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn_58398': {'description': 'enoylreductase LgaF', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'mB22-i4RqtslyO7_HappM4rJ4Z2Qbkfn', 'external_id': 'AXA20095.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl_1190': {'description': 'sigma-70 RpoE', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'Tdc2m3PRLsyEzjwyux6BF4arDy2mQ_Bl', 'external_id': 'AXA20086.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn_1915': {'description': 'competence protein ComEC', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'ptq1NGhBcUp3TIEqvAUxnnp4LOKwINvn', 'external_id': 'AXA20087.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte_98026': {'description': 'all-trans-phytoene synthase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'IRqRpDzrGB9UhHJD6AzDq_6Xupj00Nte', 'external_id': 'AXA20106.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'WbViYzQw8y-XfCQMgQXkedGduNMJPa14_93904': {'description': 'phosphoenolpyruvate-protein phosphotransferase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'WbViYzQw8y-XfCQMgQXkedGduNMJPa14', 'external_id': 'AXA20104.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, '9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6_86717': {'description': 'nuclear transport factor 2 (NTF2)-like protein LgaL', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '9stFB1fGjCdZVZWHLVI3OD4A_DV3WcV6', 'external_id': 'AXA20097.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn_88580': {'description': 'acylhydrolase LgaI', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'Qi23auOUTcBzWTmDHuGinrzIuqH7-zVn', 'external_id': 'AXA20099.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ_13790': {'description': 'hybrid trans-AT PKS/NRPS LgaB', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'IsCrCflKZgA6ghoHxXclbsOix0bbDkwZ', 'external_id': 'AXA20091.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0_2828': {'description': 'hypothetical protein', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'T_DzOorDp3ROhRRBtuXP3xyAPorpTVD0', 'external_id': 'AXA20089.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L_92859': {'description': 'ketoreductase LgaK', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'iiWqYfcbDGjauCrUsdiI1pAlG5Syx_-L', 'external_id': 'AXA20103.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'AStsOnOU5ZWxURs9PrTiWjddkuQXfanl_3132': {'description': 'hybrid trans-AT PKS/NRPS LgaA', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'AStsOnOU5ZWxURs9PrTiWjddkuQXfanl', 'external_id': 'AXA20090.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'DTee9G4M8sEfnM4HaPfI37rT74pq7M_G_57289': {'description': 'acyltransferase LgaE', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'DTee9G4M8sEfnM4HaPfI37rT74pq7M_G', 'external_id': 'AXA20094.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL_90836': {'description': 'cytochrome P450 LgaJ', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'du1Ncfm5UYiFYgDWD8KW1AQJNHlAcVXL', 'external_id': 'AXA20101.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'nk3UJUyLaWr8LochtohJ9L6eugdChZL9_90318': {'description': 'transposase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'nk3UJUyLaWr8LochtohJ9L6eugdChZL9', 'external_id': 'AXA20100.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh_59909': {'description': 'trans-AT PKS LgaG', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh', 'external_id': 'AXA20096.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5_33004': {'description': 'trans-AT PKS LgaC', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5', 'external_id': 'AXA20092.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, '4_8182J88axMDpFJBZI6kLNJAu8Ittm3_95865': {'description': 'squalene--hopene cyclase', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': '4_8182J88axMDpFJBZI6kLNJAu8Ittm3', 'external_id': 'AXA20105.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'RyDIaUZc_b21_kQalx7J3yNO4l5f-439_53400': {'description': 'trans-AT PKS LgaD', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'RyDIaUZc_b21_kQalx7J3yNO4l5f-439', 'external_id': 'AXA20093.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}, 'ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x_98927': {'description': 'hypothetical protein', 'frameshifted': None, 'goterms': None, 'incomplete': None, 'internal_stop': None, 'locus_tag': None, 'missing_C_terminus': None, 'missing_N_terminus': None, 'missing_start': None, 'missing_stop': None, 'note': None, 'partial_in_the_middle_of_a_contig': None, 'partial_on_complete_genome': None, 'uid': 'ewjVum5PbpEJA4rl-BfnCAypKl5HXb7x', 'external_id': 'AXA20107.1', 'too_short_partial_abutting_assembly_gap': None, 'type': 'CDS'}} diff --git a/tests/python/classes/test_socialgene.py b/tests/python/classes/test_socialgene.py index 4d7fb32d..70d72fd3 100644 --- a/tests/python/classes/test_socialgene.py +++ b/tests/python/classes/test_socialgene.py @@ -13,7 +13,7 @@ ) -temp.add_assembly("myassembly") +temp.add_assembly(uid="myassembly", parent=temp) temp.assemblies["myassembly"].add_locus( external_id="my_locus", ) diff --git a/tests/python/cli/test_export_protein_loci_assembly_tables.py b/tests/python/cli/test_export_protein_loci_assembly_tables.py index 47a70abd..48764c30 100644 --- a/tests/python/cli/test_export_protein_loci_assembly_tables.py +++ b/tests/python/cli/test_export_protein_loci_assembly_tables.py @@ -35,9 +35,8 @@ "loci", "locus_to_protein", "protein_ids", - # "protein_info", ] -hash_algo = ["crc64", "sha512t24u"] # , "sha256"] +hash_algo = ["crc64", "sha512t24u"] include_sequences = ["True", "False"] @@ -72,7 +71,7 @@ def test_gbk_parsing_and_flatfile_for_neo4j_creation_collect_tables_in_memory_fa # DO NOT REMOVE, used to create truth files def create_files( outdir="/tmp/tempsg", - tg="/home/chase/Documents/github/kwan_lab/socialgene/sgpy/tests/python/data/test_genomes", + tg="./tests/python/data/test_genomes", ): from pathlib import Path diff --git a/tests/python/compare_proteins/compare_proteins.py b/tests/python/compare_proteins/compare_proteins.py new file mode 100644 index 00000000..5a6d0097 --- /dev/null +++ b/tests/python/compare_proteins/compare_proteins.py @@ -0,0 +1,48 @@ +import os +from pathlib import Path + +import pandas as pd + +from socialgene.base.socialgene import SocialGene +from socialgene.compare_proteins.base import BlastTab_COLUMNS +from socialgene.compare_proteins.diamond import DiamondBlastp +from socialgene.compare_proteins.hmmer import CompareDomains +from socialgene.compare_proteins.mmseqs import MMseqsEasySearch + +FIXTURE_DIR = os.path.dirname(os.path.realpath(__file__)) +FIXTURE_DIR = os.path.dirname(FIXTURE_DIR) +FIXTURE_DIR = os.path.join(FIXTURE_DIR, "data", "compare_proteins") + +sg_1848 = os.path.join(FIXTURE_DIR, "BGC0001848.pickle") +sg_1850 = os.path.join(FIXTURE_DIR, "BGC0001850.pickle") + + +def test_DiamondBlastp_compare_proteins_dataframe(): + a1 = SocialGene().eat_pickle(sg_1848) + a2 = SocialGene().eat_pickle(sg_1850) + z1 = DiamondBlastp() + z1 = z1.compare_proteins(a1.protein_list, a2.protein_list) + z2 = pd.read_csv( + Path(FIXTURE_DIR, "test_DiamondBlastp.csv"), dtype=BlastTab_COLUMNS + ) + pd.testing.assert_frame_equal(z1, z2, check_names=False) + + +def test_MMseqsEasySearch_compare_proteins_dataframe(): + a1 = SocialGene().eat_pickle(sg_1848) + a2 = SocialGene().eat_pickle(sg_1850) + z1 = MMseqsEasySearch() + z1 = z1.compare_proteins(a1.protein_list, a2.protein_list) + z2 = pd.read_csv( + Path(FIXTURE_DIR, "test_MMseqsEasySearch.csv"), dtype=BlastTab_COLUMNS + ) + pd.testing.assert_frame_equal(z1, z2, check_names=False) + + +def test_CompareDomains_compare_proteins_dataframe(): + a1 = SocialGene().eat_pickle(sg_1848) + a2 = SocialGene().eat_pickle(sg_1850) + z1 = CompareDomains() + z1 = z1.compare_proteins(a1.protein_list, a2.protein_list) + z2 = pd.read_csv(Path(FIXTURE_DIR, "test_CompareDomains.csv")) + pd.testing.assert_frame_equal(z1, z2, check_names=False) diff --git a/tests/python/compare_proteins/hmm/hmmer.py b/tests/python/compare_proteins/hmm/hmmer.py index 3d8bedb0..e482450a 100644 --- a/tests/python/compare_proteins/hmm/hmmer.py +++ b/tests/python/compare_proteins/hmm/hmmer.py @@ -3,7 +3,7 @@ import pandas as pd from socialgene.base.socialgene import SocialGene -from socialgene.compare_proteins.hmm.hmmer import CompareDomains +from socialgene.compare_proteins.hmmer import CompareDomains FIXTURE_DIR = os.path.dirname(os.path.realpath(__file__)) FIXTURE_DIR = os.path.dirname(FIXTURE_DIR) @@ -49,7 +49,7 @@ def test_CompareDomains_compare_one_to_many(): a = CompareDomains() p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = list(sg_object.proteins.values()) - a.compare_one_to_many(p1, p2) + df = a.compare_one_to_many(p1, p2) expected = pd.DataFrame( { "query": { @@ -71,7 +71,7 @@ def test_CompareDomains_compare_one_to_many(): "score": {0: 1.5, 1: 1.14, 2: 0.78, 3: 0.6, 4: 0.52, 5: 0.18}, } ) - pd.testing.assert_frame_equal(a.df, expected) + pd.testing.assert_frame_equal(df, expected) a.compare_one_to_many(p1, p2, filter=False) expected = pd.DataFrame( { @@ -161,7 +161,7 @@ def test_CompareDomains_compare_many_to_many(): a = CompareDomains() p1 = list(sg_object.proteins.values()) p2 = list(sg_object.proteins.values()) - a.compare_many_to_many(p1, p2) + df = a.compare_many_to_many(p1, p2) expected = pd.DataFrame( { "query": { @@ -335,14 +335,14 @@ def test_CompareDomains_compare_many_to_many(): } ) pd.testing.assert_frame_equal( - a.df.sort_values(["query", "target", "score"], ignore_index=True), + df.sort_values(["query", "target", "score"], ignore_index=True), expected.sort_values(["query", "target", "score"], ignore_index=True), ) - a.compare_many_to_many(p1, p2, filter=False) + df = a.compare_many_to_many(p1, p2, filter=False) # 22 inputs * 22 inputs = 484 - assert len(a.df) == 484 + assert len(df) == 484 pd.testing.assert_frame_equal( - pd.DataFrame(a.df["score"].value_counts()).reset_index(), + pd.DataFrame(df["score"].value_counts()).reset_index(), pd.DataFrame( { "score": { diff --git a/tests/python/compare_proteins/hmm/test_scoring.py b/tests/python/compare_proteins/hmm/test_scoring.py index 0203dfd7..65db32d5 100644 --- a/tests/python/compare_proteins/hmm/test_scoring.py +++ b/tests/python/compare_proteins/hmm/test_scoring.py @@ -3,7 +3,7 @@ import pytest from socialgene.base.socialgene import SocialGene -from socialgene.compare_proteins.hmm.scoring import _mod_score_tupler, mod_score +from socialgene.compare_proteins.hmm_scoring import _mod_score_tupler, mod_score FIXTURE_DIR = os.path.dirname(os.path.realpath(__file__)) FIXTURE_DIR = os.path.dirname(FIXTURE_DIR) @@ -33,7 +33,7 @@ def test_create_tuple_type(): a = _mod_score_tupler(*[i for i in range(7)]) assert ( str(type(a)) - == "" + == "" ) @@ -74,15 +74,13 @@ def test_mod_score(): p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = sg_object.proteins["iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh"] res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", - "query_n_domains": 31, - "target_n_domains": 40, - "levenshtein": 0.72, - "jaccard": 1, - "mod_score": 1.23, - } + assert res.jaccard == 1 + assert res.levenshtein == 0.72 + assert res.mod_score == 1.23 + assert res.query_n_domains == 31 + assert res.target_n_domains == 40 + assert res.query == p1 + assert res.target == p2 def test_mod_score_no_domains(): @@ -91,15 +89,13 @@ def test_mod_score_no_domains(): p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = sg_object.proteins["iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh"] res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "iI7aI2dI9vaha9f0rVTi_YFrfMXjY1eh", - "query_n_domains": 0, - "target_n_domains": 0, - "levenshtein": 100, - "jaccard": 0, - "mod_score": 0, - } + assert res.jaccard == 0 + assert res.levenshtein == 100 + assert res.mod_score == 0 + assert res.query_n_domains == 0 + assert res.target_n_domains == 0 + assert res.query == p1 + assert res.target == p2 def test_mod_score_same_hash_with_domains(): @@ -119,15 +115,13 @@ def test_mod_score_same_hash_with_domains(): cpus=1, ) res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "query_n_domains": 31, - "target_n_domains": 31, - "levenshtein": 0, - "jaccard": 1, - "mod_score": 1.5, - } + assert res.jaccard == 1 + assert res.levenshtein == 0 + assert res.mod_score == 1.5 + assert res.query_n_domains == 31 + assert res.target_n_domains == 31 + assert res.query == p1 + assert res.target == p2 def test_mod_score_same_hash_no_domains(): @@ -136,15 +130,13 @@ def test_mod_score_same_hash_no_domains(): p1 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] p2 = sg_object.proteins["Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5"] res = mod_score(p1, p2) - assert res._asdict() == { - "query": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "target": "Ia6RrYNflQpEjxBCKTb5azk9_FTDvB-5", - "query_n_domains": 0, - "target_n_domains": 0, - "levenshtein": 0, - "jaccard": 1, - "mod_score": 1.5, - } + assert res.jaccard == 1 + assert res.levenshtein == 0 + assert res.mod_score == 1.5 + assert res.query_n_domains == 0 + assert res.target_n_domains == 0 + assert res.query == p1 + assert res.target == p2 def test_mod_score_input_error(): diff --git a/tests/python/data/compare_proteins/BGC0001848.pickle b/tests/python/data/compare_proteins/BGC0001848.pickle new file mode 100644 index 00000000..066378f0 Binary files /dev/null and b/tests/python/data/compare_proteins/BGC0001848.pickle differ diff --git a/tests/python/data/compare_proteins/BGC0001850.pickle b/tests/python/data/compare_proteins/BGC0001850.pickle new file mode 100644 index 00000000..123e81af Binary files /dev/null and b/tests/python/data/compare_proteins/BGC0001850.pickle differ diff --git a/tests/python/data/compare_proteins/test_CompareDomains.csv b/tests/python/data/compare_proteins/test_CompareDomains.csv new file mode 100644 index 00000000..451e0b0e --- /dev/null +++ b/tests/python/data/compare_proteins/test_CompareDomains.csv @@ -0,0 +1,33 @@ +query,target,query_n_domains,target_n_domains,jaccard,mod_score +qEcFCOXPHpf_D6FiGPea8DkV_AWWZjMy,uG-Um--bm_FHCq-iy0uo8sHib58ReCw_,2,2,1.0,1.5 +0suO2ImDjlLb-G5pYR1evP6CDaM4Kmma,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,2,3,0.67,1.0 +_0OhI8CwFqgCa97qVet-jbUA7WN2Q3XF,fkNtoGfCrQQfBSksSqd7Gjv5RdiQH8Ia,4,4,0.14,0.32 +_0OhI8CwFqgCa97qVet-jbUA7WN2Q3XF,nEdlX-kJdaVkOWtt0Hd9Ccv828Sjwjah,4,3,0.17,0.08 +_0OhI8CwFqgCa97qVet-jbUA7WN2Q3XF,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,4,5,0.8,0.8 +UpM8jOJT2xvzuU9Nbq6foihOdCQg0JR0,4nWmfQ-f7mG0sB3KZqC1b81jBf2xJBik,1,1,1.0,1.5 +KbJwxUWA9hBamAnz7boyic9xuL7YpdFp,fkNtoGfCrQQfBSksSqd7Gjv5RdiQH8Ia,5,4,0.12,0.06 +KbJwxUWA9hBamAnz7boyic9xuL7YpdFp,nEdlX-kJdaVkOWtt0Hd9Ccv828Sjwjah,5,3,0.14,0.27 +KbJwxUWA9hBamAnz7boyic9xuL7YpdFp,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,5,5,1.0,1.5 +abYaRpVeBw6U9ZaXiEMND4z-J7_bnwBW,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,3,3,0.5,0.92 +oS3H0Kb3lOQz6-wnh14vTmcPhT2_FDT_,_xvHFg1H1f43WxPcZT1P8Qbcx60chSxh,1,1,1.0,1.5 +0yVaZrn7OtVTYy5bfiHLRUYCRvmhh5QF,KCRfjiLoK7SiZW5lvuqxj5cL71La0pHA,1,1,1.0,1.5 +HiEbAmsTAFh_WLCbMsjHa7rYMmA3aAVn,RNafvyyc5PZHMVgzF8FkDrHTqXs9010E,2,2,1.0,1.5 +_bM-9OJARu4IC1V8jchTXDIaTPuLUhCe,kUiC97nMN3p9XGQznxIcoDXjYFjcjeOD,2,2,1.0,1.5 +etULepIwK3A8cTlCpg7R-CAWJISYYy0t,WnKuODuGjL34CVAQ7kvTIeuaZOBeUZmA,1,1,1.0,1.5 +D8hihmbo9jMJDf3zV2YsgAKSTgxHwc05,35es2QbNXZvIz8lXRZjAgPOdp_9BZMxk,2,2,1.0,1.5 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,fkNtoGfCrQQfBSksSqd7Gjv5RdiQH8Ia,3,4,0.75,0.88 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,nEdlX-kJdaVkOWtt0Hd9Ccv828Sjwjah,3,3,1.0,0.83 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,3,5,0.14,0.27 +M-VPo39ep4Utq6_Fru4JuNM9rp3Wmu45,tzmklOd_BQ4zCXJ8OiXEoLGyjQ3LYUuD,1,1,1.0,1.5 +uYr-bewprIg6Swc5EWnzKCpQiTuVowir,1WwuCuwJXaYDZN9b47_IRfrTu8BwWEKe,2,3,0.67,1.0 +TT9-VE-A9iUNQx8UtgoRlcKHXw521wxV,KCRfjiLoK7SiZW5lvuqxj5cL71La0pHA,1,1,1.0,1.5 +ennTHgEojAJicxoKyyb-1MWJz5q0YNGv,wNs7ekpuD-RDwVfiqLO0eg2U6NjQpHg-,1,1,1.0,1.5 +ennTHgEojAJicxoKyyb-1MWJz5q0YNGv,90OhAlVQq5BrRFnV44GitGEpepY_zZEr,1,1,1.0,1.5 +BIkruFDUoKjckgk0j6oZUrBk_oAj16YG,wNs7ekpuD-RDwVfiqLO0eg2U6NjQpHg-,1,1,1.0,1.5 +BIkruFDUoKjckgk0j6oZUrBk_oAj16YG,90OhAlVQq5BrRFnV44GitGEpepY_zZEr,1,1,1.0,1.5 +IU-V8ZvWVd1C1Z-evwLfP_KkJT7mZCvA,AAPVmUvcWZtYkxwRx0EV64-OBFtL8HOd,3,3,1.0,1.5 +orV-86yt1TbEwWGDAkU3bVNpe459Hl1B,LJkzeA6-YkPu5alDDy-m0BbXYw4SKboZ,2,1,0.5,0.75 +4EeqbFY1V20e1Z8RNjccxJsJaspnPGB1,_xvHFg1H1f43WxPcZT1P8Qbcx60chSxh,1,1,1.0,1.5 +U3Cgn6zYdFSQVEZFTRUnAo1zaJfp9-Q4,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,4,3,0.75,1.12 +lsxvnXk8kxp2OelIap6TcoguSBVOEMnk,4J1c7flqmFXPi51mMS_00l66F8wcABmX,1,1,1.0,1.5 +M-XQN-LZbGTFgj0rp2dFnIsF9bQm9chC,0v6Ldmv9eeUG6igtKY-RTxemiOPO-iG9,2,3,0.67,1.0 diff --git a/tests/python/data/compare_proteins/test_DiamondBlastp.csv b/tests/python/data/compare_proteins/test_DiamondBlastp.csv new file mode 100644 index 00000000..77a98103 --- /dev/null +++ b/tests/python/data/compare_proteins/test_DiamondBlastp.csv @@ -0,0 +1,27 @@ +query,target,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qlen,slen +qEcFCOXPHpf_D6FiGPea8DkV_AWWZjMy,uG-Um--bm_FHCq-iy0uo8sHib58ReCw_,47.8,381,194,4,2,381,169,545,3.32e-115,338.0,405,547 +0suO2ImDjlLb-G5pYR1evP6CDaM4Kmma,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,27.0,367,233,12,7,362,5,347,1.77e-17,74.3,532,396 +_0OhI8CwFqgCa97qVet-jbUA7WN2Q3XF,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,25.9,251,157,10,8,232,7,254,1.59e-07,40.8,233,256 +UpM8jOJT2xvzuU9Nbq6foihOdCQg0JR0,4nWmfQ-f7mG0sB3KZqC1b81jBf2xJBik,55.9,204,86,2,5,207,20,220,2.03e-70,206.0,224,221 +KbJwxUWA9hBamAnz7boyic9xuL7YpdFp,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,62.2,254,96,0,6,259,1,254,8.79e-102,288.0,261,256 +abYaRpVeBw6U9ZaXiEMND4z-J7_bnwBW,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,39.6,389,209,10,3,376,8,385,7.93e-68,211.0,389,396 +HiEbAmsTAFh_WLCbMsjHa7rYMmA3aAVn,RNafvyyc5PZHMVgzF8FkDrHTqXs9010E,57.5,623,242,6,18,635,17,621,3.13e-236,659.0,654,621 +Ft0znrFotJAJw4O3UQcsSde2fg7joffB,0v6Ldmv9eeUG6igtKY-RTxemiOPO-iG9,25.0,244,159,9,42,277,36,263,0.00014,33.1,360,460 +_bM-9OJARu4IC1V8jchTXDIaTPuLUhCe,kUiC97nMN3p9XGQznxIcoDXjYFjcjeOD,51.5,130,62,1,1,129,1,130,3.96e-46,137.0,138,134 +etULepIwK3A8cTlCpg7R-CAWJISYYy0t,WnKuODuGjL34CVAQ7kvTIeuaZOBeUZmA,74.1,220,57,0,1,220,1,220,3.43e-115,319.0,222,226 +D8hihmbo9jMJDf3zV2YsgAKSTgxHwc05,35es2QbNXZvIz8lXRZjAgPOdp_9BZMxk,59.2,218,86,2,5,219,31,248,2.02e-90,258.0,225,253 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,fkNtoGfCrQQfBSksSqd7Gjv5RdiQH8Ia,65.2,230,80,0,1,230,1,230,2.36e-110,308.0,231,234 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,nEdlX-kJdaVkOWtt0Hd9Ccv828Sjwjah,48.8,211,105,1,1,211,1,208,1.66e-59,178.0,231,215 +M-VPo39ep4Utq6_Fru4JuNM9rp3Wmu45,tzmklOd_BQ4zCXJ8OiXEoLGyjQ3LYUuD,43.8,361,177,8,18,354,1,359,5.13e-83,248.0,364,376 +uYr-bewprIg6Swc5EWnzKCpQiTuVowir,1WwuCuwJXaYDZN9b47_IRfrTu8BwWEKe,64.8,321,113,0,1,321,13,333,8.58e-146,405.0,322,334 +TT9-VE-A9iUNQx8UtgoRlcKHXw521wxV,KCRfjiLoK7SiZW5lvuqxj5cL71La0pHA,61.6,271,102,2,1,269,78,348,2.11e-116,329.0,274,353 +ennTHgEojAJicxoKyyb-1MWJz5q0YNGv,90OhAlVQq5BrRFnV44GitGEpepY_zZEr,54.1,270,123,1,1,269,1,270,3.7399999999999998e-106,300.0,269,270 +ennTHgEojAJicxoKyyb-1MWJz5q0YNGv,wNs7ekpuD-RDwVfiqLO0eg2U6NjQpHg-,51.9,268,128,1,1,267,1,268,3.5099999999999996e-104,295.0,269,270 +BIkruFDUoKjckgk0j6oZUrBk_oAj16YG,wNs7ekpuD-RDwVfiqLO0eg2U6NjQpHg-,56.9,269,113,2,1,268,1,267,4.6e-112,315.0,279,270 +BIkruFDUoKjckgk0j6oZUrBk_oAj16YG,90OhAlVQq5BrRFnV44GitGEpepY_zZEr,53.9,269,121,2,1,268,1,267,3.59e-104,295.0,279,270 +IU-V8ZvWVd1C1Z-evwLfP_KkJT7mZCvA,AAPVmUvcWZtYkxwRx0EV64-OBFtL8HOd,58.5,325,134,1,4,327,1,325,6.6e-122,345.0,350,328 +orV-86yt1TbEwWGDAkU3bVNpe459Hl1B,LJkzeA6-YkPu5alDDy-m0BbXYw4SKboZ,57.9,171,72,0,7,177,20,190,2.43e-68,197.0,180,192 +4EeqbFY1V20e1Z8RNjccxJsJaspnPGB1,_xvHFg1H1f43WxPcZT1P8Qbcx60chSxh,68.6,516,161,1,1,515,1,516,2.22e-258,706.0,523,523 +U3Cgn6zYdFSQVEZFTRUnAo1zaJfp9-Q4,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,61.1,388,150,1,8,394,3,390,4.2200000000000005e-160,447.0,402,396 +lsxvnXk8kxp2OelIap6TcoguSBVOEMnk,4J1c7flqmFXPi51mMS_00l66F8wcABmX,70.5,633,184,2,6,637,5,635,3.54e-316,863.0,657,649 +M-XQN-LZbGTFgj0rp2dFnIsF9bQm9chC,0v6Ldmv9eeUG6igtKY-RTxemiOPO-iG9,64.3,462,158,4,6,466,3,458,2.1e-182,509.0,474,460 diff --git a/tests/python/data/compare_proteins/test_MMseqsEasySearch.csv b/tests/python/data/compare_proteins/test_MMseqsEasySearch.csv new file mode 100644 index 00000000..9f5221f0 --- /dev/null +++ b/tests/python/data/compare_proteins/test_MMseqsEasySearch.csv @@ -0,0 +1,25 @@ +query,target,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qlen,slen +qEcFCOXPHpf_D6FiGPea8DkV_AWWZjMy,uG-Um--bm_FHCq-iy0uo8sHib58ReCw_,47.7,380,197,0,2,381,169,545,5.862e-108,344.0,405,547 +_bM-9OJARu4IC1V8jchTXDIaTPuLUhCe,kUiC97nMN3p9XGQznxIcoDXjYFjcjeOD,51.5,130,62,0,1,129,1,130,5.271e-42,140.0,138,134 +etULepIwK3A8cTlCpg7R-CAWJISYYy0t,WnKuODuGjL34CVAQ7kvTIeuaZOBeUZmA,74.0,220,57,0,1,220,1,220,5.102e-105,326.0,222,226 +D8hihmbo9jMJDf3zV2YsgAKSTgxHwc05,35es2QbNXZvIz8lXRZjAgPOdp_9BZMxk,60.2,218,85,0,5,219,31,248,2.683e-84,267.0,225,253 +_0OhI8CwFqgCa97qVet-jbUA7WN2Q3XF,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,25.8,248,166,0,8,232,7,254,1.555e-08,45.0,233,256 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,fkNtoGfCrQQfBSksSqd7Gjv5RdiQH8Ia,65.2,230,80,0,1,230,1,230,2.513e-100,313.0,231,234 +uNrCjGQbe9LsL9umHQWqvG2OQykvFxFw,nEdlX-kJdaVkOWtt0Hd9Ccv828Sjwjah,48.8,211,106,0,1,211,1,208,1.498e-54,181.0,231,215 +UpM8jOJT2xvzuU9Nbq6foihOdCQg0JR0,4nWmfQ-f7mG0sB3KZqC1b81jBf2xJBik,55.8,203,88,0,5,207,20,220,1.697e-64,209.0,224,221 +M-VPo39ep4Utq6_Fru4JuNM9rp3Wmu45,tzmklOd_BQ4zCXJ8OiXEoLGyjQ3LYUuD,43.7,359,189,0,18,354,1,359,3.3310000000000003e-78,256.0,364,376 +KbJwxUWA9hBamAnz7boyic9xuL7YpdFp,52NDrUtW4VpoW_5metaigrwcSiNy4nLt,62.2,254,96,0,6,259,1,254,3.0979999999999996e-92,291.0,261,256 +uYr-bewprIg6Swc5EWnzKCpQiTuVowir,1WwuCuwJXaYDZN9b47_IRfrTu8BwWEKe,64.7,321,113,0,1,321,13,333,9.059e-133,412.0,322,334 +abYaRpVeBw6U9ZaXiEMND4z-J7_bnwBW,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,39.5,378,226,0,3,376,8,385,3.512e-64,216.0,389,396 +TT9-VE-A9iUNQx8UtgoRlcKHXw521wxV,KCRfjiLoK7SiZW5lvuqxj5cL71La0pHA,61.6,271,103,0,1,269,78,348,1.041e-107,337.0,274,353 +ennTHgEojAJicxoKyyb-1MWJz5q0YNGv,90OhAlVQq5BrRFnV44GitGEpepY_zZEr,54.0,270,123,0,1,269,1,270,7.959e-98,308.0,269,270 +ennTHgEojAJicxoKyyb-1MWJz5q0YNGv,wNs7ekpuD-RDwVfiqLO0eg2U6NjQpHg-,51.8,268,128,0,1,267,1,268,4.7550000000000005e-96,303.0,269,270 +BIkruFDUoKjckgk0j6oZUrBk_oAj16YG,wNs7ekpuD-RDwVfiqLO0eg2U6NjQpHg-,56.8,268,115,0,1,268,1,267,5.731e-103,323.0,279,270 +BIkruFDUoKjckgk0j6oZUrBk_oAj16YG,90OhAlVQq5BrRFnV44GitGEpepY_zZEr,53.9,268,123,0,1,268,1,267,7.279e-96,303.0,279,270 +IU-V8ZvWVd1C1Z-evwLfP_KkJT7mZCvA,AAPVmUvcWZtYkxwRx0EV64-OBFtL8HOd,59.3,325,131,0,4,327,1,325,8.072e-114,359.0,350,328 +orV-86yt1TbEwWGDAkU3bVNpe459Hl1B,LJkzeA6-YkPu5alDDy-m0BbXYw4SKboZ,57.8,171,72,0,7,177,20,190,2.663e-62,201.0,180,192 +HiEbAmsTAFh_WLCbMsjHa7rYMmA3aAVn,RNafvyyc5PZHMVgzF8FkDrHTqXs9010E,57.3,618,258,0,18,635,17,621,6.876e-217,673.0,654,621 +4EeqbFY1V20e1Z8RNjccxJsJaspnPGB1,_xvHFg1H1f43WxPcZT1P8Qbcx60chSxh,68.5,523,163,0,1,523,1,520,8.644e-236,721.0,523,523 +U3Cgn6zYdFSQVEZFTRUnAo1zaJfp9-Q4,4lkDnHjbCLe38TRMATM7Q7stnchNYbGt,61.0,388,151,0,8,394,3,390,1.035e-146,456.0,402,396 +lsxvnXk8kxp2OelIap6TcoguSBVOEMnk,4J1c7flqmFXPi51mMS_00l66F8wcABmX,70.4,632,186,0,6,637,5,635,5.1480000000000005e-288,879.0,657,649 +M-XQN-LZbGTFgj0rp2dFnIsF9bQm9chC,0v6Ldmv9eeUG6igtKY-RTxemiOPO-iG9,65.1,461,159,0,6,466,3,458,9.961000000000001e-172,533.0,474,460 diff --git a/tests/python/parsers/test_fasta.py b/tests/python/parsers/test_fasta.py index c812fc60..b6794fb8 100644 --- a/tests/python/parsers/test_fasta.py +++ b/tests/python/parsers/test_fasta.py @@ -148,9 +148,12 @@ def test_parse_2(): for ak, av in a.assemblies.items(): for lk, lv in av.loci.items(): for fk in lv.features: - assert isinstance(fk.parent_object, molbio.Locus) - del fk.parent_object + assert isinstance(fk.parent, molbio.Locus) + assert isinstance(fk.protein, molbio.Protein) if fk.external_id == "Q5JCW8": + assert fk.protein.uid == "bhdSZvyUK6GzqHTXz4VjgtbefefmHI6x" + del fk.protein + del fk.parent assert fk.all_attributes() == { "description": "tr|Q5JCW8|Q5JCW8_THEKO Hypothetical membrane protein, conserved OS=Thermococcus kodakarensis (strain ATCC BAA-918 / JCM 12380 / KOD1) (Pyrococcus kodakaraensis (strain KOD1)) OX=69014 GN=TK0357 PE=4 SV=1", "external_id": "Q5JCW8", @@ -171,6 +174,10 @@ def test_parse_2(): "uid": "bhdSZvyUK6GzqHTXz4VjgtbefefmHI6x", } if fk.external_id == "Q52500": + assert isinstance(fk.protein, molbio.Protein) + assert fk.protein.uid == "r_qK-VXr_gDUb3NluH3qmgG1I09eOIOT" + del fk.protein + del fk.parent assert fk.all_attributes() == { "description": "sp|Q52500|THSB_THEKO Thermosome subunit beta OS=Thermococcus kodakarensis (strain ATCC BAA-918 / JCM 12380 / KOD1) (Pyrococcus kodakaraensis (strain KOD1)) OX=69014 GN=thsB PE=3 SV=1", "external_id": "Q52500", diff --git a/tests/python/test_autogen.py b/tests/python/test_autogen.py index 58612ebb..139c3fb5 100644 --- a/tests/python/test_autogen.py +++ b/tests/python/test_autogen.py @@ -20,37 +20,37 @@ # def test_filter_assemblies(search_result_df): # result = ProcessSearchResult(search_result_df, 2) # result.filter_assemblies(2) -# assert len(result.df) == 4 -# assert set(result.df["assembly_uid"]) == {"A", "B"} +# assert len(resultprotein_comparisons_df) == 4 +# assert set(resultprotein_comparisons_df["assembly_uid"]) == {"A", "B"} # def test_filter_nucleotides(search_result_df): # result = ProcessSearchResult(search_result_df, 2) # result.filter_nucleotides(2) -# assert len(result.df) == 4 -# assert set(result.df["nucleotide_uid"]) == {"a", "b"} +# assert len(resultprotein_comparisons_df) == 4 +# assert set(resultprotein_comparisons_df["nucleotide_uid"]) == {"a", "b"} # def test_label_clusters(search_result_df): # result = ProcessSearchResult(search_result_df, 2) # result._label_clusters(max_gap=5) -# assert len(result.df) == 6 -# assert set(result.df["cluster"]) == {0, 1, 2} +# assert len(resultprotein_comparisons_df) == 6 +# assert set(resultprotein_comparisons_df["cluster"]) == {0, 1, 2} # def test_calc_intrahits(search_result_df): # result = ProcessSearchResult(search_result_df, 2) # result._label_clusters(max_gap=5) # result._calc_intrahits() -# assert len(result.df) == 6 -# assert set(result.df["cluster_unique_hits"]) == {1, 2} +# assert len(resultprotein_comparisons_df) == 6 +# assert set(resultprotein_comparisons_df["cluster_unique_hits"]) == {1, 2} # def test_process(search_result_df): # result = ProcessSearchResult(search_result_df, 2) # result.process(2, 2, 5) -# assert len(result.df) == 4 -# assert set(result.df["assembly_uid"]) == {"A", "B"} -# assert set(result.df["nucleotide_uid"]) == {"a", "b"} -# assert set(result.df["cluster"]) == {0, 1} -# assert set(result.df["cluster_unique_hits"]) == {1, 2} +# assert len(resultprotein_comparisons_df) == 4 +# assert set(resultprotein_comparisons_df["assembly_uid"]) == {"A", "B"} +# assert set(resultprotein_comparisons_df["nucleotide_uid"]) == {"a", "b"} +# assert set(resultprotein_comparisons_df["cluster"]) == {0, 1} +# assert set(resultprotein_comparisons_df["cluster_unique_hits"]) == {1, 2}