feat: large update (#67)
* feat: large update

fix: info to metadata.update

style: switch warning to info

fix: perc and fixed instead of just fixed thresholds

fix: hmmscan

fix: these were modified because...

these were modified because the hmm file is now automatically checked for a GA threshold, which is used if it exists

fix: normalize search filters to n queries

fix: make id attributes consistent across classes

feat: this is a breaking change for previously built databases

fix: bump ci/cd

feat: add gene cluster obj and move cmap serialize

feat: add bypass for protein id when no locus tag

fix: single and multi group results

fix: clustermap

feat: serialize to gbk

* fix: W605 invalid escape sequence
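The GA-threshold behavior described in the commit message — checking whether a profile HMM file defines gathering (GA) cutoffs so that `hmmscan` can be run with `--cut_ga` — could be sketched as a standalone helper. This is a hypothetical illustration, not the commit's actual implementation; only the HMMER3 file conventions (a `HMMER3/...` header per model, an optional `GA` line per model) are assumed:

```python
import gzip


def hmm_file_has_ga_thresholds(path: str) -> bool:
    """Return True if every model in a HMMER3 profile file defines a GA cutoff.

    HMMER3 profile files are concatenated records, each beginning with a
    "HMMER3/..." header line; an optional "GA" line inside a record stores
    the model's gathering cutoff. Only when all models define one is it
    safe to run hmmscan with --cut_ga.
    """
    opener = gzip.open if path.endswith(".gz") else open
    n_models = n_ga = 0
    with opener(path, "rt") as fh:
        for line in fh:
            if line.startswith("HMMER3"):
                n_models += 1
            elif line.startswith("GA "):
                n_ga += 1
    return n_models > 0 and n_models == n_ga
```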
chasemc authored Nov 8, 2023
1 parent 8f4ba8f commit fc8cb37
Showing 136 changed files with 11,125 additions and 1,854 deletions.
15 changes: 8 additions & 7 deletions .github/workflows/dev_ci.yml
@@ -13,16 +13,17 @@ jobs:
runs-on: ubuntu-latest
strategy:
fail-fast: false

matrix:
python-version: ["3.11", "3.12"]
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v3
uses: actions/setup-python@v4
with:
python-version: "3.11"
python-version: ${{ matrix.python-version }}
- name: Install sg
run: python3 -m pip install -e .[ci]
- name: Install hmmer
@@ -35,9 +36,9 @@ jobs:
if: ${{ !contains(github.event.head_commit, 'ci skip') || !contains(github.event.head_commit, 'from socialgene/release-please') }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v3
uses: actions/setup-python@v4
with:
python-version: "3.11"
- run: python -m pip install flake8
@@ -47,6 +48,6 @@ jobs:
if: ${{ !contains(github.event.head_commit, 'ci skip') || !contains(github.event.head_commit, 'from socialgene/release-please') }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: black
uses: psf/black@stable
6 changes: 3 additions & 3 deletions .github/workflows/linters.yml
@@ -21,9 +21,9 @@ jobs:
if: ${{ !contains(github.event.head_commit, 'ci skip') || !contains(github.event.head_commit, 'from socialgene/release-please') }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v3
uses: actions/setup-python@v4
with:
python-version: "3.10"
- run: python -m pip install flake8
@@ -34,6 +34,6 @@ jobs:
if: ${{ !contains(github.event.head_commit, 'ci skip') || !contains(github.event.head_commit, 'from socialgene/release-please') }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: black
uses: psf/black@stable
6 changes: 3 additions & 3 deletions .github/workflows/pr_ci.yml
@@ -14,14 +14,14 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.11", "3.x"]
python-version: ["3.11", "3.12"]
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v3
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install sg
1 change: 0 additions & 1 deletion Makefile
@@ -20,7 +20,6 @@ clean:
find . -type f -name "*.py[co]" -exec rm -r {} +
find . -type d -name "__pycache__" -exec rm -r {} +
find . -type d -name "htmlcov" -exec rm -r {} +
find . -type d -name "Autometa.egg-info" -exec rm -r {} +
find . -type d -name "dist" -exec rm -r {} +
find . -type d -name "build" -exec rm -r {} +
find . -type d -name ".eggs" -exec rm -r {} +
30 changes: 30 additions & 0 deletions Take an input BGC.md
@@ -0,0 +1,30 @@
# Take an input BGC

1. Take an input BGC
2. Parse all the proteins in the input BGC
3. Annotate every input BGC protein with HMM models, pulling from Neo4j if the exact protein already exists there, or otherwise with HMMER
4. Get the list of all HMM models annotating input BGC proteins
5. Query the database to find the outdegree (calculate/populate if necessary) for each HMM model: `(:hmm)-[:ANNOTATES]->(:protein)`
6. Prioritize which input proteins to search based on the outdegree of their HMM annotations
- max_query_proteins (float): Maximum number of proteins to return. Values between 0 and 1 return that fraction of the input proteins
- max_domains_per_protein (int): Max domains to retain for each individual protein (highest outdegree dropped first)
- max_outdegree (int): HMM model annotations with an outdegree higher than this will be dropped
- scatter (bool, optional): Choose a random subset of proteins to search that are spread across the length of the input BGC. Defaults to False.
- bypass (List[str], optional): List of locus tags that will bypass filtering. This is the ID found in a GenBank file "/locus_tag=" field. Defaults to None.
- bypass_eid (List[str], optional): Less preferred than `bypass`. List of external protein IDs that will bypass filtering. This is the ID found in a GenBank file "/protein_id=" field. Defaults to None.
7. Search the database for all proteins that have the same HMM model annotations as the input BGC proteins
- Output from database is a data frame with columns: ['assembly_uid', 'nucleotide_uid', 'target', 'n_start', 'n_end', 'query']
8. The initial hits output is filtered based on the following criteria:
- assemblies_must_have_x_matches (float): Minimum number of distinct query protein matches for a genome to be considered. Values <1 are a fraction of the number of query proteins; values ≥1 are an absolute count.
- nucleotide_sequences_must_have_x_matches (float): Minimum number of distinct query protein matches required for a nucleotide sequence to be considered, interpreted the same way.
9. All remaining loci are then assigned a cluster, simply moving from start to end and "breaking" into a new cluster if the distance between two loci is > `break_bgc_on_gap_of`
10. Resulting clusters are then filtered to those that contain >= `gene_clusters_must_have_x_matches`
11. Remaining clusters are pulled into the SocialGene object
- All loci within each cluster, +/- `target_bgc_padding` base pairs, are pulled from the database
- All proteins within each cluster are pulled from the database
- All HMM models annotating proteins within each cluster are pulled from the database
12. Each target BGC is compared against the input BGC (via domain similarity)
13. Links are created between the input BGC and the target BGCs
14. "Groups" are created by assigning each target protein to a single input BGC protein based on best-hit
- This is for assigning groups within clustermap.js, for visualization purposes only
15. Results are serialized to clustermap.js JSON format and written to disk
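The fraction-vs-count thresholds (step 8) and gap-based cluster breaking (step 9) can be sketched as small standalone functions. These are hypothetical helpers for illustration only; the commit's real logic lives in `SearchDomains.filter()` and `SearchDomains.label_clusters()` and operates on data frames:

```python
from typing import List


def resolve_match_threshold(x: float, n_query_proteins: int) -> int:
    # Step 8: thresholds between 0 and 1 are a fraction of the query
    # proteins; values >= 1 are an absolute number of matches.
    if 0 < x < 1:
        return max(1, round(x * n_query_proteins))
    return int(x)


def label_clusters(starts: List[int], break_bgc_on_gap_of: int) -> List[int]:
    # Step 9: walk loci from start to end and open a new cluster
    # whenever the gap to the previous locus exceeds the cutoff.
    labels: List[int] = []
    cluster = 0
    prev_start = None
    for start in sorted(starts):
        if prev_start is not None and start - prev_start > break_bgc_on_gap_of:
            cluster += 1
        labels.append(cluster)
        prev_start = start
    return labels
```

For example, with `assemblies_must_have_x_matches=0.4` and 10 query proteins, a genome would need at least 4 distinct matches to survive filtering.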
Binary file modified classes_sgpy.png
87 changes: 64 additions & 23 deletions new_search.py
@@ -1,47 +1,88 @@
import time
from socialgene.search.hmmer import SearchDomains
from socialgene.base.socialgene import SocialGene

now = time.time()
# The code is performing a series of operations using the SocialGene package and the SearchDomains
# class from the socialgene.search.hmmer module. Here is a breakdown of what the code is doing:

from socialgene.base.socialgene import SocialGene
from socialgene.clustermap.serialize import SerializeToClustermap
from socialgene.search.hmmer import SearchDomains

input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001850.gbk"
input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001646.gbk"
# input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001848.gbk"
hmm_dir = None
hmm_dir = (
"/home/chase/Documents/socialgene_data/streptomyces/socialgene_per_run/hmm_cache"
)

# hmm_dir = "/home/chase/Downloads/meh/socialgene_per_run/hmm_cache"
hmm_dir = "/home/chase/Downloads/para/socialgene_per_run/hmm_cache"

a = SocialGene()
a.parse(input_gbk_path)
len(a.proteins)

search_object = SearchDomains(
gbk_path=input_gbk_path,
hmm_dir=hmm_dir,
assemblies_must_have_x_matches=10,
nucleotide_sequences_must_have_x_matches=10,
gene_clusters_must_have_x_matches=10,
use_neo4j_precalc=True,
assemblies_must_have_x_matches=0.4,
nucleotide_sequences_must_have_x_matches=0.4,
gene_clusters_must_have_x_matches=0.2,
break_bgc_on_gap_of=10000,
target_bgc_padding=15000,
)
self = search_object

search_object.outdegree_table

search_object.outdegree_table()

search_object.prioritize_input_proteins(
max_domains_per_protein=1,
max_domains_per_protein=None,
max_outdegree=None,
max_query_proteins=20,
max_query_proteins=None,
scatter=False,
# loci=["MicB006_2899"]
# bypass_eid=["AXA20096.1"],
)
search_object.outdegree_table()

search_object.outdegree_table

# TODO frac is redundant with the outdegree table
search_object.search(only_culture_collection=False, frac=0.75)


search_object.filter()
search_object.label_clusters()


search_object._bgc_regions_to_sg_object(self._primary_bgc_regions())

_ = self.sg_object.annotate_proteins_with_neo4j(
protein_uids=None, annotate_all=True, progress=False
)


df = self._primary_bgc_regions()


for i, row in df.iterrows():
self.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid(
row["nucleotide_uid"]
).add_bgcs_by_start_end(
start=row["n_start"],
end=row["n_end"],
uid=row["cluster"],
)


temp = self.input_assembly.loci[self.input_bgc_id]

temp.add_bgcs_by_feature(features=temp.features)

search_object.search_db()

search_object.process_results()
self._create_links()
self._choose_group()


search_object.write_clustermap(
"/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json"
z = SerializeToClustermap(
sg_object=self.sg_object,
bgc_order=list(self.sg_object.assemblies.keys()),
link_df=self.link_df,
group_df=self.group_df,
)

later = time.time()
int(later - now)
z.write("/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json")
88 changes: 88 additions & 0 deletions new_search2.py
@@ -0,0 +1,88 @@
import time

from socialgene.clustermap.serialize import SerializeToClustermap
from socialgene.search.hmmer import SearchDomains

start_time = time.time()
# input_gbk_path = "/home/chase/Downloads/para/a/GCF_002362315.1_ASM236231v1_genomic.gbff.gz"
input_gbk_path = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0001646.gbk"
hmm_dir = None
# hmm_dir = "/home/chase/Downloads/meh/socialgene_per_run/hmm_cache"
hmm_dir = "/home/chase/Downloads/para/socialgene_per_run/hmm_cache"


search_object = SearchDomains(
gbk_path=input_gbk_path,
hmm_dir=hmm_dir,
use_neo4j_precalc=True,
assemblies_must_have_x_matches=0.6,
nucleotide_sequences_must_have_x_matches=0.6,
gene_clusters_must_have_x_matches=0.6,
break_bgc_on_gap_of=10000,
target_bgc_padding=15000,
)


# search_object.outdegree_table


search_object.prioritize_input_proteins(
max_domains_per_protein=2,
max_outdegree=100000,
max_query_proteins=5,
scatter=False,
# bypass_locus=["MicB006_2899"]
# bypass_pid=["AXA20096.1"],
)

# search_object.outdegree_table

# TODO frac is redundant with the outdegree table
search_object.search(only_culture_collection=False, frac=0.75)


search_object.filter()
search_object.label_clusters()


search_object._bgc_regions_to_sg_object(search_object._primary_bgc_regions())

_ = search_object.sg_object.annotate_proteins_with_neo4j(
protein_uids=None, annotate_all=True, progress=False
)


df = search_object._primary_bgc_regions()


for i, row in df.iterrows():
search_object.sg_object.assemblies[row["assembly_uid"]].get_locus_by_uid(
row["nucleotide_uid"]
).add_bgcs_by_start_end(
start=row["n_start"],
end=row["n_end"],
uid=row["cluster"],
)


temp = search_object.input_assembly.loci[search_object.input_bgc_id]
temp.add_bgcs_by_feature(features=temp.features)

search_object._create_links()
search_object._choose_group()


z = SerializeToClustermap(
sg_object=search_object.sg_object,
bgc_order=list(search_object.sg_object.assemblies.keys()),
link_df=search_object.link_df,
group_df=search_object.group_df,
)

z.write("/home/chase/Downloads/clinker-master(2)/clinker-master/clinker/plot/data.json")
print("--- %s seconds ---" % (time.time() - start_time))

# search_object.rich_table(search_object.user_friendly_hit_df())


# self = search_object
Binary file modified packages_sgpy.png
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools", "setuptools-scm"]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
6 changes: 3 additions & 3 deletions socialgene/addons/ttd.py
@@ -5,7 +5,7 @@

# sg_obj = SocialGene()
# fapath = "/home/chase/Downloads/ttd_database/P2-06-TTD_sequence_all.txt"
# protein_id = 1
# external_id = 1
# with open(fapath, "rt") as h:
# sequence = ""
# after_header = False
@@ -15,9 +15,9 @@
# if sequence:
# sg_obj.add_protein(
# sequence=sequence,
# external_protein_id=protein_id,
# external_id=external_id,
# )
# protein_id = re.search("T[0-9]{5}", line).group()
# external_id = re.search("T[0-9]{5}", line).group()
# sequence = ""
# elif after_header:
# sequence += line.strip()
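The commented-out loop above accumulates sequences keyed by TTD target IDs (`T` plus five digits). A runnable standalone version might look like the following — header detection via a leading `>` is an assumption, since the original's `after_header` handling is elided:

```python
import re
from typing import Dict


def parse_ttd_sequences(text: str) -> Dict[str, str]:
    # Collect sequence lines under the most recent TTD target id
    # (ids match "T" followed by five digits, e.g. T00001).
    records: Dict[str, str] = {}
    current_id = None
    sequence = ""
    for line in text.splitlines():
        match = re.search(r"T[0-9]{5}", line)
        if line.startswith(">") and match:
            if current_id and sequence:
                records[current_id] = sequence
            current_id = match.group()
            sequence = ""
        elif current_id:
            sequence += line.strip()
    if current_id and sequence:
        records[current_id] = sequence
    return records
```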
