Build wheels for MACOS #30

Open

wants to merge 50 commits into base: master
50 commits
3ec6340
validate bin with sig
mr-eyes Jul 26, 2022
cbd4582
bins indexing implemented
mr-eyes Jul 26, 2022
d5de079
pairwise minor modifications
mr-eyes Jul 26, 2022
7905a48
dumping modes
mr-eyes Jul 26, 2022
0323e03
minor fix
mr-eyes Jul 26, 2022
b2d1507
:bug: handle gzipped sigs
mr-eyes Jul 26, 2022
c7585a0
sig to bin
mr-eyes Jul 26, 2022
652b434
sigs to bins update
mr-eyes Jul 26, 2022
d75160c
:fix: all parallel
mr-eyes Jul 26, 2022
265052a
refactot
mr-eyes Jul 26, 2022
d86748b
remove print inside parallel loop
mr-eyes Jul 26, 2022
ab314b1
:fix: validate
mr-eyes Jul 26, 2022
841df99
skip converted files
mr-eyes Jul 26, 2022
4b33759
sequential loading
mr-eyes Jul 26, 2022
192d538
legends to phmap
mr-eyes Jul 26, 2022
44121ae
check invalid bins
mr-eyes Jul 28, 2022
a82c719
bins indexing done
mr-eyes Jul 28, 2022
0980f2e
update kProcessor submodule
mr-eyes Jul 28, 2022
d6098eb
filter by abundance
mr-eyes Jul 29, 2022
725afbc
delete kProcessor submodule
mr-eyes Jul 29, 2022
451b7e4
modify kProcessor
mr-eyes Jul 29, 2022
28eb55f
new json parser
mr-eyes Jul 30, 2022
d5b61d9
adapt the new json parser changes
mr-eyes Jul 30, 2022
efb6c10
modify json import
mr-eyes Jul 31, 2022
278e346
update kProcessor
mr-eyes Jul 31, 2022
c929708
Merge branch 'kp2_again' of github.com:dib-lab/kSpider into bins
mr-eyes Jul 31, 2022
806a347
update kProcessor
mr-eyes Jul 31, 2022
97d8f23
update kProcessor
mr-eyes Jul 31, 2022
931d6e5
update kProcessor
mr-eyes Jul 31, 2022
cba0e30
update kProcessor
mr-eyes Jul 31, 2022
e810ccb
update kProcessor
mr-eyes Jul 31, 2022
e46e22f
update kp
mr-eyes Jul 31, 2022
17bdf1a
update CMAKE flags
mr-eyes Jul 31, 2022
949a84a
print colors size
mr-eyes Aug 2, 2022
bbcefdb
more stats
mr-eyes Aug 3, 2022
18ea794
more options
mr-eyes Aug 3, 2022
63616bc
fix
mr-eyes Aug 3, 2022
c33420d
representative sketches
mr-eyes Nov 21, 2022
373240f
reorganize
mr-eyes Dec 15, 2022
e459885
Merge branch 'master' of github.com:dib-lab/kSpider into dev
mr-eyes Dec 28, 2022
a2f0e29
support python 3.11
mr-eyes Dec 29, 2022
357c0b2
use retworkx in clustering
mr-eyes Dec 29, 2022
2bf33c2
use retworkx in clustering
mr-eyes Dec 29, 2022
d9f29d0
other distances
mr-eyes Dec 29, 2022
03ce313
add depends
mr-eyes Dec 29, 2022
efd9fcf
cout edit
mr-eyes Dec 29, 2022
f38981e
support ANI and add some options
mr-eyes Dec 30, 2022
0357fbd
fix serialization bug
mr-eyes Jan 1, 2023
9dd9f71
updates
mr-eyes Jan 1, 2023
1463263
Update cibuildwheel.yml
mr-eyes Jan 1, 2023
2 changes: 1 addition & 1 deletion .github/workflows/cibuildwheel.yml
@@ -26,7 +26,7 @@ jobs:
env:
CIBW_ENVIRONMENT: "BRANCH_NAME=$(cat branch_name.tmp)"
CIBW_BUILD_VERBOSITY: 3
CIBW_BUILD: cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64
CIBW_BUILD: cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64 cp311-manylinux_x86_64 cp36-macosx_x86_64 cp37-macosx_x86_64 cp38-macosx_x86_64 cp38-macosx_universal2 cp38-macosx_arm64 cp39-macosx_x86_64 cp39-macosx_universal2 cp39-macosx_arm64
CIBW_SKIP: pp* *-manylinux_{aarch64,ppc64le,s390x}
CIBW_ARCHS_LINUX: x86_64
CIBW_MANYLINUX_X86_64_IMAGE: quay.io/pypa/manylinux2014_x86_64:latest
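For reference, the updated CIBW_BUILD list adds CPython 3.11 manylinux wheels and the new macOS targets: x86_64 builds for cp36 through cp39, plus universal2 and arm64 builds for cp38 and cp39. Each entry is a cibuildwheel build identifier of the form {python tag}-{platform tag}. As a quick local sanity check of which identifiers a given machine can actually consume, the third-party packaging library can enumerate the interpreter's supported wheel tags (a sketch; the output depends on the host, and packaging must be installed):

from packaging import tags

# sys_tags() yields tags from most to least preferred; print the first
# macOS or manylinux platform tag this interpreter would accept.
for tag in tags.sys_tags():
    if "macosx" in tag.platform or "manylinux" in tag.platform:
        print(f"{tag.interpreter}-{tag.abi}-{tag.platform}")
        break

On an Apple Silicon CPython 3.9, for instance, this prints something like cp39-cp39-macosx_11_0_arm64, which the new cp39-macosx_arm64 identifier covers.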
2 changes: 1 addition & 1 deletion kSpider_version.py
@@ -52,5 +52,5 @@ def get_version():
else:
version_tag = dev_version

# print(f"[DEBUG VERSION] = {version_tag}")
print(f"[DEBUG VERSION] = {version_tag}")
return version_tag
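The kSpider_version.py hunk re-enables the version debug print unconditionally. If that line is only wanted while debugging CI builds, one hypothetical alternative (not part of this PR) is to gate it behind an environment variable rather than commenting it in and out:

import os

def report_version(version_tag: str) -> str:
    # Hypothetical helper: emit the debug line only when KSPIDER_DEBUG
    # is set, e.g. KSPIDER_DEBUG=1 python setup.py --version
    if os.environ.get("KSPIDER_DEBUG"):
        print(f"[DEBUG VERSION] = {version_tag}")
    return version_tag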
271 changes: 103 additions & 168 deletions pykSpider/kSpider2/ks_clustering.py
@@ -1,37 +1,50 @@
from __future__ import division
from collections import defaultdict
import itertools
import sys
import os
import sqlite3
import click
from kSpider2.click_context import cli
import glob
import rustworkx as rx
from tqdm import tqdm


class kClusters:
class Clusters:

distance_to_col = {
"min_cont": 3,
"avg_cont": 4,
"max_cont": 5,
"ani": 6,
}

source = []
target = []
source2 = []
target2 = []
seq_to_kmers = dict()
names_map = dict()
components = defaultdict(set)

def __init__(self, logger_obj, index_prefix, cut_off_threshold):
def __init__(self, logger_obj, index_prefix, cut_off_threshold, dist_type):
self.index_prefix = index_prefix
self.Logger = logger_obj
self.dist_type = dist_type
self.edges_batch_number = 10_000_000
self.names_file = index_prefix + ".namesMap"
self.cut_off_threshold = cut_off_threshold
self.seqToKmers_file = index_prefix + "_kSpider_seqToKmersNo.tsv"
self.pairwise_file = index_prefix + "_kSpider_pairwise.tsv"
self.uncovered_seqs = set()
self.output = index_prefix + \
f"_kSpider_clusters_{cut_off_threshold}%.tsv"
self.shared_kmers_threshold = 200
self.seq_to_clusterid = dict()
self.max_cluster_id = 0
self.Logger.INFO("Loading TSV pairwise file")
self.load_seq_to_kmers(self.seqToKmers_file)
self.tsv_get_namesmap()
if dist_type not in self.distance_to_col:
logger_obj.ERROR("unknown distance!")
self.dist_col = self.distance_to_col[dist_type]

if dist_type == "ani":
if not os.path.exists(self.index_prefix + "_kSpider_pairwise.ani_col.tsv"):
logger_obj.ERROR(f"ANI was selected, but the ani file {self.index_prefix}_kSpider_pairwise.ani_col.tsv was not found!")

self.graph = rx.PyGraph()
self.nodes_indeces = self.graph.add_nodes_from(
list(self.names_map.keys()))

def load_seq_to_kmers(self, tsv):
with open(tsv) as KMER_COUNT:
@@ -40,162 +53,85 @@ def load_seq_to_kmers(self, tsv):
seq_ID, no_of_kmers = tuple(line.strip().split('\t')[1:])
self.seq_to_kmers[int(seq_ID)] = int(no_of_kmers)

def ids_to_names(self, cluster):
new_cluster = []
for _id in cluster:
new_cluster.append(self.names_map[int(_id)])

return new_cluster

def tsv_get_namesmap(self):
with open(self.names_file, 'r') as namesMap:
next(namesMap) # skip the header
for row in namesMap:
row = row.strip().split()
self.names_map[int(row[0])] = row[1]

def tsv_build_graph(self):
def construct_graph(self):
batch_counter = 0
edges_tuples = []

print("[i] constructing graph")
with open(self.pairwise_file, 'r') as pairwise_tsv:
next(pairwise_tsv) # skip header

for row in pairwise_tsv:
row = row.strip().split()
seq1 = int(row[1])
seq2 = int(row[2])
shared_kmers = int(row[3])
containment = 0.0

min_seq = float(
min(self.seq_to_kmers[seq1], self.seq_to_kmers[seq2]))
containment = shared_kmers / min_seq

if containment < self.cut_off_threshold:
continue

if shared_kmers < self.shared_kmers_threshold:
self.source2.append(seq1)
self.target2.append(seq2)

elif shared_kmers >= self.shared_kmers_threshold:
self.source.append(seq1)
self.target.append(seq2)

# # For covering clusters with single sequence
uncovered_seqs_1 = set(self.names_map.keys()) - \
set(self.source).union(set(self.target))
for seq in uncovered_seqs_1:
self.uncovered_seqs.add(seq)

# OR:
# for i in range(1, len(self.names_map) + 1, 1):
# self.source.append(i)
# self.target.append(i)

def clustering(self):
registers = defaultdict(lambda: None)

def find(x):
l = registers[x]
if l is not None:
l = find(l)
registers[x] = l
return l
return x

def union(x, y):
lx, ly = find(x), find(y)
if lx != ly:
registers[lx] = ly

for i in range(len(self.source)):
union(self.source.pop(), self.target.pop())

for x in registers:
self.components[find(x)].add(x)

temp_components = self.components.copy()
self.components.clear()

for cluster_id, (k, v) in enumerate(temp_components.items(), 1):
self.components[cluster_id] = set(v)
for seq in v:
self.seq_to_clusterid[seq] = cluster_id

temp_components.clear()
self.post_clustering()

def post_clustering(self):
registers2 = defaultdict(lambda: None)
local_components = defaultdict(set)
covered_seqs = set()

def find(x):
l = registers2[x]
if l is not None:
l = find(l)
registers2[x] = l
return l
return x

def union(x, y):
lx, ly = find(x), find(y)
if lx != ly:
registers2[lx] = ly

for i in range(len(self.source2)):
union(self.source2.pop(), self.target2.pop())

for x in registers2:
local_components[find(x)].add(x)

self.components = dict(self.components)

covered_clusters = set()

for cluster2_id, (k, v) in enumerate(local_components.items(), 1):

for seq in v:
covered_seqs.add(seq)

for seq in v:
if seq in self.seq_to_clusterid:
cluster_id = self.seq_to_clusterid[seq]
to_be_added = set()

for i in v:
if i not in self.seq_to_clusterid:
to_be_added.add(i)

self.components[cluster_id] = self.components[cluster_id].union(
to_be_added)
covered_clusters.add(k)
continue

self.uncovered_seqs = self.uncovered_seqs - covered_seqs
uncovered_clusters = set(local_components.keys()) - covered_clusters
max_id = len(self.components)
for i, unc in enumerate(uncovered_clusters, 1):
max_id += 1
self.components[max_id] = local_components[unc]

for seq in self.uncovered_seqs:
max_id += 1
self.components[max_id] = {seq}

def export_kCluster(self):
kCluster_file_name = f"kSpider_{self.cut_off_threshold:.2f}%_"
kCluster_file_name += os.path.basename(
self.pairwise_file).split(".")[0]
kCluster_file_name += ".clusters.tsv"

with open(kCluster_file_name, 'w') as kClusters:
kClusters.write("kClust_id\tseqs_ids\n")
for cluster_id, (k, v) in enumerate(self.components.items(), 1):
kClusters.write(
f"{cluster_id}\t{'|'.join(self.ids_to_names(v))}\n")

self.Logger.INFO(f"Total Number Of Clusters: {cluster_id}")
if self.dist_type == "ani":
with open(self.index_prefix + "_kSpider_pairwise.ani_col.tsv") as ani_col_file:
for row in pairwise_tsv:
row = row.strip().split('\t')
seq1 = int(row[0]) - 1
seq2 = int(row[1]) - 1
distance = float(next(ani_col_file).strip()) * 100

# don't make graph edge
if distance < self.cut_off_threshold:
continue

if batch_counter < self.edges_batch_number:
batch_counter += 1
edges_tuples.append((seq1, seq2, distance))
else:
self.graph.add_edges_from(edges_tuples)
batch_counter = 0
edges_tuples.clear()

else:
if len(edges_tuples):
self.graph.add_edges_from(edges_tuples)
else:
for row in pairwise_tsv:
row = row.strip().split('\t')
seq1 = int(row[0]) - 1
seq2 = int(row[1]) - 1
distance = float(row[self.dist_col]) * 100

# don't make graph edge
if distance < self.cut_off_threshold:
continue

if batch_counter < self.edges_batch_number:
batch_counter += 1
edges_tuples.append((seq1, seq2, distance))
else:
self.graph.add_edges_from(edges_tuples)
batch_counter = 0
edges_tuples.clear()

else:
if len(edges_tuples):
self.graph.add_edges_from(edges_tuples)

def cluster_graph(self):
self.connected_components = rx.connected_components(self.graph)
self.Logger.INFO(
f"number of clusters: {len(self.connected_components)}")
single_components = 0
retworkx_export = self.index_prefix + \
f"_kSpider_graph_{self.cut_off_threshold}%.json"
# and {self.output} ...")
self.Logger.INFO(f"writing {retworkx_export}")
# rx.node_link_json(self.graph, path = retworkx_export)
with open(self.output, 'w') as CLUSTERS:
for component in self.connected_components:
# uncomment to exclude single genome clusters from exporting
# if len(component) == 1:
# single_components += 1
# continue
named_component = [self.names_map[node + 1]
for node in component]
CLUSTERS.write(','.join(named_component) + '\n')


"""
@@ -211,15 +147,14 @@ def export_kCluster(self):
@cli.command(name="cluster", help_priority=4)
@click.option('-c', '--cutoff', required=False, type=click.FloatRange(0, 1, clamp=False), default=0.0, show_default=True, help="cluster sequences with (containment > cutoff)")
@click.option('-i', '--index-prefix', "index_prefix", required=True, type=click.STRING, help="Index file prefix")
@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_containment', 'avg_containment', 'max_containment', 'ani']")
@click.pass_context
def main(ctx, index_prefix, cutoff):
def main(ctx, index_prefix, cutoff, distance_type):
"""Sequence clustering."""

kCl = kClusters(logger_obj=ctx.obj,
index_prefix=index_prefix, cut_off_threshold=cutoff)
cutoff = float(cutoff) * 100
kCl = Clusters(logger_obj=ctx.obj, index_prefix=index_prefix,
cut_off_threshold=cutoff, dist_type=distance_type)
ctx.obj.INFO("Building the main graph...")
kCl.tsv_build_graph()
kCl.construct_graph()
ctx.obj.INFO("Clustering...")
kCl.clustering()
ctx.obj.INFO("Exporting ...")
kCl.export_kCluster()
kCl.cluster_graph()
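The ks_clustering.py rewrite replaces the hand-rolled union-find (the recursive find/union over the source/target lists) and its post_clustering pass with a rustworkx graph: sequences become nodes, an edge is added whenever the selected distance column (one of the distance_to_col keys "min_cont", "avg_cont", "max_cont", "ani") passes the cutoff, and clusters are the graph's connected components. A minimal self-contained sketch of that pattern, with illustrative names and distances (assumes rustworkx is installed):

import rustworkx as rx

# Toy inputs: 1-based sequence ids, their names, and pairwise distances
# already scaled to percentages, as construct_graph does with * 100.
names_map = {1: "seqA", 2: "seqB", 3: "seqC", 4: "seqD"}
pairwise = [(1, 2, 97.5), (2, 3, 96.2), (1, 4, 40.0)]
cutoff = 95.0

graph = rx.PyGraph()
graph.add_nodes_from(list(names_map.keys()))  # node index i holds id i + 1

# Keep only pairs at or above the cutoff; shift ids to 0-based node
# indices the same way the PR does with seq1 = int(row[0]) - 1.
graph.add_edges_from([(s - 1, t - 1, d) for s, t, d in pairwise if d >= cutoff])

for component in rx.connected_components(graph):
    print(",".join(names_map[node + 1] for node in component))
# Prints two clusters: seqA,seqB,seqC and the singleton seqD.

Note the units: main() multiplies the 0-1 CLI cutoff by 100 before constructing Clusters, and construct_graph multiplies each parsed distance by 100, so the comparison is percentage against percentage.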
23 changes: 13 additions & 10 deletions pykSpider/kSpider2/ks_dataset_indexing.py
@@ -9,9 +9,9 @@
from glob import glob


@cli.command(name="index", help_priority=2)
@click.option('--dir', "sketches_dir", required = True, help="Sketches directory (must contain only the sketches)")
@click.option('-k', '--kmer-size', "kSize", required=False, default = 0, type=click.INT, help="kmer size (only if using --sourmash)")
@cli.command(name="index", help_priority=2)
@click.option('--dir', "sketches_dir", required=True, help="Sketches directory (must contain only the sketches)")
@click.option('-k', '--kmer-size', "kSize", required=False, default=0, type=click.INT, help="kmer size (only if using --sourmash)")
@click.option('--sourmash', "sourmash", is_flag=True, show_default=True, default=False, help="use sourmash sigs instead of kProcessor")
@click.pass_context
def main(ctx, sketches_dir, sourmash, kSize):
@@ -20,20 +20,23 @@ def main(ctx, sketches_dir, sourmash, kSize):
"""
if not os.path.exists(sketches_dir):
ctx.obj.ERROR(f"{sketches_dir} does not exist!")

if sourmash:
if not kSize:
ctx.obj.ERROR(f"must select kSize when using --sourmash")
ctx.obj.INFO(f"Indexing sourmash sigs in {sketches_dir} with kSize={kSize}.")
ctx.obj.INFO(
f"Indexing sourmash sigs in {sketches_dir} with kSize={kSize}.")
kSpider_internal.sourmash_sigs_indexing(sketches_dir, kSize)
ctx.obj.SUCCESS("DONE!")

else:
else:
all_extra = list(glob(f"{sketches_dir}/*extra"))
all_sketches_phmap = glob(f"{sketches_dir}/*phmap")
all_sketches_mqf = glob(f"{sketches_dir}/*mqf")
all_sketches_phmap = glob(f"{sketches_dir}/*phmap")
all_sketches_mqf = glob(f"{sketches_dir}/*mqf")

if len(all_extra) != (len(all_sketches_phmap) + len(all_sketches_mqf)):
ctx.obj.ERROR(f"Inconsistent sketches files.")

ctx.obj.INFO(f"Indexing sketches in {sketches_dir}.")
kSpider_internal.index_datasets(sketches_dir)
ctx.obj.SUCCESS("DONE!")
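The reworked index command now has two paths: with --sourmash it requires -k and calls kSpider_internal.sourmash_sigs_indexing, and otherwise it checks that the number of *extra files matches the number of *phmap plus *mqf sketches before calling kSpider_internal.index_datasets. A hedged invocation sketch using click's test runner (assumes the package is importable as in the diff's own imports; the sketches directory and k-mer size are illustrative):

from click.testing import CliRunner
from kSpider2.click_context import cli

runner = CliRunner()
# Index sourmash signatures at k=21 from a hypothetical directory.
result = runner.invoke(cli, ["index", "--dir", "sketches/", "--sourmash", "-k", "21"])
print(result.exit_code, result.output)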