Build wheels for MACOS #30

Open

wants to merge 50 commits into base: master
50 commits
3ec6340
validate bin with sig
mr-eyes Jul 26, 2022
cbd4582
bins indexing implemented
mr-eyes Jul 26, 2022
d5de079
pairwise minor modifications
mr-eyes Jul 26, 2022
7905a48
dumping modes
mr-eyes Jul 26, 2022
0323e03
minor fix
mr-eyes Jul 26, 2022
b2d1507
:bug: handle gzipped sigs
mr-eyes Jul 26, 2022
c7585a0
sig to bin
mr-eyes Jul 26, 2022
652b434
sigs to bins update
mr-eyes Jul 26, 2022
d75160c
:fix: all parallel
mr-eyes Jul 26, 2022
265052a
refactot
mr-eyes Jul 26, 2022
d86748b
remove print inside parallel loop
mr-eyes Jul 26, 2022
ab314b1
:fix: validate
mr-eyes Jul 26, 2022
841df99
skip converted files
mr-eyes Jul 26, 2022
4b33759
sequential loading
mr-eyes Jul 26, 2022
192d538
legends to phmap
mr-eyes Jul 26, 2022
44121ae
check invalid bins
mr-eyes Jul 28, 2022
a82c719
bins indexing done
mr-eyes Jul 28, 2022
0980f2e
update kProcessor submodule
mr-eyes Jul 28, 2022
d6098eb
filter by abundance
mr-eyes Jul 29, 2022
725afbc
delete kProcessor submodule
mr-eyes Jul 29, 2022
451b7e4
modify kProcessor
mr-eyes Jul 29, 2022
28eb55f
new json parser
mr-eyes Jul 30, 2022
d5b61d9
adapt the new json parser changes
mr-eyes Jul 30, 2022
efb6c10
modify json import
mr-eyes Jul 31, 2022
278e346
update kProcessor
mr-eyes Jul 31, 2022
c929708
Merge branch 'kp2_again' of github.com:dib-lab/kSpider into bins
mr-eyes Jul 31, 2022
806a347
update kProcessor
mr-eyes Jul 31, 2022
97d8f23
update kProcessor
mr-eyes Jul 31, 2022
931d6e5
update kProcessor
mr-eyes Jul 31, 2022
cba0e30
update kProcessor
mr-eyes Jul 31, 2022
e810ccb
update kProcessor
mr-eyes Jul 31, 2022
e46e22f
update kp
mr-eyes Jul 31, 2022
17bdf1a
update CMAKE flags
mr-eyes Jul 31, 2022
949a84a
print colors size
mr-eyes Aug 2, 2022
bbcefdb
more stats
mr-eyes Aug 3, 2022
18ea794
more options
mr-eyes Aug 3, 2022
63616bc
fix
mr-eyes Aug 3, 2022
c33420d
representative sketches
mr-eyes Nov 21, 2022
373240f
reorganize
mr-eyes Dec 15, 2022
e459885
Merge branch 'master' of github.com:dib-lab/kSpider into dev
mr-eyes Dec 28, 2022
a2f0e29
support python 3.11
mr-eyes Dec 29, 2022
357c0b2
use retworkx in clustering
mr-eyes Dec 29, 2022
2bf33c2
use retworkx in clustering
mr-eyes Dec 29, 2022
d9f29d0
other distances
mr-eyes Dec 29, 2022
03ce313
add depends
mr-eyes Dec 29, 2022
efd9fcf
cout edit
mr-eyes Dec 29, 2022
f38981e
support ANI and add some options
mr-eyes Dec 30, 2022
0357fbd
fix serialization bug
mr-eyes Jan 1, 2023
9dd9f71
updates
mr-eyes Jan 1, 2023
1463263
Update cibuildwheel.yml
mr-eyes Jan 1, 2023
2 changes: 1 addition & 1 deletion .github/workflows/cibuildwheel.yml
@@ -26,7 +26,7 @@ jobs:
env:
CIBW_ENVIRONMENT: "BRANCH_NAME=$(cat branch_name.tmp)"
CIBW_BUILD_VERBOSITY: 3
CIBW_BUILD: cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64
CIBW_BUILD: cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64 cp311-manylinux_x86_64 cp36-macosx_x86_64 cp37-macosx_x86_64 cp38-macosx_x86_64 cp38-macosx_universal2 cp38-macosx_arm64 cp39-macosx_x86_64 cp39-macosx_universal2 cp39-macosx_arm64
CIBW_SKIP: pp* *-manylinux_{aarch64,ppc64le,s390x}
CIBW_ARCHS_LINUX: x86_64
CIBW_MANYLINUX_X86_64_IMAGE: quay.io/pypa/manylinux2014_x86_64:latest
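For reference, the updated CIBW_BUILD list adds CPython 3.11 manylinux wheels and the new macOS targets: x86_64 builds for cp36 through cp39, plus universal2 and arm64 builds for cp38 and cp39. Each entry is a cibuildwheel build identifier of the form {python tag}-{platform tag}. As a quick local sanity check of which identifiers a given machine can actually consume, the third-party packaging library can enumerate the interpreter's supported wheel tags (a sketch; the output depends on the host, and packaging must be installed):

from packaging import tags

# sys_tags() yields tags from most to least preferred; print the first
# macOS or manylinux platform tag this interpreter would accept.
for tag in tags.sys_tags():
    if "macosx" in tag.platform or "manylinux" in tag.platform:
        print(f"{tag.interpreter}-{tag.abi}-{tag.platform}")
        break

On an Apple Silicon CPython 3.9, for instance, this prints something like cp39-cp39-macosx_11_0_arm64, which the new cp39-macosx_arm64 identifier covers.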
2 changes: 1 addition & 1 deletion kSpider_version.py
@@ -52,5 +52,5 @@ def get_version():
else:
version_tag = dev_version

# print(f"[DEBUG VERSION] = {version_tag}")
print(f"[DEBUG VERSION] = {version_tag}")
return version_tag
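The kSpider_version.py hunk re-enables the version debug print unconditionally. If that line is only wanted while debugging CI builds, one hypothetical alternative (not part of this PR) is to gate it behind an environment variable rather than commenting it in and out:

import os

def report_version(version_tag: str) -> str:
    # Hypothetical helper: emit the debug line only when KSPIDER_DEBUG
    # is set, e.g. KSPIDER_DEBUG=1 python setup.py --version
    if os.environ.get("KSPIDER_DEBUG"):
        print(f"[DEBUG VERSION] = {version_tag}")
    return version_tag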
271 changes: 103 additions & 168 deletions pykSpider/kSpider2/ks_clustering.py
@@ -1,37 +1,50 @@
from __future__ import division
from collections import defaultdict
import itertools
import sys
import os
import sqlite3
import click
from kSpider2.click_context import cli
import glob
import rustworkx as rx
from tqdm import tqdm


class kClusters:
class Clusters:

distance_to_col = {
"min_cont": 3,
"avg_cont": 4,
"max_cont": 5,
"ani": 6,
}

source = []
target = []
source2 = []
target2 = []
seq_to_kmers = dict()
names_map = dict()
components = defaultdict(set)

def __init__(self, logger_obj, index_prefix, cut_off_threshold):
def __init__(self, logger_obj, index_prefix, cut_off_threshold, dist_type):
self.index_prefix = index_prefix
self.Logger = logger_obj
self.dist_type = dist_type
self.edges_batch_number = 10_000_000
self.names_file = index_prefix + ".namesMap"
self.cut_off_threshold = cut_off_threshold
self.seqToKmers_file = index_prefix + "_kSpider_seqToKmersNo.tsv"
self.pairwise_file = index_prefix + "_kSpider_pairwise.tsv"
self.uncovered_seqs = set()
self.output = index_prefix + \
f"_kSpider_clusters_{cut_off_threshold}%.tsv"
self.shared_kmers_threshold = 200
self.seq_to_clusterid = dict()
self.max_cluster_id = 0
self.Logger.INFO("Loading TSV pairwise file")
self.load_seq_to_kmers(self.seqToKmers_file)
self.tsv_get_namesmap()
if dist_type not in self.distance_to_col:
logger_obj.ERROR("unknown distance!")
self.dist_col = self.distance_to_col[dist_type]

if dist_type == "ani":
if not os.path.exists(self.index_prefix + "_kSpider_pairwise.ani_col.tsv"):
logger_obj.ERROR(f"ANI was selected, but the ani file {self.index_prefix}_kSpider_pairwise.ani_col.tsv was not found!")

self.graph = rx.PyGraph()
self.nodes_indeces = self.graph.add_nodes_from(
list(self.names_map.keys()))

def load_seq_to_kmers(self, tsv):
with open(tsv) as KMER_COUNT:
@@ -40,162 +53,85 @@ def load_seq_to_kmers(self, tsv):
seq_ID, no_of_kmers = tuple(line.strip().split('\t')[1:])
self.seq_to_kmers[int(seq_ID)] = int(no_of_kmers)

def ids_to_names(self, cluster):
new_cluster = []
for _id in cluster:
new_cluster.append(self.names_map[int(_id)])

return new_cluster

def tsv_get_namesmap(self):
with open(self.names_file, 'r') as namesMap:
next(namesMap) # skip the header
for row in namesMap:
row = row.strip().split()
self.names_map[int(row[0])] = row[1]

def tsv_build_graph(self):
def construct_graph(self):
batch_counter = 0
edges_tuples = []

print("[i] constructing graph")
with open(self.pairwise_file, 'r') as pairwise_tsv:
next(pairwise_tsv) # skip header

for row in pairwise_tsv:
row = row.strip().split()
seq1 = int(row[1])
seq2 = int(row[2])
shared_kmers = int(row[3])
containment = 0.0

min_seq = float(
min(self.seq_to_kmers[seq1], self.seq_to_kmers[seq2]))
containment = shared_kmers / min_seq

if containment < self.cut_off_threshold:
continue

if shared_kmers < self.shared_kmers_threshold:
self.source2.append(seq1)
self.target2.append(seq2)

elif shared_kmers >= self.shared_kmers_threshold:
self.source.append(seq1)
self.target.append(seq2)

# # For covering clusters with single sequence
uncovered_seqs_1 = set(self.names_map.keys()) - \
set(self.source).union(set(self.target))
for seq in uncovered_seqs_1:
self.uncovered_seqs.add(seq)

# OR:
# for i in range(1, len(self.names_map) + 1, 1):
# self.source.append(i)
# self.target.append(i)

def clustering(self):
registers = defaultdict(lambda: None)

def find(x):
l = registers[x]
if l is not None:
l = find(l)
registers[x] = l
return l
return x

def union(x, y):
lx, ly = find(x), find(y)
if lx != ly:
registers[lx] = ly

for i in range(len(self.source)):
union(self.source.pop(), self.target.pop())

for x in registers:
self.components[find(x)].add(x)

temp_components = self.components.copy()
self.components.clear()

for cluster_id, (k, v) in enumerate(temp_components.items(), 1):
self.components[cluster_id] = set(v)
for seq in v:
self.seq_to_clusterid[seq] = cluster_id

temp_components.clear()
self.post_clustering()

def post_clustering(self):
registers2 = defaultdict(lambda: None)
local_components = defaultdict(set)
covered_seqs = set()

def find(x):
l = registers2[x]
if l is not None:
l = find(l)
registers2[x] = l
return l
return x

def union(x, y):
lx, ly = find(x), find(y)
if lx != ly:
registers2[lx] = ly

for i in range(len(self.source2)):
union(self.source2.pop(), self.target2.pop())

for x in registers2:
local_components[find(x)].add(x)

self.components = dict(self.components)

covered_clusters = set()

for cluster2_id, (k, v) in enumerate(local_components.items(), 1):

for seq in v:
covered_seqs.add(seq)

for seq in v:
if seq in self.seq_to_clusterid:
cluster_id = self.seq_to_clusterid[seq]
to_be_added = set()

for i in v:
if i not in self.seq_to_clusterid:
to_be_added.add(i)

self.components[cluster_id] = self.components[cluster_id].union(
to_be_added)
covered_clusters.add(k)
continue

self.uncovered_seqs = self.uncovered_seqs - covered_seqs
uncovered_clusters = set(local_components.keys()) - covered_clusters
max_id = len(self.components)
for i, unc in enumerate(uncovered_clusters, 1):
max_id += 1
self.components[max_id] = local_components[unc]

for seq in self.uncovered_seqs:
max_id += 1
self.components[max_id] = {seq}

def export_kCluster(self):
kCluster_file_name = f"kSpider_{self.cut_off_threshold:.2f}%_"
kCluster_file_name += os.path.basename(
self.pairwise_file).split(".")[0]
kCluster_file_name += ".clusters.tsv"

with open(kCluster_file_name, 'w') as kClusters:
kClusters.write("kClust_id\tseqs_ids\n")
for cluster_id, (k, v) in enumerate(self.components.items(), 1):
kClusters.write(
f"{cluster_id}\t{'|'.join(self.ids_to_names(v))}\n")

self.Logger.INFO(f"Total Number Of Clusters: {cluster_id}")
if self.dist_type == "ani":
with open(self.index_prefix + "_kSpider_pairwise.ani_col.tsv") as ani_col_file:
for row in pairwise_tsv:
row = row.strip().split('\t')
seq1 = int(row[0]) - 1
seq2 = int(row[1]) - 1
distance = float(next(ani_col_file).strip()) * 100

# don't make graph edge
if distance < self.cut_off_threshold:
continue

if batch_counter < self.edges_batch_number:
batch_counter += 1
edges_tuples.append((seq1, seq2, distance))
else:
self.graph.add_edges_from(edges_tuples)
batch_counter = 0
edges_tuples.clear()

else:
if len(edges_tuples):
self.graph.add_edges_from(edges_tuples)
else:
for row in pairwise_tsv:
row = row.strip().split('\t')
seq1 = int(row[0]) - 1
seq2 = int(row[1]) - 1
distance = float(row[self.dist_col]) * 100

# don't make graph edge
if distance < self.cut_off_threshold:
continue

if batch_counter < self.edges_batch_number:
batch_counter += 1
edges_tuples.append((seq1, seq2, distance))
else:
self.graph.add_edges_from(edges_tuples)
batch_counter = 0
edges_tuples.clear()

else:
if len(edges_tuples):
self.graph.add_edges_from(edges_tuples)

def cluster_graph(self):
self.connected_components = rx.connected_components(self.graph)
self.Logger.INFO(
f"number of clusters: {len(self.connected_components)}")
single_components = 0
retworkx_export = self.index_prefix + \
f"_kSpider_graph_{self.cut_off_threshold}%.json"
# and {self.output} ...")
self.Logger.INFO(f"writing {retworkx_export}")
# rx.node_link_json(self.graph, path = retworkx_export)
with open(self.output, 'w') as CLUSTERS:
for component in self.connected_components:
# uncomment to exclude single genome clusters from exporting
# if len(component) == 1:
# single_components += 1
# continue
named_component = [self.names_map[node + 1]
for node in component]
CLUSTERS.write(','.join(named_component) + '\n')


"""
@@ -211,15 +147,14 @@ def export_kCluster(self):
@cli.command(name="cluster", help_priority=4)
@click.option('-c', '--cutoff', required=False, type=click.FloatRange(0, 1, clamp=False), default=0.0, show_default=True, help="cluster sequences with (containment > cutoff)")
@click.option('-i', '--index-prefix', "index_prefix", required=True, type=click.STRING, help="Index file prefix")
@click.option('-d', '--dist-type', "distance_type", required=False, default="max_cont", show_default=True, type=click.STRING, help="select from ['min_containment', 'avg_containment', 'max_containment', 'ani']")
@click.pass_context
def main(ctx, index_prefix, cutoff):
def main(ctx, index_prefix, cutoff, distance_type):
"""Sequence clustering."""

kCl = kClusters(logger_obj=ctx.obj,
index_prefix=index_prefix, cut_off_threshold=cutoff)
cutoff = float(cutoff) * 100
kCl = Clusters(logger_obj=ctx.obj, index_prefix=index_prefix,
cut_off_threshold=cutoff, dist_type=distance_type)
ctx.obj.INFO("Building the main graph...")
kCl.tsv_build_graph()
kCl.construct_graph()
ctx.obj.INFO("Clustering...")
kCl.clustering()
ctx.obj.INFO("Exporting ...")
kCl.export_kCluster()
kCl.cluster_graph()
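The ks_clustering.py rewrite replaces the hand-rolled union-find (the recursive find/union over the source/target lists) and its post_clustering pass with a rustworkx graph: sequences become nodes, an edge is added whenever the selected distance column (one of the distance_to_col keys "min_cont", "avg_cont", "max_cont", "ani") passes the cutoff, and clusters are the graph's connected components. A minimal self-contained sketch of that pattern, with illustrative names and distances (assumes rustworkx is installed):

import rustworkx as rx

# Toy inputs: 1-based sequence ids, their names, and pairwise distances
# already scaled to percentages, as construct_graph does with * 100.
names_map = {1: "seqA", 2: "seqB", 3: "seqC", 4: "seqD"}
pairwise = [(1, 2, 97.5), (2, 3, 96.2), (1, 4, 40.0)]
cutoff = 95.0

graph = rx.PyGraph()
graph.add_nodes_from(list(names_map.keys()))  # node index i holds id i + 1

# Keep only pairs at or above the cutoff; shift ids to 0-based node
# indices the same way the PR does with seq1 = int(row[0]) - 1.
graph.add_edges_from([(s - 1, t - 1, d) for s, t, d in pairwise if d >= cutoff])

for component in rx.connected_components(graph):
    print(",".join(names_map[node + 1] for node in component))
# Prints two clusters: seqA,seqB,seqC and the singleton seqD.

Note the units: main() multiplies the 0-1 CLI cutoff by 100 before constructing Clusters, and construct_graph multiplies each parsed distance by 100, so the comparison is percentage against percentage.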
23 changes: 13 additions & 10 deletions pykSpider/kSpider2/ks_dataset_indexing.py
@@ -9,9 +9,9 @@
from glob import glob


@cli.command(name="index", help_priority=2)
@click.option('--dir', "sketches_dir", required = True, help="Sketches directory (must contain only the sketches)")
@click.option('-k', '--kmer-size', "kSize", required=False, default = 0, type=click.INT, help="kmer size (only if using --sourmash)")
@cli.command(name="index", help_priority=2)
@click.option('--dir', "sketches_dir", required=True, help="Sketches directory (must contain only the sketches)")
@click.option('-k', '--kmer-size', "kSize", required=False, default=0, type=click.INT, help="kmer size (only if using --sourmash)")
@click.option('--sourmash', "sourmash", is_flag=True, show_default=True, default=False, help="use sourmash sigs instead of kProcessor")
@click.pass_context
def main(ctx, sketches_dir, sourmash, kSize):
@@ -20,20 +20,23 @@ def main(ctx, sketches_dir, sourmash, kSize):
"""
if not os.path.exists(sketches_dir):
ctx.obj.ERROR(f"{sketches_dir} does not exist!")

if sourmash:
if not kSize:
ctx.obj.ERROR(f"must select kSize when using --sourmash")
ctx.obj.INFO(f"Indexing sourmash sigs in {sketches_dir} with kSize={kSize}.")
ctx.obj.INFO(
f"Indexing sourmash sigs in {sketches_dir} with kSize={kSize}.")
kSpider_internal.sourmash_sigs_indexing(sketches_dir, kSize)
ctx.obj.SUCCESS("DONE!")

else:
else:
all_extra = list(glob(f"{sketches_dir}/*extra"))
all_sketches_phmap = glob(f"{sketches_dir}/*phmap")
all_sketches_mqf = glob(f"{sketches_dir}/*mqf")
all_sketches_phmap = glob(f"{sketches_dir}/*phmap")
all_sketches_mqf = glob(f"{sketches_dir}/*mqf")

if len(all_extra) != (len(all_sketches_phmap) + len(all_sketches_mqf)):
ctx.obj.ERROR(f"Inconsistent sketches files.")

ctx.obj.INFO(f"Indexing sketches in {sketches_dir}.")
kSpider_internal.index_datasets(sketches_dir)
ctx.obj.SUCCESS("DONE!")
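The reworked index command now has two paths: with --sourmash it requires -k and calls kSpider_internal.sourmash_sigs_indexing, and otherwise it checks that the number of *extra files matches the number of *phmap plus *mqf sketches before calling kSpider_internal.index_datasets. A hedged invocation sketch using click's test runner (assumes the package is importable as in the diff's own imports; the sketches directory and k-mer size are illustrative):

from click.testing import CliRunner
from kSpider2.click_context import cli

runner = CliRunner()
# Index sourmash signatures at k=21 from a hypothetical directory.
result = runner.invoke(cli, ["index", "--dir", "sketches/", "--sourmash", "-k", "21"])
print(result.exit_code, result.output)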