From f68dd840f043d712c9282cd3c7b656b51aa0a043 Mon Sep 17 00:00:00 2001
From: Yuxuan Du <55218554+dyxstat@users.noreply.github.com>
Date: Sat, 5 Mar 2022 14:17:01 -0800
Subject: [PATCH] Add files via upload

---
 bin.py                 |  78 ++++++++
 construct_graph.py     | 398 +++++++++++++++++++++++++++++++++++++++++
 exceptions.py          |  52 ++++++
 utils.py               | 109 +++++++++++
 viralcc.py             | 163 +++++++++++++++++
 viralcc_linux_env.yaml | 136 ++++++++++++++
 viralcc_osx_env.yaml   | 107 +++++++++++
 7 files changed, 1043 insertions(+)
 create mode 100644 bin.py
 create mode 100644 construct_graph.py
 create mode 100644 exceptions.py
 create mode 100644 utils.py
 create mode 100644 viralcc.py
 create mode 100644 viralcc_linux_env.yaml
 create mode 100644 viralcc_osx_env.yaml

diff --git a/bin.py b/bin.py
new file mode 100644
index 0000000..70ab57d
--- /dev/null
+++ b/bin.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import numpy as np
+import scipy.sparse as scisp
+import logging
+import igraph as ig
+import leidenalg
+from sklearn.metrics import silhouette_score
+import os
+
+
+# package logger
+logger = logging.getLogger(__name__)
+
+
+class ClusterBin:
+    def __init__(self, path, viral_info, map_combine, random_seed=42):
+        '''
+        path: output directory for the clustering result
+        viral_info: list of SeqInfo records for the viral contigs
+        map_combine: integrative graph combining Hi-C interactions and host proximity
+        random_seed: seed for the Leiden algorithm
+        '''
+
+        # names of the viral contigs, in contact-map order
+        self.path = path
+        self.viral_name = []
+        for i in range(len(viral_info)):
+            temp = viral_info[i]
+            self.viral_name.append(temp.name)
+
+        ######### use the Leiden algorithm to do the clustering #########
+        self.map_combine = scisp.coo_matrix(map_combine)
+
+        vcount = self.map_combine.shape[0]
+        sources = self.map_combine.row
+        targets = self.map_combine.col
+        index = sources > targets
+        sources = sources[index]
+        targets = targets[index]
+        edgelist = list(zip(sources, targets))
+        g = ig.Graph(vcount, edgelist)
+
+        # sweep the resolution parameter; keep the partition with the best silhouette score
+        SIL_score = []
+        cluster_range = np.arange(2, 50)
+        for n_clusters in cluster_range:
+            part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=n_clusters, n_iterations=-1, seed=random_seed)
+            part = list(part)
+
+            label_pred = np.ones(self.map_combine.shape[0])
+            for ci in range(len(part)):
+                for contig in part[ci]:
+                    label_pred[contig] = ci
+
+            SIL_score.append(silhouette_score(np.asarray(self.map_combine.todense()), np.array(label_pred)))
+
+        optimal = SIL_score.index(max(SIL_score))
+        part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=cluster_range[optimal], n_iterations=-1, seed=random_seed)
+        part = list(part)
+        logger.info('The number of generated viral bins is {}'.format(len(part)))
+
+        self.dist_cluster = {}
+        for ci in range(len(part)):
+            for id in part[ci]:
+                self.dist_cluster[self.viral_name[id]] = 'group' + str(ci)
+
+        self._write_cluster()
+
+    def _write_cluster(self):
+        # write the contig-to-bin assignment as a tab-separated file
+        with open(os.path.join(self.path, 'cluster_viral_contig.txt'), 'w') as out:
+            for key, value in self.dist_cluster.items():
+                out.write(str(key) + '\t' + str(value))
+                out.write('\n')
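Note on the resolution sweep in bin.py: the silhouette score is only defined for partitions with between 2 and n-1 clusters, so very high resolution values can make the sweep fail on small inputs. The same selection rule can be written as a standalone helper that skips degenerate partitions; a minimal sketch (best_partition is an illustrative helper, not part of this patch):

    import numpy as np
    import igraph as ig
    import leidenalg
    from sklearn.metrics import silhouette_score

    def best_partition(adj, resolutions=range(2, 50), seed=42):
        # build an undirected graph from the upper triangle of the adjacency matrix
        adj = np.asarray(adj)
        src, dst = np.nonzero(np.triu(adj, k=1))
        g = ig.Graph(adj.shape[0], list(zip(src.tolist(), dst.tolist())))
        best, best_score = None, float('-inf')
        for r in resolutions:
            part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition,
                                            resolution_parameter=r, n_iterations=-1, seed=seed)
            labels = np.asarray(part.membership)
            # silhouette is undefined for <2 or >n-1 clusters; skip degenerate partitions
            if not 2 <= len(set(labels)) <= adj.shape[0] - 1:
                continue
            score = silhouette_score(adj, labels)
            if score > best_score:
                best, best_score = list(part), score
        return best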
diff --git a/construct_graph.py b/construct_graph.py
new file mode 100644
index 0000000..36c13ba
--- /dev/null
+++ b/construct_graph.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from collections import OrderedDict, namedtuple
+import Bio.SeqIO as SeqIO
+from Bio.SeqUtils import GC
+import pandas as pd
+import numpy as np
+import pysam
+import scipy.sparse as scisp
+import tqdm
+import os
+from utils import count_fasta_sequences, open_input
+import logging
+
+# package logger
+logger = logging.getLogger(__name__)
+
+# offset is the cumulative length of the preceding accepted contigs
+# localid is the index within the filtered contig list
+# refid is the index within the fasta file, i.e. a global index
+SeqInfo = namedtuple('SeqInfo', ['localid', 'refid', 'name', 'length', 'GC'])
+
+
+class Sparse2DAccumulator(object):
+    # accumulate integer counts in a dict keyed by (i, j); export as a 2D COO sparse matrix
+    def __init__(self, N):
+        self.shape = (N, N)
+        self.mat = {}
+        # fixed counting type
+        self.dtype = np.uint32
+
+    def setitem(self, index, value):
+        assert len(index) == 2 and index[0] >= 0 and index[1] >= 0, 'invalid index: {}'.format(index)
+        assert isinstance(value, (int, np.integer)), 'values must be integers'
+        self.mat[index] = value
+
+    def getitem(self, index):
+        if index in self.mat:
+            return self.mat[index]
+        else:
+            return 0
+
+    def get_coo(self, symm=True):
+        """
+        Create a COO format sparse representation of the accumulated values.
+
+        :param symm: ensure matrix is symmetric on return
+        :return: a scipy.sparse.coo_matrix
+        """
+        coords = [[], []]
+        data = []
+        m = self.mat
+        for i, j in m.keys():  # each key is an (i, j) tuple
+            coords[0].append(i)
+            coords[1].append(j)
+            data.append(m[i, j])
+
+        m = scisp.coo_matrix((data, coords), shape=self.shape, dtype=self.dtype)
+
+        if symm:
+            m += scisp.tril(m.T, k=-1)
+
+        return m.tocoo()
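Sparse2DAccumulator keeps only upper-triangle entries and mirrors them on export, which is how _bin_map below stores contact counts. A minimal usage sketch (standalone, using only the class defined in this file):

    from construct_graph import Sparse2DAccumulator

    acc = Sparse2DAccumulator(4)
    for pair in [(0, 2), (0, 2), (1, 3)]:
        ix = tuple(sorted(pair))              # keep a half-matrix: i <= j
        acc.setitem(ix, acc.getitem(ix) + 1)

    m = acc.get_coo(symm=True)                # lower triangle mirrored on export
    print(m.toarray())
    # [[0 0 2 0]
    #  [0 0 0 1]
    #  [2 0 0 0]
    #  [0 1 0 0]]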
+class ContactMatrix:
+
+    def __init__(self, bam_file, seq_file, viral_file, path, min_mapq=30, min_len=1000, min_match=30, min_k=5):
+
+        ######## input parameters ########
+        # bam_file: alignment of the Hi-C library against the contigs, in BAM format
+        # seq_file: assembled contigs in FASTA format
+        # viral_file: labels of the viral contigs
+        # path: output directory
+        # min_mapq: minimal mapping quality (default 30)
+        # min_len: minimal contig length (default 1000 bp)
+        # min_match: minimal number of matched bases at the alignment start (default 30)
+        # min_k: lower bound of k for the host proximity graph (default 5)
+
+        self.bam_file = bam_file
+        self.seq_file = seq_file
+        self.viral_file = viral_file
+        self.path = path
+        self.min_mapq = min_mapq
+        self.min_len = min_len
+        self.min_match = min_match
+
+        # fasta_info stores the information parsed from the fasta file
+        # seq_info stores the contig information from the bam file
+        # seq_map stores the raw Hi-C contact map
+        self.fasta_info = {}
+        self.seq_info = []
+        self.seq_map = None
+        self.map_viral = None
+        self.map_row = None
+        self.viral_same_host = None
+        self.map_combine = None
+        self.viral_info = []
+        self.prokaryotic_info = []
+
+        # total_reads: number of alignments in the bam file
+        self.total_reads = None
+
+        logger.info('Reading fasta file...')
+        with open_input(seq_file) as multi_fasta:
+            # fasta_info is an intermediate for seq_info, which holds the final records
+            # get an estimate of the number of sequences for the progress bar
+            fasta_count = count_fasta_sequences(seq_file)
+            for seqrec in tqdm.tqdm(SeqIO.parse(multi_fasta, 'fasta'), total=fasta_count, desc='Analyzing both prokaryotic and viral contigs in reference fasta'):
+                if len(seqrec) < min_len:
+                    continue
+                self.fasta_info[seqrec.id] = {'length': len(seqrec), 'GC': GC(seqrec.seq)}
+
+        logger.debug('There are {} contigs in total in the reference fasta'.format(fasta_count))
+
+        # now parse the header information from the bam file;
+        # seq_info holds the global contig information and is not changed after creation
+        with pysam.AlignmentFile(bam_file, 'rb') as bam:
+            # check that the BAM file has the correct sort order
+            if 'SO' not in bam.header['HD'] or bam.header['HD']['SO'] != 'queryname':
+                raise IOError('BAM file must be sorted by read name')
+
+            # determine the set of active sequences,
+            # where the first filtering step is by length
+            logger.info('Filtering contigs by minimal length ({})...'.format(self.min_len))
+            ref_count = {'seq_missing': 0, 'too_short': 0}
+
+            offset = 0
+            localid = 0
+
+            for n, (rname, rlen) in enumerate(zip(bam.references, bam.lengths)):
+                # minimum length threshold
+                if rlen < min_len:
+                    ref_count['too_short'] += 1
+                    continue
+
+                try:
+                    fa = self.fasta_info[rname]
+                except KeyError:
+                    logger.info('Contig "{}" was not present in reference fasta'.format(rname))
+                    ref_count['seq_missing'] += 1
+                    continue
+
+                assert fa['length'] == rlen, 'Sequence lengths in {} do not agree: bam {} fasta {}'.format(rname, fa['length'], rlen)
+
+                self.seq_info.append(SeqInfo(localid, n, rname, rlen, fa['GC']))
+                localid = localid + 1
+                offset += rlen
+
+            self.total_len = offset
+            self.total_seq = localid
+
+            del self.fasta_info, offset, localid
+
+            if self.total_seq == 0:
+                logger.info('No sequences in BAM found in FASTA')
+                raise ImportError('No sequences in BAM found in FASTA')
+
+            logger.debug('{} contigs are missing and {} contigs are too short'.format(ref_count['seq_missing'], ref_count['too_short']))
+            logger.debug('Accepted {} contigs covering {} bp'.format(self.total_seq, self.total_len))
+
+            logger.info('Counting reads in bam file...')
+            self.total_reads = bam.count(until_eof=True)
+            logger.debug('BAM file contains {0} alignments'.format(self.total_reads))
+
+            logger.info('Handling the alignments...')
+            self._bin_map(bam)
+
+        assert self.seq_map.shape[0] == len(self.seq_info), 'Filter error'
+
+        ######## filter the viral contigs and their contacts ########
+        viral_contig_name = pd.read_csv(viral_file, sep='\t', header=None).values[:, 0]
+
+        viral_contig_index = []
+        prokaryotic_contig_index = []
+        # distinguish the indices of viral contigs and prokaryotic contigs
+        for i in range(len(self.seq_info)):
+            contig_temp = self.seq_info[i]
+            if contig_temp.name in viral_contig_name:
+                viral_contig_index.append(i)
+            else:
+                prokaryotic_contig_index.append(i)
+
+        del contig_temp
+
+        for i, idn in enumerate(viral_contig_index):
+            viral_seq = self.seq_info[idn]
+            assert viral_seq.localid == idn, 'the local index does not match the contact matrix index'
+            self.viral_info.append(SeqInfo(i, viral_seq.refid, viral_seq.name, viral_seq.length, viral_seq.GC))
+
+        for i, idn in enumerate(prokaryotic_contig_index):
+            prokaryotic_seq = self.seq_info[idn]
+            assert prokaryotic_seq.localid == idn, 'the local index does not match the contact matrix index'
+            self.prokaryotic_info.append(SeqInfo(i, prokaryotic_seq.refid, prokaryotic_seq.name, prokaryotic_seq.length, prokaryotic_seq.GC))
+
+        del self.seq_info
+
+        logger.info('There are {} viral contigs'.format(len(viral_contig_index)))
+        logger.info('There are {} prokaryotic contigs'.format(len(prokaryotic_contig_index)))
+        logger.info('Writing information of viral contigs and prokaryotic contigs')
+        self._write_viral_info()
+        self._write_prokaryotic_info()
+
+        # viral-viral submatrix of the Hi-C contact map
+        self.map_viral = self.seq_map.tocsr()
+        self.map_viral = self.map_viral[viral_contig_index, :]
+        self.map_viral = self.map_viral.tocsc()
+        self.map_viral = self.map_viral[:, viral_contig_index]
+        self.map_viral = self.map_viral.todense()
+
+        # viral-prokaryotic submatrix of the Hi-C contact map
+        self.map_row = self.seq_map.tocsr()
+        self.map_row = self.map_row[viral_contig_index, :]
+        self.map_row = self.map_row.tocsc()
+        self.map_row = self.map_row[:, prokaryotic_contig_index]
+        self.map_row = self.map_row.tocsr()
+
+        del self.seq_map
+
+        numV = len(self.viral_info)
+        numE = np.count_nonzero(self.map_viral) / 2
+
+        # increase k until the host proximity graph has no more edges than the Hi-C interaction graph
+        for k in range(min_k, 50):
+            count = 0
+            self.viral_same_host = np.zeros((numV, numV))
+            for i in range(numV):
+                host_i = self.map_row[i, :].tocoo().col
+                if host_i.shape[0] < 1:
+                    continue
+                for j in range(i + 1, numV):
+                    host_j = self.map_row[j, :].tocoo().col
+                    if host_j.shape[0] < 1:
+                        continue
+                    if np.intersect1d(host_i, host_j).shape[0] >= k:
+                        count += 1
+                        self.viral_same_host[i, j] = 1
+                        self.viral_same_host[j, i] = 1
+            if count <= numE:
+                break
+
+        logger.info('The threshold of shared host contigs is {}'.format(k))
+        logger.info('There are {} edges in the host proximity graph'.format(count))
+        logger.info('There are {} edges in the Hi-C interaction graph'.format(numE))
+
+        self.map_combine = np.zeros((numV, numV))
+        map_temp = scisp.coo_matrix(self.map_viral + self.viral_same_host)
+        sources = map_temp.row
+        targets = map_temp.col
+        for i, j in zip(sources, targets):
+            self.map_combine[i, j] = 1
+
+        logger.info('Integrating the Hi-C interaction graph and the host proximity graph')
+        logger.info('Integrative graph construction finished: there are {} edges in the integrative graph'.format(np.count_nonzero(self.map_combine) / 2))
+        del self.map_viral, self.map_row, self.viral_same_host, map_temp
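The loop above selects the smallest shared-host threshold k (starting from min_k) at which the host proximity graph is no denser than the Hi-C interaction graph. The same rule over explicit host sets, as a standalone sketch (host_proximity_edges and pick_threshold are illustrative helpers; hosts is assumed to be a list of sets of prokaryotic contig indices, one set per viral contig):

    from itertools import combinations

    def host_proximity_edges(hosts, k):
        # edges between viruses sharing at least k host contigs
        return [(i, j) for i, j in combinations(range(len(hosts)), 2)
                if len(hosts[i] & hosts[j]) >= k]

    def pick_threshold(hosts, num_hic_edges, min_k=5, max_k=50):
        # smallest k whose proximity graph is no denser than the Hi-C graph
        for k in range(min_k, max_k):
            edges = host_proximity_edges(hosts, k)
            if len(edges) <= num_hic_edges:
                return k, edges
        return max_k - 1, host_proximity_edges(hosts, max_k - 1)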
+    def _bin_map(self, bam):
+        """
+        Accumulate read-pair observations from the supplied BAM file.
+        Maps are initialized here. Logical control is achieved through initialisation of the
+        ContactMatrix instance, rather than through arguments to this function.
+
+        :param bam: this instance's open bam file.
+        """
+        def _simple_match(r):
+            return r.mapping_quality >= _mapq
+
+        def _strong_match(r):
+            if r.mapping_quality < _mapq or r.cigarstring is None:
+                return False
+            cig = r.cigartuples[-1] if r.is_reverse else r.cigartuples[0]
+            return cig[0] == 0 and cig[1] >= self.min_match
+
+        # set up the match call
+        _matcher = _strong_match if self.min_match else _simple_match
+
+        def next_informative(_bam_iter, _pbar):
+            while True:
+                r = next(_bam_iter)
+                _pbar.update()
+                if not r.is_unmapped and not r.is_secondary and not r.is_supplementary:
+                    return r
+
+        _seq_map = Sparse2DAccumulator(self.total_seq)
+
+        with tqdm.tqdm(total=self.total_reads) as pbar:
+
+            # locals for read filtering
+            _mapq = self.min_mapq
+
+            _idx = self.make_reverse_index('refid')  # from global index to local index
+
+            counts = OrderedDict({
+                'accepted pairs': 0,
+                'map_same_contig pairs': 0,
+                'ref_excluded pairs': 0,
+                'poor_match pairs': 0,
+                'single read': 0})
+
+            bam.reset()
+            bam_iter = bam.fetch(until_eof=True)
+            while True:
+                try:
+                    r1 = next_informative(bam_iter, pbar)
+                    while True:
+                        # read records until we get a pair
+                        r2 = next_informative(bam_iter, pbar)
+                        if r1.query_name == r2.query_name:
+                            break
+                        r1 = r2  # no mate found for r1, advance to the next record
+                        counts['single read'] += 1
+
+                except StopIteration:
+                    break
+
+                if r1.reference_id not in _idx or r2.reference_id not in _idx:
+                    counts['ref_excluded pairs'] += 1
+                    continue
+
+                if r1.reference_id == r2.reference_id:
+                    counts['map_same_contig pairs'] += 1
+                    continue
+
+                if not _matcher(r1) or not _matcher(r2):
+                    counts['poor_match pairs'] += 1
+                    continue
+
+                # get internal indices
+                ix1 = _idx[r1.reference_id]
+                ix2 = _idx[r2.reference_id]
+
+                # maintain just a half-matrix
+                if ix2 < ix1:
+                    ix1, ix2 = ix2, ix1
+
+                counts['accepted pairs'] += 1
+                ix = (ix1, ix2)
+                _seq_map.setitem(ix, _seq_map.getitem(ix) + 1)
+
+        self.seq_map = _seq_map.get_coo()
+        del _seq_map, r1, r2, _idx
+
+        logger.debug('Pair accounting: {}'.format(counts))
+
+    def make_reverse_index(self, field_name):
+        """
+        Make a reverse look-up (dict) from the chosen field in seq_info to the internal index value
+        of the given sequence. Non-unique fields will raise an exception.
+
+        :param field_name: the seq_info field to use as the reverse.
+        :return: internal array index of the sequence
+        """
+        rev_idx = {}
+        for n, seq in enumerate(self.seq_info):
+            fv = getattr(seq, field_name)
+            if fv in rev_idx:
+                raise RuntimeError('field contains non-unique entries, a 1-1 mapping cannot be made')
+            rev_idx[fv] = n
+        return rev_idx
+
+    def _write_prokaryotic_info(self):
+        with open(os.path.join(self.path, 'prokaryotic_contig_info.csv'), 'w') as out:
+            for seq in self.prokaryotic_info:
+                out.write(str(seq.name) + ',' + str(seq.length) + ',' + str(seq.GC))
+                out.write('\n')
+
+    def _write_viral_info(self):
+        with open(os.path.join(self.path, 'viral_contig_info.csv'), 'w') as out:
+            for seq in self.viral_info:
+                out.write(str(seq.name) + ',' + str(seq.length) + ',' + str(seq.GC))
+                out.write('\n')
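_bin_map relies purely on query-name adjacency: two consecutive informative records with the same name form a pair, which is why the constructor insists on a name-sorted BAM (e.g. one produced by `samtools sort -n`). The pairing contract in isolation, as a sketch (the BAM path is a placeholder):

    import pysam

    def iter_pairs(bam_path):
        # yield (r1, r2) mate pairs from a name-sorted BAM, skipping unpaired records
        with pysam.AlignmentFile(bam_path, 'rb') as bam:
            informative = (r for r in bam.fetch(until_eof=True)
                           if not r.is_unmapped and not r.is_secondary and not r.is_supplementary)
            r1 = next(informative, None)
            for r2 in informative:
                if r1 is not None and r1.query_name == r2.query_name:
                    yield r1, r2
                    r1 = next(informative, None)
                else:
                    r1 = r2

    for r1, r2 in iter_pairs('hic_namesorted.bam'):
        pass  # accumulate contacts here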
+ """ + import tqdm + + def _simple_match(r): + return r.mapping_quality >= _mapq + + def _strong_match(r): + if r.mapping_quality < _mapq or r.cigarstring is None: + return False + cig = r.cigartuples[-1] if r.is_reverse else r.cigartuples[0] + return cig[0] == 0 and cig[1] >= self.min_match + + # set-up match call + _matcher = _strong_match if self.min_match else _simple_match + + def next_informative(_bam_iter, _pbar): + while True: + r = next(_bam_iter) + _pbar.update() + if not r.is_unmapped and not r.is_secondary and not r.is_supplementary: + return r + + _seq_map = Sparse2DAccumulator(self.total_seq) + + + with tqdm.tqdm(total=self.total_reads) as pbar: + + # locals for read filtering + _mapq = self.min_mapq + + _idx = self.make_reverse_index('refid') #from global index to local index# + _len = bam.lengths + + counts = OrderedDict({ + 'accepted pairs': 0, + 'map_same_contig pairs': 0, + 'ref_excluded pairs': 0, + 'poor_match pairs': 0, + 'single read':0}) + + bam.reset() + bam_iter = bam.fetch(until_eof=True) + self.index1 = 0 + while True: + self.index1 += 1 + try: + r1 = next_informative(bam_iter, pbar) + while True: + # read records until we get a pair + r2 = next_informative(bam_iter, pbar) + if r1.query_name == r2.query_name: + break + r1 = r2 ###if we don't get a pair, next _bam_iter + counts['single read'] += 1 + + except StopIteration: + break + + if r1.reference_id not in _idx or r2.reference_id not in _idx: + counts['ref_excluded pairs'] += 1 + continue + + if r1.reference_id == r2.reference_id: + counts['map_same_contig pairs'] += 1 + continue + + if not _matcher(r1) or not _matcher(r2): + counts['poor_match pairs'] += 1 + continue + + # get internal indices + ix1 = _idx[r1.reference_id] + ix2 = _idx[r2.reference_id] + + # maintain just a half-matrix + if ix2 < ix1: + ix1, ix2 = ix2, ix1 + + counts['accepted pairs'] += 1 + ix = (ix1 , ix2) + if _seq_map.getitem(ix): + temp_value = _seq_map.getitem(ix) + 1 + _seq_map.setitem(ix , temp_value) + else: + _seq_map.setitem(ix , 1) + + self.seq_map = _seq_map.get_coo() + del _seq_map, r1, r2, _idx + + logger.debug('Pair accounting: {}'.format(counts)) + + + def make_reverse_index(self, field_name): + """ + Make a reverse look-up (dict) from the chosen field in seq_info to the internal index value + of the given sequence. Non-unique fields will raise an exception. + + :param field_name: the seq_info field to use as the reverse. 
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..0a4aa0e
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,109 @@
+import gzip
+import os
+import subprocess
+
+
+def make_dir(path, exist_ok=False):
+    """
+    Convenience method for making directories with a standard logic.
+    An exception is raised when the specified path exists and is not a directory.
+    :param path: target path to create
+    :param exist_ok: if True, an existing directory is ok. Existing files will still cause an exception
+    """
+    if not os.path.exists(path):
+        os.mkdir(path)
+    elif os.path.isfile(path):
+        raise IOError('output path already exists and is a file!')
+    elif not exist_ok:
+        raise IOError('output directory already exists!')
+
+
+def open_input(file_name):
+    """
+    Open a text file for reading, transparently decompressing gzip input based on the
+    file suffix. (construct_graph.py imports this helper, so it must exist here.)
+    :param file_name: the file to open
+    :return: an open text-mode file handle
+    """
+    if file_name.endswith('.gz'):
+        return gzip.open(file_name, 'rt')
+    return open(file_name, 'r')
+
+
+def count_fasta_sequences(file_name):
+    """
+    Estimate the number of fasta sequences in a file by counting headers. Decompression is automatically attempted
+    for files ending in .gz. Counting and decompression is by way of subprocess calls to grep and gzip. Uncompressed
+    files are also handled. This is about 8 times faster than parsing a file with BioPython and 6 times faster
+    than reading all lines in Python.
+
+    :param file_name: the fasta file to inspect
+    :return: the estimated number of records
+    """
+    if file_name.endswith('.gz'):
+        proc_uncomp = subprocess.Popen(['gzip', '-cd', file_name], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        proc_read = subprocess.Popen(['grep', r'^>'], stdin=proc_uncomp.stdout, stdout=subprocess.PIPE)
+    else:
+        proc_read = subprocess.Popen(['grep', r'^>', file_name], stdout=subprocess.PIPE)
+
+    n = 0
+    for _ in proc_read.stdout:
+        n += 1
+    return n
+
+
+def gen_bins(fastafile, resultfile, outputdir):
+    # read the fasta file into a dict of header -> sequence
+    sequences = {}
+    if fastafile.endswith("gz"):
+        with gzip.open(fastafile, 'r') as f:
+            for line in f:
+                line = str(line, encoding="utf-8")
+                if line.startswith(">"):
+                    if " " in line:
+                        seq, others = line.split(' ', 1)
+                        sequences[seq] = ""
+                    else:
+                        seq = line.rstrip("\n")
+                        sequences[seq] = ""
+                else:
+                    sequences[seq] += line.rstrip("\n")
+    else:
+        with open(fastafile, 'r') as f:
+            for line in f:
+                if line.startswith(">"):
+                    if " " in line:
+                        seq, others = line.split(' ', 1)
+                        sequences[seq] = ""
+                    else:
+                        seq = line.rstrip("\n")
+                        sequences[seq] = ""
+                else:
+                    sequences[seq] += line.rstrip("\n")
+
+    # read the clustering result: cluster name -> list of contig names
+    dic = {}
+    with open(resultfile, "r") as f:
+        for line in f:
+            contig_name, cluster_name = line.strip().split('\t')
+            try:
+                dic[cluster_name].append(contig_name)
+            except KeyError:
+                dic[cluster_name] = [contig_name]
+
+    print("Writing bins in \t{}".format(outputdir))
+    if not os.path.exists(outputdir):
+        os.makedirs(outputdir)
+
+    # write one fasta file per bin, zero-padding the bin number to four digits
+    bin_name = 0
+    for _, cluster in dic.items():
+        bin = 'VIRAL_BIN' + str(bin_name).zfill(4) + '.fa'
+        binfile = os.path.join(outputdir, "{}".format(bin))
+        with open(binfile, "w") as f:
+            for contig_name in cluster:
+                contig_name = ">" + contig_name
+                try:
+                    sequence = sequences[contig_name]
+                except KeyError:
+                    continue
+                f.write(contig_name + "\n")
+                f.write(sequence + "\n")
+        bin_name += 1
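gen_bins matches contigs to bins using the first whitespace-delimited token of each fasta header, so the names in the cluster file must match that token exactly. A toy end-to-end run (file names and contents are illustrative):

    from utils import gen_bins

    with open('toy.fa', 'w') as f:
        f.write('>contig_1 len=1000\nACGT\n>contig_2\nGGCC\n')
    with open('toy_clusters.txt', 'w') as f:
        f.write('contig_1\tgroup0\ncontig_2\tgroup0\n')

    gen_bins('toy.fa', 'toy_clusters.txt', 'toy_bins')
    # writes toy_bins/VIRAL_BIN0000.fa containing both contigs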
diff --git a/viralcc.py b/viralcc.py
new file mode 100644
index 0000000..9dee0c6
--- /dev/null
+++ b/viralcc.py
@@ -0,0 +1,163 @@
+######## The structure of the main script is modified from bin3C ########
+from construct_graph import ContactMatrix
+from bin import ClusterBin
+from exceptions import ApplicationException
+from utils import make_dir, gen_bins
+import logging
+import sys
+import argparse
+import os
+import time
+import warnings
+
+## ignore deprecation warnings from packages ##
+warnings.filterwarnings("ignore")
+
+__version__ = '1.0.0, released at 03/2022'
+
+if __name__ == '__main__':
+
+    def mk_version():
+        return 'ViralCC v{}'.format(__version__)
+
+    def out_name(base, suffix):
+        return '{}{}'.format(base, suffix)
+
+    def ifelse(arg, default):
+        if arg is None:
+            return default
+        else:
+            return arg
+
+    runtime_defaults = {
+        'min_len': 3000,
+        'min_mapq': 30,
+        'min_match': 30,
+        'min_k': 5,
+        'random_seed': 42
+    }
+
+    global_parser = argparse.ArgumentParser(add_help=False)
+    global_parser.add_argument('-V', '--version', default=False, action='store_true', help='Show the application version')
+    global_parser.add_argument('-v', '--verbose', default=False, action='store_true', help='Verbose output')
+    global_parser.add_argument('--cover', default=False, action='store_true', help='Cover existing files')
+    global_parser.add_argument('--log', help='Log file path [OUTDIR/viralcc.log]')
+
+    parser = argparse.ArgumentParser(description='ViralCC: a metagenomic proximity-based tool to retrieve complete viral genomes')
+
+    subparsers = parser.add_subparsers(title='commands', dest='command', description='Valid commands',
+                                       help='choose an analysis stage for further options')
+
+    cmd_pl = subparsers.add_parser('pipeline', parents=[global_parser],
+                                   description='Normalize contacts and do the binning.')
+
+    cmd_test = subparsers.add_parser('test', parents=[global_parser],
+                                     description='pipeline testing.')
+
+    '''
+    pipeline subparser input
+    '''
+    cmd_pl.add_argument('--min-len', type=int,
+                        help='Minimum acceptable reference length [3000]')
+    cmd_pl.add_argument('--min-mapq', type=int,
+                        help='Minimum acceptable mapping quality [30]')
+    cmd_pl.add_argument('--min-match', type=int,
+                        help='Accepted alignments must begin with at least N matched bases [30]')
+    cmd_pl.add_argument('--min-k', type=int,
+                        help='Lower bound of k for determining the host proximity graph [5]')
+    cmd_pl.add_argument('--random-seed', type=int,
+                        help='Random seed to run the Leiden algorithm [42]')
+    cmd_pl.add_argument('FASTA', help='Reference fasta sequence')
+    cmd_pl.add_argument('BAM', help='Input bam file in query order')
+    cmd_pl.add_argument('VIRAL', help='Viral contig labels')
+    cmd_pl.add_argument('OUTDIR', help='Output directory')
+
+    '''
+    testing of the ViralCC software
+    '''
+    cmd_test.add_argument('OUTDIR', help='Output directory of testing results')
+
+    args = parser.parse_args()
+
+    if args.version:
+        print(mk_version())
+        sys.exit(0)
+
+    try:
+        make_dir(args.OUTDIR, args.cover)
+    except IOError:
+        print('Error: cannot create the output directory or the directory already exists')
+        sys.exit(1)
+
+    logging.captureWarnings(True)
+    logger = logging.getLogger('main')
+
+    # root log listens to everything
+    root = logging.getLogger('')
+    root.setLevel(logging.DEBUG)
+
+    # log message format
+    formatter = logging.Formatter(fmt='%(levelname)-8s | %(asctime)s | %(name)7s | %(message)s')
+
+    # runtime console listens to INFO by default
+    ch = logging.StreamHandler()
+    if args.verbose:
+        ch.setLevel(logging.DEBUG)
+    else:
+        ch.setLevel(logging.INFO)
+    ch.setFormatter(formatter)
+    root.addHandler(ch)
+
+    # file log listens to all levels from root
+    if args.log is not None:
+        log_path = args.log
+    else:
+        log_path = os.path.join(args.OUTDIR, 'viralcc.log')
+    fh = logging.FileHandler(log_path, mode='a')
+    fh.setLevel(logging.DEBUG)
+    fh.setFormatter(formatter)
+    root.addHandler(fh)
+
+    # add some environmental details
+    logger.debug(mk_version())
+    logger.debug(sys.version.replace('\n', ' '))
+    logger.debug('Command line: {}'.format(' '.join(sys.argv)))
+
+    try:
+        if args.command == 'pipeline':
+            start_time = time.time()
+            cm = ContactMatrix(args.BAM,
+                               args.FASTA,
+                               args.VIRAL,
+                               args.OUTDIR,
+                               min_mapq=ifelse(args.min_mapq, runtime_defaults['min_mapq']),
+                               min_len=ifelse(args.min_len, runtime_defaults['min_len']),
+                               min_match=ifelse(args.min_match, runtime_defaults['min_match']),
+                               min_k=ifelse(args.min_k, runtime_defaults['min_k']))
+
+            cl = ClusterBin(args.OUTDIR,
+                            cm.viral_info,
+                            cm.map_combine,
+                            random_seed=ifelse(args.random_seed, runtime_defaults['random_seed']))
+
+            logger.info('Clustering finished')
+            logger.info('Writing bins...')
+            gen_bins(args.FASTA, os.path.join(args.OUTDIR, 'cluster_viral_contig.txt'), os.path.join(args.OUTDIR, 'VIRAL_BIN'))
+
+            end_time = time.time()
+            logger.info('ViralCC consumed {} seconds in total'.format(str(end_time - start_time)))
+
+        if args.command == 'test':
+            logger.info('Begin to test the ViralCC software...')
+
+    except ApplicationException as ex:
+        logger.error(str(ex))
+        sys.exit(1)
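Given the positional arguments defined above, a typical pipeline run looks as follows (file names are placeholders; the BAM must be name-sorted, see construct_graph.py):

    python viralcc.py pipeline -v final.contigs.fa hic_namesorted.bam viral_contigs.txt out_viralcc

All thresholds fall back to runtime_defaults when the corresponding option is omitted, so the command above runs with min_len 3000, min_mapq 30, min_match 30, min_k 5 and random seed 42.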
diff --git a/viralcc_linux_env.yaml b/viralcc_linux_env.yaml
new file mode 100644
index 0000000..f781fb2
--- /dev/null
+++ b/viralcc_linux_env.yaml
@@ -0,0 +1,136 @@
+name: HiCBin_env
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+  - r
+dependencies:
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - _r-mutex=1.0.1
+  - binutils_impl_linux-64=2.35.1
+  - binutils_linux-64=2.35
+  - biopython=1.78
+  - bwidget=1.9.14
+  - bzip2=1.0.8
+  - c-ares=1.17.1
+  - ca-certificates=2021.4.13
+  - cairo=1.16.0
+  - certifi=2020.12.5
+  - cffi=1.14.5
+  - clang=11.0.1
+  - clang-11=11.0.1
+  - clangxx=11.0.1
+  - compiler-rt=11.0.1
+  - compiler-rt_linux-64=11.0.1
+  - curl=7.71.1
+  - fontconfig=2.13.1
+  - freetype=2.10.4
+  - fribidi=1.0.10
+  - gcc_impl_linux-64=9.3.0
+  - gcc_linux-64=9.3.0
+  - gettext=0.19.8.1
+  - gfortran_impl_linux-64=9.3.0
+  - gfortran_linux-64=9.3.0
+  - gmp=6.2.1
+  - graphite2=1.3.14
+  - gsl=2.6
+  - gxx_impl_linux-64=9.3.0
+  - gxx_linux-64=9.3.0
+  - harfbuzz=2.7.4
+  - icu=68.1
+  - isl=0.22.1
+  - jinja2=2.11.3
+  - joblib=1.0.1
+  - jpeg=9d
+  - kernel-headers_linux-64=2.6.32
+  - krb5=1.17.2
+  - ld_impl_linux-64=2.35.1
+  - ldid=2.1.2
+  - leidenalg=0.8.3
+  - libblas=3.9.0
+  - libcblas=3.9.0
+  - libclang-cpp11=11.0.1
+  - libcurl=7.71.1
+  - libcxx=11.0.1
+  - libcxxabi=11.0.1
+  - libdeflate=1.0
+  - libedit=3.1.20191231
+  - libev=4.33
+  - libffi=3.3
+  - libgcc-devel_linux-64=9.3.0
+  - libgcc-ng=9.3.0
+  - libgfortran=3.0.0
+  - libgfortran-ng=9.3.0
+  - libgfortran5=9.3.0
+  - libglib=2.66.7
+  - libgomp=9.3.0
+  - libiconv=1.16
+  - liblapack=3.9.0
+  - libllvm11=11.0.1
+  - libnghttp2=1.43.0
+  - libopenblas=0.3.12
+  - libpng=1.6.37
+  - libssh2=1.9.0
+  - libstdcxx-devel_linux-64=9.3.0
+  - libstdcxx-ng=9.3.0
+  - libtiff=4.2.0
+  - libuuid=2.32.1
+  - libwebp-base=1.2.0
+  - libxcb=1.14
+  - libxml2=2.9.10
+  - llvm-openmp=11.0.1
+  - llvm-tools=11.0.1
+  - lz4-c=1.9.3
+  - make=4.3
+  - markupsafe=1.1.1
+  - mpc=1.1.0
+  - mpfr=4.0.2
+  - ncurses=6.2
+  - numpy=1.20.1
+  - openssl=1.1.1k
+  - pandas=1.2.2
+  - pango=1.42.4
+  - pcre=8.44
+  - pcre2=10.36
+  - pip=21.0.1
+  - pixman=0.40.0
+  - psutil=5.8.0
+  - pycairo=1.20.0
+  - pycparser=2.20
+  - pysam=0.15.3
+  - python=3.7.10
+  - python-dateutil=2.8.1
+  - python-igraph=0.8.3
+  - python_abi=3.7
+  - pytz=2021.1
+  - readline=8.1
+  - scikit-learn=0.24.1
+  - scipy=1.6.0
+  - sed=4.8
+  - setuptools=52.0.0
+  - simplegeneric=0.8.1
+  - six=1.15.0
+  - sqlite=3.34.0
+  - sysroot_linux-64=2.12
+  - tapi=1100.0.11
+  - texttable=1.6.3
+  - threadpoolctl=2.1.0
+  - tk=8.6.10
+  - tktable=2.10
+  - tqdm=4.58.0
+  - tzlocal=2.1
+  - wheel=0.36.2
+  - xorg-kbproto=1.0.7
+  - xorg-libice=1.0.10
+  - xorg-libsm=1.2.3
+  - xorg-libx11=1.7.0
+  - xorg-libxext=1.3.4
+  - xorg-libxrender=0.9.10
+  - xorg-libxt=1.2.1
+  - xorg-renderproto=0.11.1
+  - xorg-xextproto=7.3.0
+  - xorg-xproto=7.0.31
+  - xz=5.2.5
+  - zlib=1.2.11
+  - zstd=1.4.8
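Either environment file can be materialised with conda; the environment name comes from the name: field of the file (kept as HiCBin_env here):

    conda env create -f viralcc_linux_env.yaml
    conda activate HiCBin_env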
diff --git a/viralcc_osx_env.yaml b/viralcc_osx_env.yaml
new file mode 100644
index 0000000..0cf8395
--- /dev/null
+++ b/viralcc_osx_env.yaml
@@ -0,0 +1,107 @@
+name: HiCBin_env
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+  - r
+dependencies:
+  - _r-mutex=1.0.1
+  - biopython=1.78
+  - bwidget=1.9.14
+  - bzip2=1.0.8
+  - c-ares=1.17.1
+  - ca-certificates=2021.1.19
+  - cairo=1.16.0
+  - cctools_osx-64=949.0.1
+  - certifi=2020.12.5
+  - cffi=1.14.5
+  - clang=11.0.1
+  - clang-11=11.0.1
+  - clang_osx-64=11.0.1
+  - clangxx=11.0.1
+  - clangxx_osx-64=11.0.1
+  - compiler-rt=11.0.1
+  - compiler-rt_osx-64=11.0.1
+  - curl=7.71.1
+  - fontconfig=2.13.1
+  - freetype=2.10.4
+  - fribidi=1.0.10
+  - gettext=0.19.8.1
+  - gfortran_impl_osx-64=9.3.0
+  - gfortran_osx-64=9.3.0
+  - gmp=6.2.1
+  - graphite2=1.3.14
+  - gsl=2.6
+  - harfbuzz=2.7.4
+  - icu=68.1
+  - isl=0.22.1
+  - jinja2=2.11.3
+  - joblib=1.0.1
+  - jpeg=9d
+  - krb5=1.17.2
+  - ld64_osx-64=530
+  - ldid=2.1.2
+  - leidenalg=0.8.3
+  - libblas=3.9.0
+  - libcblas=3.9.0
+  - libclang-cpp11=11.0.1
+  - libcurl=7.71.1
+  - libcxx=11.0.1
+  - libdeflate=1.0
+  - libedit=3.1.20191231
+  - libev=4.33
+  - libffi=3.3
+  - libglib=2.66.7
+  - libiconv=1.16
+  - liblapack=3.9.0
+  - libllvm11=11.0.1
+  - libnghttp2=1.43.0
+  - libopenblas=0.3.12
+  - libpng=1.6.37
+  - libssh2=1.9.0
+  - libtiff=4.2.0
+  - libwebp-base=1.2.0
+  - libxml2=2.9.10
+  - llvm-openmp=11.0.1
+  - llvm-tools=11.0.1
+  - lz4-c=1.9.3
+  - make=4.3
+  - markupsafe=1.1.1
+  - mpc=1.1.0
+  - mpfr=4.0.2
+  - ncurses=6.2
+  - numpy=1.20.1
+  - openssl=1.1.1j
+  - pandas=1.2.2
+  - pango=1.42.4
+  - pcre=8.44
+  - pcre2=10.36
+  - pip=21.0.1
+  - pixman=0.40.0
+  - psutil=5.8.0
+  - pycairo=1.20.0
+  - pycparser=2.20
+  - pysam=0.15.3
+  - python=3.7.10
+  - python-dateutil=2.8.1
+  - python-igraph=0.8.3
+  - python_abi=3.7
+  - pytz=2021.1
+  - readline=8.1
+  - scikit-learn=0.24.1
+  - scipy=1.6.0
+  - setuptools=52.0.0
+  - simplegeneric=0.8.1
+  - six=1.15.0
+  - sqlite=3.34.0
+  - tapi=1100.0.11
+  - texttable=1.6.3
+  - threadpoolctl=2.1.0
+  - tk=8.6.10
+  - tktable=2.10
+  - tqdm=4.58.0
+  - tzlocal=2.1
+  - wheel=0.36.2
+  - xz=5.2.5
+  - zlib=1.2.11
+  - zstd=1.4.8
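For orientation, a successful pipeline run leaves the following files in OUTDIR (names taken from the code above):

    OUTDIR/
      viralcc.log                   # run log
      viral_contig_info.csv         # name, length, GC of each viral contig
      prokaryotic_contig_info.csv   # name, length, GC of each prokaryotic contig
      cluster_viral_contig.txt      # contig -> group assignment
      VIRAL_BIN/VIRAL_BIN0000.fa    # one fasta file per viral bin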