From f68dd840f043d712c9282cd3c7b656b51aa0a043 Mon Sep 17 00:00:00 2001
From: Yuxuan Du <55218554+dyxstat@users.noreply.github.com>
Date: Sat, 5 Mar 2022 14:17:01 -0800
Subject: [PATCH] Add files via upload

---
 bin.py                 |  78 ++++++++
 construct_graph.py     | 398 +++++++++++++++++++++++++++++++++++++++++
 exceptions.py          |  52 ++++++
 utils.py               | 109 +++++++++++
 viralcc.py             | 163 +++++++++++++++++
 viralcc_linux_env.yaml | 136 ++++++++++++++
 viralcc_osx_env.yaml   | 107 +++++++++++
 7 files changed, 1043 insertions(+)
 create mode 100644 bin.py
 create mode 100644 construct_graph.py
 create mode 100644 exceptions.py
 create mode 100644 utils.py
 create mode 100644 viralcc.py
 create mode 100644 viralcc_linux_env.yaml
 create mode 100644 viralcc_osx_env.yaml

diff --git a/bin.py b/bin.py
new file mode 100644
index 0000000..70ab57d
--- /dev/null
+++ b/bin.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import numpy as np
+import scipy.sparse as scisp
+import logging
+import igraph as ig
+import leidenalg
+from sklearn.metrics import silhouette_score
+import os
+
+
+# package logger
+logger = logging.getLogger(__name__)
+
+
+class ClusterBin:
+    def __init__(self, path, viral_info, map_combine, random_seed=42):
+        '''
+        path: output directory for the clustering result
+        viral_info: list of SeqInfo records for the viral contigs
+        map_combine: integrative graph combining Hi-C interactions and host proximity
+        random_seed: seed for the Leiden algorithm
+        '''
+
+        # names of the viral contigs, in contact-map order
+        self.path = path
+        self.viral_name = []
+        for i in range(len(viral_info)):
+            temp = viral_info[i]
+            self.viral_name.append(temp.name)
+
+        ######### use the Leiden algorithm to do the clustering #########
+        self.map_combine = scisp.coo_matrix(map_combine)
+
+        vcount = self.map_combine.shape[0]
+        sources = self.map_combine.row
+        targets = self.map_combine.col
+        index = sources > targets
+        sources = sources[index]
+        targets = targets[index]
+        edgelist = list(zip(sources, targets))
+        g = ig.Graph(vcount, edgelist)
+
+        # sweep the resolution parameter; keep the partition with the best silhouette score
+        SIL_score = []
+        cluster_range = np.arange(2, 50)
+        for n_clusters in cluster_range:
+            part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=n_clusters, n_iterations=-1, seed=random_seed)
+            part = list(part)
+
+            label_pred = np.ones(self.map_combine.shape[0])
+            for ci in range(len(part)):
+                for contig in part[ci]:
+                    label_pred[contig] = ci
+
+            SIL_score.append(silhouette_score(np.asarray(self.map_combine.todense()), np.array(label_pred)))
+
+        optimal = SIL_score.index(max(SIL_score))
+        part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=cluster_range[optimal], n_iterations=-1, seed=random_seed)
+        part = list(part)
+        logger.info('The number of generated viral bins is {}'.format(len(part)))
+
+        self.dist_cluster = {}
+        for ci in range(len(part)):
+            for id in part[ci]:
+                self.dist_cluster[self.viral_name[id]] = 'group' + str(ci)
+
+        self._write_cluster()
+
+    def _write_cluster(self):
+        # write the contig-to-bin assignment as a tab-separated file
+        with open(os.path.join(self.path, 'cluster_viral_contig.txt'), 'w') as out:
+            for key, value in self.dist_cluster.items():
+                out.write(str(key) + '\t' + str(value))
+                out.write('\n')
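Note on the resolution sweep in bin.py: the silhouette score is only defined for partitions with between 2 and n-1 clusters, so very high resolution values can make the sweep fail on small inputs. The same selection rule can be written as a standalone helper that skips degenerate partitions; a minimal sketch (best_partition is an illustrative helper, not part of this patch):

    import numpy as np
    import igraph as ig
    import leidenalg
    from sklearn.metrics import silhouette_score

    def best_partition(adj, resolutions=range(2, 50), seed=42):
        # build an undirected graph from the upper triangle of the adjacency matrix
        adj = np.asarray(adj)
        src, dst = np.nonzero(np.triu(adj, k=1))
        g = ig.Graph(adj.shape[0], list(zip(src.tolist(), dst.tolist())))
        best, best_score = None, float('-inf')
        for r in resolutions:
            part = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition,
                                            resolution_parameter=r, n_iterations=-1, seed=seed)
            labels = np.asarray(part.membership)
            # silhouette is undefined for <2 or >n-1 clusters; skip degenerate partitions
            if not 2 <= len(set(labels)) <= adj.shape[0] - 1:
                continue
            score = silhouette_score(adj, labels)
            if score > best_score:
                best, best_score = list(part), score
        return best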
diff --git a/construct_graph.py b/construct_graph.py
new file mode 100644
index 0000000..36c13ba
--- /dev/null
+++ b/construct_graph.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from collections import OrderedDict, namedtuple
+import Bio.SeqIO as SeqIO
+from Bio.SeqUtils import GC
+import pandas as pd
+import numpy as np
+import pysam
+import scipy.sparse as scisp
+import tqdm
+import os
+from utils import count_fasta_sequences, open_input
+import logging
+
+# package logger
+logger = logging.getLogger(__name__)
+
+# offset is the cumulative length of the preceding accepted contigs
+# localid is the index within the filtered contig list
+# refid is the index within the fasta file, i.e. a global index
+SeqInfo = namedtuple('SeqInfo', ['localid', 'refid', 'name', 'length', 'GC'])
+
+
+class Sparse2DAccumulator(object):
+    # accumulate integer counts in a dict keyed by (i, j); export as a 2D COO sparse matrix
+    def __init__(self, N):
+        self.shape = (N, N)
+        self.mat = {}
+        # fixed counting type
+        self.dtype = np.uint32
+
+    def setitem(self, index, value):
+        assert len(index) == 2 and index[0] >= 0 and index[1] >= 0, 'invalid index: {}'.format(index)
+        assert isinstance(value, (int, np.integer)), 'values must be integers'
+        self.mat[index] = value
+
+    def getitem(self, index):
+        if index in self.mat:
+            return self.mat[index]
+        else:
+            return 0
+
+    def get_coo(self, symm=True):
+        """
+        Create a COO format sparse representation of the accumulated values.
+
+        :param symm: ensure matrix is symmetric on return
+        :return: a scipy.sparse.coo_matrix
+        """
+        coords = [[], []]
+        data = []
+        m = self.mat
+        for i, j in m.keys():  # each key is an (i, j) tuple
+            coords[0].append(i)
+            coords[1].append(j)
+            data.append(m[i, j])
+
+        m = scisp.coo_matrix((data, coords), shape=self.shape, dtype=self.dtype)
+
+        if symm:
+            m += scisp.tril(m.T, k=-1)
+
+        return m.tocoo()
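Sparse2DAccumulator keeps only upper-triangle entries and mirrors them on export, which is how _bin_map below stores contact counts. A minimal usage sketch (standalone, using only the class defined in this file):

    from construct_graph import Sparse2DAccumulator

    acc = Sparse2DAccumulator(4)
    for pair in [(0, 2), (0, 2), (1, 3)]:
        ix = tuple(sorted(pair))              # keep a half-matrix: i <= j
        acc.setitem(ix, acc.getitem(ix) + 1)

    m = acc.get_coo(symm=True)                # lower triangle mirrored on export
    print(m.toarray())
    # [[0 0 2 0]
    #  [0 0 0 1]
    #  [2 0 0 0]
    #  [0 1 0 0]]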
+class ContactMatrix:
+
+    def __init__(self, bam_file, seq_file, viral_file, path, min_mapq=30, min_len=1000, min_match=30, min_k=5):
+
+        ######## input parameters ########
+        # bam_file: alignment of the Hi-C library against the contigs, in BAM format
+        # seq_file: assembled contigs in FASTA format
+        # viral_file: labels of the viral contigs
+        # path: output directory
+        # min_mapq: minimal mapping quality (default 30)
+        # min_len: minimal contig length (default 1000 bp)
+        # min_match: minimal number of matched bases at the alignment start (default 30)
+        # min_k: lower bound of k for the host proximity graph (default 5)
+
+        self.bam_file = bam_file
+        self.seq_file = seq_file
+        self.viral_file = viral_file
+        self.path = path
+        self.min_mapq = min_mapq
+        self.min_len = min_len
+        self.min_match = min_match
+
+        # fasta_info stores the information parsed from the fasta file
+        # seq_info stores the contig information from the bam file
+        # seq_map stores the raw Hi-C contact map
+        self.fasta_info = {}
+        self.seq_info = []
+        self.seq_map = None
+        self.map_viral = None
+        self.map_row = None
+        self.viral_same_host = None
+        self.map_combine = None
+        self.viral_info = []
+        self.prokaryotic_info = []
+
+        # total_reads: number of alignments in the bam file
+        self.total_reads = None
+
+        logger.info('Reading fasta file...')
+        with open_input(seq_file) as multi_fasta:
+            # fasta_info is an intermediate for seq_info, which holds the final records
+            # get an estimate of the number of sequences for the progress bar
+            fasta_count = count_fasta_sequences(seq_file)
+            for seqrec in tqdm.tqdm(SeqIO.parse(multi_fasta, 'fasta'), total=fasta_count, desc='Analyzing both prokaryotic and viral contigs in reference fasta'):
+                if len(seqrec) < min_len:
+                    continue
+                self.fasta_info[seqrec.id] = {'length': len(seqrec), 'GC': GC(seqrec.seq)}
+
+        logger.debug('There are {} contigs in total in the reference fasta'.format(fasta_count))
+
+        # now parse the header information from the bam file;
+        # seq_info holds the global contig information and is not changed after creation
+        with pysam.AlignmentFile(bam_file, 'rb') as bam:
+            # check that the BAM file has the correct sort order
+            if 'SO' not in bam.header['HD'] or bam.header['HD']['SO'] != 'queryname':
+                raise IOError('BAM file must be sorted by read name')
+
+            # determine the set of active sequences,
+            # where the first filtering step is by length
+            logger.info('Filtering contigs by minimal length ({})...'.format(self.min_len))
+            ref_count = {'seq_missing': 0, 'too_short': 0}
+
+            offset = 0
+            localid = 0
+
+            for n, (rname, rlen) in enumerate(zip(bam.references, bam.lengths)):
+                # minimum length threshold
+                if rlen < min_len:
+                    ref_count['too_short'] += 1
+                    continue
+
+                try:
+                    fa = self.fasta_info[rname]
+                except KeyError:
+                    logger.info('Contig "{}" was not present in reference fasta'.format(rname))
+                    ref_count['seq_missing'] += 1
+                    continue
+
+                assert fa['length'] == rlen, 'Sequence lengths in {} do not agree: bam {} fasta {}'.format(rname, fa['length'], rlen)
+
+                self.seq_info.append(SeqInfo(localid, n, rname, rlen, fa['GC']))
+                localid = localid + 1
+                offset += rlen
+
+            self.total_len = offset
+            self.total_seq = localid
+
+            del self.fasta_info, offset, localid
+
+            if self.total_seq == 0:
+                logger.info('No sequences in BAM found in FASTA')
+                raise ImportError('No sequences in BAM found in FASTA')
+
+            logger.debug('{} contigs are missing and {} contigs are too short'.format(ref_count['seq_missing'], ref_count['too_short']))
+            logger.debug('Accepted {} contigs covering {} bp'.format(self.total_seq, self.total_len))
+
+            logger.info('Counting reads in bam file...')
+            self.total_reads = bam.count(until_eof=True)
+            logger.debug('BAM file contains {0} alignments'.format(self.total_reads))
+
+            logger.info('Handling the alignments...')
+            self._bin_map(bam)
+
+        assert self.seq_map.shape[0] == len(self.seq_info), 'Filter error'
+
+        ######## filter the viral contigs and their contacts ########
+        viral_contig_name = pd.read_csv(viral_file, sep='\t', header=None).values[:, 0]
+
+        viral_contig_index = []
+        prokaryotic_contig_index = []
+        # distinguish the indices of viral contigs and prokaryotic contigs
+        for i in range(len(self.seq_info)):
+            contig_temp = self.seq_info[i]
+            if contig_temp.name in viral_contig_name:
+                viral_contig_index.append(i)
+            else:
+                prokaryotic_contig_index.append(i)
+
+        del contig_temp
+
+        for i, idn in enumerate(viral_contig_index):
+            viral_seq = self.seq_info[idn]
+            assert viral_seq.localid == idn, 'the local index does not match the contact matrix index'
+            self.viral_info.append(SeqInfo(i, viral_seq.refid, viral_seq.name, viral_seq.length, viral_seq.GC))
+
+        for i, idn in enumerate(prokaryotic_contig_index):
+            prokaryotic_seq = self.seq_info[idn]
+            assert prokaryotic_seq.localid == idn, 'the local index does not match the contact matrix index'
+            self.prokaryotic_info.append(SeqInfo(i, prokaryotic_seq.refid, prokaryotic_seq.name, prokaryotic_seq.length, prokaryotic_seq.GC))
+
+        del self.seq_info
+
+        logger.info('There are {} viral contigs'.format(len(viral_contig_index)))
+        logger.info('There are {} prokaryotic contigs'.format(len(prokaryotic_contig_index)))
+        logger.info('Writing information of viral contigs and prokaryotic contigs')
+        self._write_viral_info()
+        self._write_prokaryotic_info()
+
+        # viral-viral submatrix of the Hi-C contact map
+        self.map_viral = self.seq_map.tocsr()
+        self.map_viral = self.map_viral[viral_contig_index, :]
+        self.map_viral = self.map_viral.tocsc()
+        self.map_viral = self.map_viral[:, viral_contig_index]
+        self.map_viral = self.map_viral.todense()
+
+        # viral-prokaryotic submatrix of the Hi-C contact map
+        self.map_row = self.seq_map.tocsr()
+        self.map_row = self.map_row[viral_contig_index, :]
+        self.map_row = self.map_row.tocsc()
+        self.map_row = self.map_row[:, prokaryotic_contig_index]
+        self.map_row = self.map_row.tocsr()
+
+        del self.seq_map
+
+        numV = len(self.viral_info)
+        numE = np.count_nonzero(self.map_viral) / 2
+
+        # increase k until the host proximity graph has no more edges than the Hi-C interaction graph
+        for k in range(min_k, 50):
+            count = 0
+            self.viral_same_host = np.zeros((numV, numV))
+            for i in range(numV):
+                host_i = self.map_row[i, :].tocoo().col
+                if host_i.shape[0] < 1:
+                    continue
+                for j in range(i + 1, numV):
+                    host_j = self.map_row[j, :].tocoo().col
+                    if host_j.shape[0] < 1:
+                        continue
+                    if np.intersect1d(host_i, host_j).shape[0] >= k:
+                        count += 1
+                        self.viral_same_host[i, j] = 1
+                        self.viral_same_host[j, i] = 1
+            if count <= numE:
+                break
+
+        logger.info('The threshold of shared host contigs is {}'.format(k))
+        logger.info('There are {} edges in the host proximity graph'.format(count))
+        logger.info('There are {} edges in the Hi-C interaction graph'.format(numE))
+
+        self.map_combine = np.zeros((numV, numV))
+        map_temp = scisp.coo_matrix(self.map_viral + self.viral_same_host)
+        sources = map_temp.row
+        targets = map_temp.col
+        for i, j in zip(sources, targets):
+            self.map_combine[i, j] = 1
+
+        logger.info('Integrating the Hi-C interaction graph and the host proximity graph')
+        logger.info('Integrative graph construction finished: there are {} edges in the integrative graph'.format(np.count_nonzero(self.map_combine) / 2))
+        del self.map_viral, self.map_row, self.viral_same_host, map_temp
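The loop above selects the smallest shared-host threshold k (starting from min_k) at which the host proximity graph is no denser than the Hi-C interaction graph. The same rule over explicit host sets, as a standalone sketch (host_proximity_edges and pick_threshold are illustrative helpers; hosts is assumed to be a list of sets of prokaryotic contig indices, one set per viral contig):

    from itertools import combinations

    def host_proximity_edges(hosts, k):
        # edges between viruses sharing at least k host contigs
        return [(i, j) for i, j in combinations(range(len(hosts)), 2)
                if len(hosts[i] & hosts[j]) >= k]

    def pick_threshold(hosts, num_hic_edges, min_k=5, max_k=50):
        # smallest k whose proximity graph is no denser than the Hi-C graph
        for k in range(min_k, max_k):
            edges = host_proximity_edges(hosts, k)
            if len(edges) <= num_hic_edges:
                return k, edges
        return max_k - 1, host_proximity_edges(hosts, max_k - 1)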
+    def _bin_map(self, bam):
+        """
+        Accumulate read-pair observations from the supplied BAM file.
+        Maps are initialized here. Logical control is achieved through initialisation of the
+        ContactMatrix instance, rather than through arguments to this function.
+
+        :param bam: this instance's open bam file.
+        """
+        def _simple_match(r):
+            return r.mapping_quality >= _mapq
+
+        def _strong_match(r):
+            if r.mapping_quality < _mapq or r.cigarstring is None:
+                return False
+            cig = r.cigartuples[-1] if r.is_reverse else r.cigartuples[0]
+            return cig[0] == 0 and cig[1] >= self.min_match
+
+        # set up the match call
+        _matcher = _strong_match if self.min_match else _simple_match
+
+        def next_informative(_bam_iter, _pbar):
+            while True:
+                r = next(_bam_iter)
+                _pbar.update()
+                if not r.is_unmapped and not r.is_secondary and not r.is_supplementary:
+                    return r
+
+        _seq_map = Sparse2DAccumulator(self.total_seq)
+
+        with tqdm.tqdm(total=self.total_reads) as pbar:
+
+            # locals for read filtering
+            _mapq = self.min_mapq
+
+            _idx = self.make_reverse_index('refid')  # from global index to local index
+
+            counts = OrderedDict({
+                'accepted pairs': 0,
+                'map_same_contig pairs': 0,
+                'ref_excluded pairs': 0,
+                'poor_match pairs': 0,
+                'single read': 0})
+
+            bam.reset()
+            bam_iter = bam.fetch(until_eof=True)
+            while True:
+                try:
+                    r1 = next_informative(bam_iter, pbar)
+                    while True:
+                        # read records until we get a pair
+                        r2 = next_informative(bam_iter, pbar)
+                        if r1.query_name == r2.query_name:
+                            break
+                        r1 = r2  # no mate found for r1, advance to the next record
+                        counts['single read'] += 1
+
+                except StopIteration:
+                    break
+
+                if r1.reference_id not in _idx or r2.reference_id not in _idx:
+                    counts['ref_excluded pairs'] += 1
+                    continue
+
+                if r1.reference_id == r2.reference_id:
+                    counts['map_same_contig pairs'] += 1
+                    continue
+
+                if not _matcher(r1) or not _matcher(r2):
+                    counts['poor_match pairs'] += 1
+                    continue
+
+                # get internal indices
+                ix1 = _idx[r1.reference_id]
+                ix2 = _idx[r2.reference_id]
+
+                # maintain just a half-matrix
+                if ix2 < ix1:
+                    ix1, ix2 = ix2, ix1
+
+                counts['accepted pairs'] += 1
+                ix = (ix1, ix2)
+                _seq_map.setitem(ix, _seq_map.getitem(ix) + 1)
+
+        self.seq_map = _seq_map.get_coo()
+        del _seq_map, r1, r2, _idx
+
+        logger.debug('Pair accounting: {}'.format(counts))
+
+    def make_reverse_index(self, field_name):
+        """
+        Make a reverse look-up (dict) from the chosen field in seq_info to the internal index value
+        of the given sequence. Non-unique fields will raise an exception.
+
+        :param field_name: the seq_info field to use as the reverse.
+        :return: internal array index of the sequence
+        """
+        rev_idx = {}
+        for n, seq in enumerate(self.seq_info):
+            fv = getattr(seq, field_name)
+            if fv in rev_idx:
+                raise RuntimeError('field contains non-unique entries, a 1-1 mapping cannot be made')
+            rev_idx[fv] = n
+        return rev_idx
+
+    def _write_prokaryotic_info(self):
+        with open(os.path.join(self.path, 'prokaryotic_contig_info.csv'), 'w') as out:
+            for seq in self.prokaryotic_info:
+                out.write(str(seq.name) + ',' + str(seq.length) + ',' + str(seq.GC))
+                out.write('\n')
+
+    def _write_viral_info(self):
+        with open(os.path.join(self.path, 'viral_contig_info.csv'), 'w') as out:
+            for seq in self.viral_info:
+                out.write(str(seq.name) + ',' + str(seq.length) + ',' + str(seq.GC))
+                out.write('\n')
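_bin_map relies purely on query-name adjacency: two consecutive informative records with the same name form a pair, which is why the constructor insists on a name-sorted BAM (e.g. one produced by `samtools sort -n`). The pairing contract in isolation, as a sketch (the BAM path is a placeholder):

    import pysam

    def iter_pairs(bam_path):
        # yield (r1, r2) mate pairs from a name-sorted BAM, skipping unpaired records
        with pysam.AlignmentFile(bam_path, 'rb') as bam:
            informative = (r for r in bam.fetch(until_eof=True)
                           if not r.is_unmapped and not r.is_secondary and not r.is_supplementary)
            r1 = next(informative, None)
            for r2 in informative:
                if r1 is not None and r1.query_name == r2.query_name:
                    yield r1, r2
                    r1 = next(informative, None)
                else:
                    r1 = r2

    for r1, r2 in iter_pairs('hic_namesorted.bam'):
        pass  # accumulate contacts here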
+ """ + import tqdm + + def _simple_match(r): + return r.mapping_quality >= _mapq + + def _strong_match(r): + if r.mapping_quality < _mapq or r.cigarstring is None: + return False + cig = r.cigartuples[-1] if r.is_reverse else r.cigartuples[0] + return cig[0] == 0 and cig[1] >= self.min_match + + # set-up match call + _matcher = _strong_match if self.min_match else _simple_match + + def next_informative(_bam_iter, _pbar): + while True: + r = next(_bam_iter) + _pbar.update() + if not r.is_unmapped and not r.is_secondary and not r.is_supplementary: + return r + + _seq_map = Sparse2DAccumulator(self.total_seq) + + + with tqdm.tqdm(total=self.total_reads) as pbar: + + # locals for read filtering + _mapq = self.min_mapq + + _idx = self.make_reverse_index('refid') #from global index to local index# + _len = bam.lengths + + counts = OrderedDict({ + 'accepted pairs': 0, + 'map_same_contig pairs': 0, + 'ref_excluded pairs': 0, + 'poor_match pairs': 0, + 'single read':0}) + + bam.reset() + bam_iter = bam.fetch(until_eof=True) + self.index1 = 0 + while True: + self.index1 += 1 + try: + r1 = next_informative(bam_iter, pbar) + while True: + # read records until we get a pair + r2 = next_informative(bam_iter, pbar) + if r1.query_name == r2.query_name: + break + r1 = r2 ###if we don't get a pair, next _bam_iter + counts['single read'] += 1 + + except StopIteration: + break + + if r1.reference_id not in _idx or r2.reference_id not in _idx: + counts['ref_excluded pairs'] += 1 + continue + + if r1.reference_id == r2.reference_id: + counts['map_same_contig pairs'] += 1 + continue + + if not _matcher(r1) or not _matcher(r2): + counts['poor_match pairs'] += 1 + continue + + # get internal indices + ix1 = _idx[r1.reference_id] + ix2 = _idx[r2.reference_id] + + # maintain just a half-matrix + if ix2 < ix1: + ix1, ix2 = ix2, ix1 + + counts['accepted pairs'] += 1 + ix = (ix1 , ix2) + if _seq_map.getitem(ix): + temp_value = _seq_map.getitem(ix) + 1 + _seq_map.setitem(ix , temp_value) + else: + _seq_map.setitem(ix , 1) + + self.seq_map = _seq_map.get_coo() + del _seq_map, r1, r2, _idx + + logger.debug('Pair accounting: {}'.format(counts)) + + + def make_reverse_index(self, field_name): + """ + Make a reverse look-up (dict) from the chosen field in seq_info to the internal index value + of the given sequence. Non-unique fields will raise an exception. + + :param field_name: the seq_info field to use as the reverse. 
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..0a4aa0e
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,109 @@
+import gzip
+import os
+import subprocess
+
+
+def make_dir(path, exist_ok=False):
+    """
+    Convenience method for making directories with a standard logic.
+    An exception is raised when the specified path exists and is not a directory.
+    :param path: target path to create
+    :param exist_ok: if True, an existing directory is ok. Existing files will still cause an exception
+    """
+    if not os.path.exists(path):
+        os.mkdir(path)
+    elif os.path.isfile(path):
+        raise IOError('output path already exists and is a file!')
+    elif not exist_ok:
+        raise IOError('output directory already exists!')
+
+
+def open_input(file_name):
+    """
+    Open a text file for reading, transparently decompressing gzip input based on the
+    file suffix. (construct_graph.py imports this helper, so it must exist here.)
+    :param file_name: the file to open
+    :return: an open text-mode file handle
+    """
+    if file_name.endswith('.gz'):
+        return gzip.open(file_name, 'rt')
+    return open(file_name, 'r')
+
+
+def count_fasta_sequences(file_name):
+    """
+    Estimate the number of fasta sequences in a file by counting headers. Decompression is automatically attempted
+    for files ending in .gz. Counting and decompression is by way of subprocess calls to grep and gzip. Uncompressed
+    files are also handled. This is about 8 times faster than parsing a file with BioPython and 6 times faster
+    than reading all lines in Python.
+
+    :param file_name: the fasta file to inspect
+    :return: the estimated number of records
+    """
+    if file_name.endswith('.gz'):
+        proc_uncomp = subprocess.Popen(['gzip', '-cd', file_name], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        proc_read = subprocess.Popen(['grep', r'^>'], stdin=proc_uncomp.stdout, stdout=subprocess.PIPE)
+    else:
+        proc_read = subprocess.Popen(['grep', r'^>', file_name], stdout=subprocess.PIPE)
+
+    n = 0
+    for _ in proc_read.stdout:
+        n += 1
+    return n
+
+
+def gen_bins(fastafile, resultfile, outputdir):
+    # read the fasta file into a dict of header -> sequence
+    sequences = {}
+    if fastafile.endswith("gz"):
+        with gzip.open(fastafile, 'r') as f:
+            for line in f:
+                line = str(line, encoding="utf-8")
+                if line.startswith(">"):
+                    if " " in line:
+                        seq, others = line.split(' ', 1)
+                        sequences[seq] = ""
+                    else:
+                        seq = line.rstrip("\n")
+                        sequences[seq] = ""
+                else:
+                    sequences[seq] += line.rstrip("\n")
+    else:
+        with open(fastafile, 'r') as f:
+            for line in f:
+                if line.startswith(">"):
+                    if " " in line:
+                        seq, others = line.split(' ', 1)
+                        sequences[seq] = ""
+                    else:
+                        seq = line.rstrip("\n")
+                        sequences[seq] = ""
+                else:
+                    sequences[seq] += line.rstrip("\n")
+
+    # read the clustering result: cluster name -> list of contig names
+    dic = {}
+    with open(resultfile, "r") as f:
+        for line in f:
+            contig_name, cluster_name = line.strip().split('\t')
+            try:
+                dic[cluster_name].append(contig_name)
+            except KeyError:
+                dic[cluster_name] = [contig_name]
+
+    print("Writing bins in \t{}".format(outputdir))
+    if not os.path.exists(outputdir):
+        os.makedirs(outputdir)
+
+    # write one fasta file per bin, zero-padding the bin number to four digits
+    bin_name = 0
+    for _, cluster in dic.items():
+        bin = 'VIRAL_BIN' + str(bin_name).zfill(4) + '.fa'
+        binfile = os.path.join(outputdir, "{}".format(bin))
+        with open(binfile, "w") as f:
+            for contig_name in cluster:
+                contig_name = ">" + contig_name
+                try:
+                    sequence = sequences[contig_name]
+                except KeyError:
+                    continue
+                f.write(contig_name + "\n")
+                f.write(sequence + "\n")
+        bin_name += 1
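gen_bins matches contigs to bins using the first whitespace-delimited token of each fasta header, so the names in the cluster file must match that token exactly. A toy end-to-end run (file names and contents are illustrative):

    from utils import gen_bins

    with open('toy.fa', 'w') as f:
        f.write('>contig_1 len=1000\nACGT\n>contig_2\nGGCC\n')
    with open('toy_clusters.txt', 'w') as f:
        f.write('contig_1\tgroup0\ncontig_2\tgroup0\n')

    gen_bins('toy.fa', 'toy_clusters.txt', 'toy_bins')
    # writes toy_bins/VIRAL_BIN0000.fa containing both contigs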
diff --git a/viralcc.py b/viralcc.py
new file mode 100644
index 0000000..9dee0c6
--- /dev/null
+++ b/viralcc.py
@@ -0,0 +1,163 @@
+######## The structure of the main script is modified from bin3C ########
+from construct_graph import ContactMatrix
+from bin import ClusterBin
+from exceptions import ApplicationException
+from utils import make_dir, gen_bins
+import logging
+import sys
+import argparse
+import os
+import time
+import warnings
+
+## ignore deprecation warnings from packages ##
+warnings.filterwarnings("ignore")
+
+__version__ = '1.0.0, released at 03/2022'
+
+if __name__ == '__main__':
+
+    def mk_version():
+        return 'ViralCC v{}'.format(__version__)
+
+    def out_name(base, suffix):
+        return '{}{}'.format(base, suffix)
+
+    def ifelse(arg, default):
+        if arg is None:
+            return default
+        else:
+            return arg
+
+    runtime_defaults = {
+        'min_len': 3000,
+        'min_mapq': 30,
+        'min_match': 30,
+        'min_k': 5,
+        'random_seed': 42
+    }
+
+    global_parser = argparse.ArgumentParser(add_help=False)
+    global_parser.add_argument('-V', '--version', default=False, action='store_true', help='Show the application version')
+    global_parser.add_argument('-v', '--verbose', default=False, action='store_true', help='Verbose output')
+    global_parser.add_argument('--cover', default=False, action='store_true', help='Cover existing files')
+    global_parser.add_argument('--log', help='Log file path [OUTDIR/viralcc.log]')
+
+    parser = argparse.ArgumentParser(description='ViralCC: a metagenomic proximity-based tool to retrieve complete viral genomes')
+
+    subparsers = parser.add_subparsers(title='commands', dest='command', description='Valid commands',
+                                       help='choose an analysis stage for further options')
+
+    cmd_pl = subparsers.add_parser('pipeline', parents=[global_parser],
+                                   description='Normalize contacts and do the binning.')
+
+    cmd_test = subparsers.add_parser('test', parents=[global_parser],
+                                     description='pipeline testing.')
+
+    '''
+    pipeline subparser input
+    '''
+    cmd_pl.add_argument('--min-len', type=int,
+                        help='Minimum acceptable reference length [3000]')
+    cmd_pl.add_argument('--min-mapq', type=int,
+                        help='Minimum acceptable mapping quality [30]')
+    cmd_pl.add_argument('--min-match', type=int,
+                        help='Accepted alignments must begin with at least N matched bases [30]')
+    cmd_pl.add_argument('--min-k', type=int,
+                        help='Lower bound of k for determining the host proximity graph [5]')
+    cmd_pl.add_argument('--random-seed', type=int,
+                        help='Random seed to run the Leiden algorithm [42]')
+    cmd_pl.add_argument('FASTA', help='Reference fasta sequence')
+    cmd_pl.add_argument('BAM', help='Input bam file in query order')
+    cmd_pl.add_argument('VIRAL', help='Viral contig labels')
+    cmd_pl.add_argument('OUTDIR', help='Output directory')
+
+    '''
+    testing of the ViralCC software
+    '''
+    cmd_test.add_argument('OUTDIR', help='Output directory of testing results')
+
+    args = parser.parse_args()
+
+    if args.version:
+        print(mk_version())
+        sys.exit(0)
+
+    try:
+        make_dir(args.OUTDIR, args.cover)
+    except IOError:
+        print('Error: cannot create the output directory or the directory already exists')
+        sys.exit(1)
+
+    logging.captureWarnings(True)
+    logger = logging.getLogger('main')
+
+    # root log listens to everything
+    root = logging.getLogger('')
+    root.setLevel(logging.DEBUG)
+
+    # log message format
+    formatter = logging.Formatter(fmt='%(levelname)-8s | %(asctime)s | %(name)7s | %(message)s')
+
+    # runtime console listens to INFO by default
+    ch = logging.StreamHandler()
+    if args.verbose:
+        ch.setLevel(logging.DEBUG)
+    else:
+        ch.setLevel(logging.INFO)
+    ch.setFormatter(formatter)
+    root.addHandler(ch)
+
+    # file log listens to all levels from root
+    if args.log is not None:
+        log_path = args.log
+    else:
+        log_path = os.path.join(args.OUTDIR, 'viralcc.log')
+    fh = logging.FileHandler(log_path, mode='a')
+    fh.setLevel(logging.DEBUG)
+    fh.setFormatter(formatter)
+    root.addHandler(fh)
+
+    # add some environmental details
+    logger.debug(mk_version())
+    logger.debug(sys.version.replace('\n', ' '))
+    logger.debug('Command line: {}'.format(' '.join(sys.argv)))
+
+    try:
+        if args.command == 'pipeline':
+            start_time = time.time()
+            cm = ContactMatrix(args.BAM,
+                               args.FASTA,
+                               args.VIRAL,
+                               args.OUTDIR,
+                               min_mapq=ifelse(args.min_mapq, runtime_defaults['min_mapq']),
+                               min_len=ifelse(args.min_len, runtime_defaults['min_len']),
+                               min_match=ifelse(args.min_match, runtime_defaults['min_match']),
+                               min_k=ifelse(args.min_k, runtime_defaults['min_k']))
+
+            cl = ClusterBin(args.OUTDIR,
+                            cm.viral_info,
+                            cm.map_combine,
+                            random_seed=ifelse(args.random_seed, runtime_defaults['random_seed']))
+
+            logger.info('Clustering finished')
+            logger.info('Writing bins...')
+            gen_bins(args.FASTA, os.path.join(args.OUTDIR, 'cluster_viral_contig.txt'), os.path.join(args.OUTDIR, 'VIRAL_BIN'))
+
+            end_time = time.time()
+            logger.info('ViralCC consumed {} seconds in total'.format(str(end_time - start_time)))
+
+        if args.command == 'test':
+            logger.info('Begin to test the ViralCC software...')
+
+    except ApplicationException as ex:
+        logger.error(str(ex))
+        sys.exit(1)
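Given the positional arguments defined above, a typical pipeline run looks as follows (file names are placeholders; the BAM must be name-sorted, see construct_graph.py):

    python viralcc.py pipeline -v final.contigs.fa hic_namesorted.bam viral_contigs.txt out_viralcc

All thresholds fall back to runtime_defaults when the corresponding option is omitted, so the command above runs with min_len 3000, min_mapq 30, min_match 30, min_k 5 and random seed 42.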
diff --git a/viralcc_linux_env.yaml b/viralcc_linux_env.yaml
new file mode 100644
index 0000000..f781fb2
--- /dev/null
+++ b/viralcc_linux_env.yaml
@@ -0,0 +1,136 @@
+name: HiCBin_env
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+  - r
+dependencies:
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - _r-mutex=1.0.1
+  - binutils_impl_linux-64=2.35.1
+  - binutils_linux-64=2.35
+  - biopython=1.78
+  - bwidget=1.9.14
+  - bzip2=1.0.8
+  - c-ares=1.17.1
+  - ca-certificates=2021.4.13
+  - cairo=1.16.0
+  - certifi=2020.12.5
+  - cffi=1.14.5
+  - clang=11.0.1
+  - clang-11=11.0.1
+  - clangxx=11.0.1
+  - compiler-rt=11.0.1
+  - compiler-rt_linux-64=11.0.1
+  - curl=7.71.1
+  - fontconfig=2.13.1
+  - freetype=2.10.4
+  - fribidi=1.0.10
+  - gcc_impl_linux-64=9.3.0
+  - gcc_linux-64=9.3.0
+  - gettext=0.19.8.1
+  - gfortran_impl_linux-64=9.3.0
+  - gfortran_linux-64=9.3.0
+  - gmp=6.2.1
+  - graphite2=1.3.14
+  - gsl=2.6
+  - gxx_impl_linux-64=9.3.0
+  - gxx_linux-64=9.3.0
+  - harfbuzz=2.7.4
+  - icu=68.1
+  - isl=0.22.1
+  - jinja2=2.11.3
+  - joblib=1.0.1
+  - jpeg=9d
+  - kernel-headers_linux-64=2.6.32
+  - krb5=1.17.2
+  - ld_impl_linux-64=2.35.1
+  - ldid=2.1.2
+  - leidenalg=0.8.3
+  - libblas=3.9.0
+  - libcblas=3.9.0
+  - libclang-cpp11=11.0.1
+  - libcurl=7.71.1
+  - libcxx=11.0.1
+  - libcxxabi=11.0.1
+  - libdeflate=1.0
+  - libedit=3.1.20191231
+  - libev=4.33
+  - libffi=3.3
+  - libgcc-devel_linux-64=9.3.0
+  - libgcc-ng=9.3.0
+  - libgfortran=3.0.0
+  - libgfortran-ng=9.3.0
+  - libgfortran5=9.3.0
+  - libglib=2.66.7
+  - libgomp=9.3.0
+  - libiconv=1.16
+  - liblapack=3.9.0
+  - libllvm11=11.0.1
+  - libnghttp2=1.43.0
+  - libopenblas=0.3.12
+  - libpng=1.6.37
+  - libssh2=1.9.0
+  - libstdcxx-devel_linux-64=9.3.0
+  - libstdcxx-ng=9.3.0
+  - libtiff=4.2.0
+  - libuuid=2.32.1
+  - libwebp-base=1.2.0
+  - libxcb=1.14
+  - libxml2=2.9.10
+  - llvm-openmp=11.0.1
+  - llvm-tools=11.0.1
+  - lz4-c=1.9.3
+  - make=4.3
+  - markupsafe=1.1.1
+  - mpc=1.1.0
+  - mpfr=4.0.2
+  - ncurses=6.2
+  - numpy=1.20.1
+  - openssl=1.1.1k
+  - pandas=1.2.2
+  - pango=1.42.4
+  - pcre=8.44
+  - pcre2=10.36
+  - pip=21.0.1
+  - pixman=0.40.0
+  - psutil=5.8.0
+  - pycairo=1.20.0
+  - pycparser=2.20
+  - pysam=0.15.3
+  - python=3.7.10
+  - python-dateutil=2.8.1
+  - python-igraph=0.8.3
+  - python_abi=3.7
+  - pytz=2021.1
+  - readline=8.1
+  - scikit-learn=0.24.1
+  - scipy=1.6.0
+  - sed=4.8
+  - setuptools=52.0.0
+  - simplegeneric=0.8.1
+  - six=1.15.0
+  - sqlite=3.34.0
+  - sysroot_linux-64=2.12
+  - tapi=1100.0.11
+  - texttable=1.6.3
+  - threadpoolctl=2.1.0
+  - tk=8.6.10
+  - tktable=2.10
+  - tqdm=4.58.0
+  - tzlocal=2.1
+  - wheel=0.36.2
+  - xorg-kbproto=1.0.7
+  - xorg-libice=1.0.10
+  - xorg-libsm=1.2.3
+  - xorg-libx11=1.7.0
+  - xorg-libxext=1.3.4
+  - xorg-libxrender=0.9.10
+  - xorg-libxt=1.2.1
+  - xorg-renderproto=0.11.1
+  - xorg-xextproto=7.3.0
+  - xorg-xproto=7.0.31
+  - xz=5.2.5
+  - zlib=1.2.11
+  - zstd=1.4.8
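Either environment file can be materialised with conda; the environment name comes from the name: field of the file (kept as HiCBin_env here):

    conda env create -f viralcc_linux_env.yaml
    conda activate HiCBin_env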
diff --git a/viralcc_osx_env.yaml b/viralcc_osx_env.yaml
new file mode 100644
index 0000000..0cf8395
--- /dev/null
+++ b/viralcc_osx_env.yaml
@@ -0,0 +1,107 @@
+name: HiCBin_env
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+  - r
+dependencies:
+  - _r-mutex=1.0.1
+  - biopython=1.78
+  - bwidget=1.9.14
+  - bzip2=1.0.8
+  - c-ares=1.17.1
+  - ca-certificates=2021.1.19
+  - cairo=1.16.0
+  - cctools_osx-64=949.0.1
+  - certifi=2020.12.5
+  - cffi=1.14.5
+  - clang=11.0.1
+  - clang-11=11.0.1
+  - clang_osx-64=11.0.1
+  - clangxx=11.0.1
+  - clangxx_osx-64=11.0.1
+  - compiler-rt=11.0.1
+  - compiler-rt_osx-64=11.0.1
+  - curl=7.71.1
+  - fontconfig=2.13.1
+  - freetype=2.10.4
+  - fribidi=1.0.10
+  - gettext=0.19.8.1
+  - gfortran_impl_osx-64=9.3.0
+  - gfortran_osx-64=9.3.0
+  - gmp=6.2.1
+  - graphite2=1.3.14
+  - gsl=2.6
+  - harfbuzz=2.7.4
+  - icu=68.1
+  - isl=0.22.1
+  - jinja2=2.11.3
+  - joblib=1.0.1
+  - jpeg=9d
+  - krb5=1.17.2
+  - ld64_osx-64=530
+  - ldid=2.1.2
+  - leidenalg=0.8.3
+  - libblas=3.9.0
+  - libcblas=3.9.0
+  - libclang-cpp11=11.0.1
+  - libcurl=7.71.1
+  - libcxx=11.0.1
+  - libdeflate=1.0
+  - libedit=3.1.20191231
+  - libev=4.33
+  - libffi=3.3
+  - libglib=2.66.7
+  - libiconv=1.16
+  - liblapack=3.9.0
+  - libllvm11=11.0.1
+  - libnghttp2=1.43.0
+  - libopenblas=0.3.12
+  - libpng=1.6.37
+  - libssh2=1.9.0
+  - libtiff=4.2.0
+  - libwebp-base=1.2.0
+  - libxml2=2.9.10
+  - llvm-openmp=11.0.1
+  - llvm-tools=11.0.1
+  - lz4-c=1.9.3
+  - make=4.3
+  - markupsafe=1.1.1
+  - mpc=1.1.0
+  - mpfr=4.0.2
+  - ncurses=6.2
+  - numpy=1.20.1
+  - openssl=1.1.1j
+  - pandas=1.2.2
+  - pango=1.42.4
+  - pcre=8.44
+  - pcre2=10.36
+  - pip=21.0.1
+  - pixman=0.40.0
+  - psutil=5.8.0
+  - pycairo=1.20.0
+  - pycparser=2.20
+  - pysam=0.15.3
+  - python=3.7.10
+  - python-dateutil=2.8.1
+  - python-igraph=0.8.3
+  - python_abi=3.7
+  - pytz=2021.1
+  - readline=8.1
+  - scikit-learn=0.24.1
+  - scipy=1.6.0
+  - setuptools=52.0.0
+  - simplegeneric=0.8.1
+  - six=1.15.0
+  - sqlite=3.34.0
+  - tapi=1100.0.11
+  - texttable=1.6.3
+  - threadpoolctl=2.1.0
+  - tk=8.6.10
+  - tktable=2.10
+  - tqdm=4.58.0
+  - tzlocal=2.1
+  - wheel=0.36.2
+  - xz=5.2.5
+  - zlib=1.2.11
+  - zstd=1.4.8
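For orientation, a successful pipeline run leaves the following files in OUTDIR (names taken from the code above):

    OUTDIR/
      viralcc.log                   # run log
      viral_contig_info.csv         # name, length, GC of each viral contig
      prokaryotic_contig_info.csv   # name, length, GC of each prokaryotic contig
      cluster_viral_contig.txt      # contig -> group assignment
      VIRAL_BIN/VIRAL_BIN0000.fa    # one fasta file per viral bin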