Skip to content

Commit

Permalink
Merge pull request #43 from martinghunt/spades_3.6.2
Browse files Browse the repository at this point in the history
Spades 3.6.2
  • Loading branch information
John Tate committed Nov 30, 2015
2 parents edd11fb + 37e1bb3 commit 9d517da
Show file tree
Hide file tree
Showing 30 changed files with 12,076 additions and 54 deletions.
1 change: 1 addition & 0 deletions circlator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
__all__ = [
'assemble',
'assembly',
'bamfilter',
'clean',
'common',
Expand Down
153 changes: 153 additions & 0 deletions circlator/assembly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import os
import pyfastaq

class Error (Exception): pass

class Assembly:
def __init__(self, path):
'''path can be a directory or a filename. If directory, assumes the name of a SPAdes
output directory. If a file, assumes it is a fasta file of contigs'''
if not os.path.exists(path):
raise Error('Input path to Assembly.__init__ not found: ' + path)
elif os.path.isdir(path):
self.spades_dir = os.path.abspath(path)
else:
self.contigs_fasta = os.path.abspath(path)
self.spades_dir = None

self._set_filenames()


@staticmethod
def _file_exists(filename):
if os.path.exists(filename):
return filename
else:
return None


def _set_filenames(self):
self.contigs_fastg = None
self.contigs_paths = None
self.assembly_graph_fastg = None

if self.spades_dir is None:
return

contigs_fasta = os.path.join(self.spades_dir, 'contigs.fasta')
self.contigs_fasta = self._file_exists(os.path.join(contigs_fasta))

if self.contigs_fasta is None:
raise Error('Error finding SPAdes contigs file: ' + contigs_fasta)

self.contigs_fastg = self._file_exists(os.path.join(self.spades_dir, 'contigs.fastg'))
self.contigs_paths = self._file_exists(os.path.join(self.spades_dir, 'contigs.paths'))
self.assembly_graph_fastg = self._file_exists(os.path.join(self.spades_dir, 'assembly_graph.fastg'))

if None == self.contigs_fastg == self.contigs_paths == self.assembly_graph_fastg or \
( self.contigs_fastg is None and (None in {self.contigs_paths, self.assembly_graph_fastg}) ) or \
( self.contigs_fastg is not None and (self.contigs_paths is not None or self.assembly_graph_fastg is not None) ):
error_message = '\n'.join([
'Error finding SPAdes graph files in the directory ' + self.spades_dir,
'Expected either:',
' contigs.fastg (SPAdes <3.6.1)',
'or:',
' contigs.paths and assembly_graph.fastg (SPAdes >3.6.1)'
])
raise Error(error_message)


def get_contigs(self):
'''Returns a dictionary of contig_name -> pyfastaq.Sequences.Fasta object'''
contigs = {}
pyfastaq.tasks.file_to_dict(self.contigs_fasta, contigs)
return contigs


@classmethod
def _circular_contigs_from_spades_before_3_6_1(cls, fastg_file):
seq_reader = pyfastaq.sequences.file_reader(fastg_file)
names = set([x.id.rstrip(';') for x in seq_reader if ':' in x.id])
found_fwd = set()
found_rev = set()
for name in names:
l = name.split(':')
if len(l) != 2:
continue
if l[0] == l[1]:
if l[0][-1] == "'":
found_rev.add(l[0][:-1])
else:
found_fwd.add(l[0])

return found_fwd.intersection(found_rev)


@classmethod
def _spades_contigs_paths_to_dict(cls, filename):
d = {}
node = None

with open(filename) as f:
for line in f:
if node is None:
if not line.startswith('NODE_'):
raise Error('Error loading info from SPAdes contigs path file ' + filename)
node = line.rstrip()
else:
if line.startswith('NODE_') or node in d:
raise Error('Error loading info from SPAdes contigs path file ' + filename)
d[node] = line.rstrip()
node = None

return d


@classmethod
def _circular_edges_to_edge_numbers_dict(cls, circular_edges):
'''Edge names are like this: EDGE_1_length_44766_cov_16.555.
This returns dict of edge number -> edge name.
eg 1 -> EDGE_1_length_44766_cov_16.555.'''
return {x.split('_')[1]: x for x in circular_edges}


@staticmethod
def _circular_contigs_from_spades_after_3_6_1(assembly_graph_fastg, contigs_paths):
circular_graph_edges = Assembly._circular_contigs_from_spades_before_3_6_1(assembly_graph_fastg)
circular_edge_dict = Assembly._circular_edges_to_edge_numbers_dict(circular_graph_edges)
paths_dict = Assembly._spades_contigs_paths_to_dict(contigs_paths)
circular_nodes = set()

for node in paths_dict:
if node.endswith("'"):
continue

edges = paths_dict[node].split(',')
rev_node = node + "'"

if len(edges) != 1 or rev_node not in paths_dict:
continue

rev_edges = paths_dict[rev_node].split(',')

if len(rev_edges) != 1:
continue

edge = list(edges)[0][:-1]
edge_strand = list(edges)[0][-1]
rev_edge = list(rev_edges)[0][:-1]
rev_edge_strand = list(rev_edges)[0][-1]
if {'-', '+'} == {edge_strand, rev_edge_strand} and edge == rev_edge and edge in circular_edge_dict:
circular_nodes.add(node)

return circular_nodes


def circular_contigs(self):
'''Returns a set of the contig names that are circular'''
if self.contigs_fastg is not None:
return self._circular_contigs_from_spades_before_3_6_1(self.contigs_fastg)
elif None not in [self.contigs_paths, self.assembly_graph_fastg]:
return self._circular_contigs_from_spades_after_3_6_1(self.assembly_graph_fastg, self.contigs_paths)
else:
return set()
2 changes: 1 addition & 1 deletion circlator/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class Error (Exception): pass

version = '1.1.1'
version = '1.1.2'

def syscall(cmd, allow_fail=False, verbose=False):
if verbose:
Expand Down
16 changes: 5 additions & 11 deletions circlator/external_progs.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ def exe(self):
}


max_versions = {
'spades': '3.6.0'
bad_versions = {
'spades': '3.6.1'
}


prog_name_to_default = {
'bwa': 'bwa',
'nucmer': 'nucmer',
Expand Down Expand Up @@ -88,17 +89,10 @@ def make_and_check_prog(name, verbose=False, raise_error=True, filehandle=None):
handle_error('Version of ' + name + ' too low. I found ' + p.version() + ', but must be at least ' + min_versions[name] + '. Found here:\n' + p.full_path, raise_error=raise_error)
return p

if name != 'spades' and name in max_versions and not p.version_at_most(max_versions[name]):
handle_error('Version of ' + name + ' too high. I found ' + p.version() + ', but must be at most ' + max_versions[name] + '. Found here:\n' + p.full_path, raise_error=raise_error)
if name == 'spades' and p.version() == bad_versions['spades']:
handle_error('ERROR! SPAdes version ' + bad_versions['spades'] + ' is incompatible with Circlator. Please update SPAdes to the latest version', raise_error=raise_error)
return p

if name == 'spades' and not p.version_at_most(max_versions['spades']):
print('WARNING: SPAdes version', p.version(), 'found, which is newer than 3.6.0.', file=sys.stderr)
print(' Circlator can use this version, but some circularizations may be missed.', file=sys.stderr)
print(' We recommend that you use version 3.6.0, which is available for Linux using:', file=sys.stderr)
print(' wget http://spades.bioinf.spbau.ru/release3.6.0/SPAdes-3.6.0-Linux.tar.gz', file=sys.stderr)


if verbose:
print('Using', name, 'version', p.version(), 'as', p.full_path)

Expand Down
55 changes: 14 additions & 41 deletions circlator/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ def __init__(
threads=1,
log_prefix='merge',
):
for f in [original_assembly, reassembly]:
if not os.path.exists(f):
raise Error('File not found:' + f)
if not os.path.exists(original_assembly):
raise Error('File not found:' + original_assembly)

self.original_fasta = original_assembly
self.reassembly_fasta = reassembly
self.reassembly = circlator.assembly.Assembly(reassembly)
self.reads = reads
self.outprefix = outprefix
self.nucmer_diagdiff = nucmer_diagdiff
Expand All @@ -53,9 +52,8 @@ def __init__(
self.log_prefix = log_prefix
self.merges = []
self.original_contigs = {}
self.reassembly_contigs = {}
self.reassembly_contigs = self.reassembly.get_contigs()
pyfastaq.tasks.file_to_dict(self.original_fasta, self.original_contigs)
pyfastaq.tasks.file_to_dict(self.reassembly_fasta, self.reassembly_contigs)


def _run_nucmer(self, ref, qry, outfile):
Expand Down Expand Up @@ -338,23 +336,7 @@ def _circularise_contigs(self, nucmer_hits):
to_circularise_with_nucmer = self._get_possible_circular_ref_contigs(long_nucmer_hits, log_fh=log_fh, log_outprefix=log_outprefix)
to_circularise_with_nucmer = self._remove_keys_from_dict_with_nonunique_values(to_circularise_with_nucmer, log_fh=log_fh, log_outprefix=log_outprefix)
used_spades_contigs = set()
reassembly_fastg = self.reassembly_fasta[:-1] + 'g'

if os.path.exists(reassembly_fastg):
called_as_circular_by_spades = self._get_spades_circular_nodes(reassembly_fastg)
else:
called_as_circular_by_spades = set()
warning_lines = [
'WARNING: FASTG reassembly file ' + reassembly_fastg + ' not found. If the reassembly was not made with SPAdes, this is normal (but you miss out on the extra information that SPAdes can output).',
'WARNING: ... If the reassembly was made with SPAdes, then please consider using version 3.6.0 of SPAdes, which does make this file.',
'WARNING: ... SPAdes 3.6.0 can be obtained for Linux like this:',
'WARNING: ... wget http://spades.bioinf.spbau.ru/release3.6.0/SPAdes-3.6.0-Linux.tar.gz',
]
for line in warning_lines:
print(log_outprefix, line, sep='\t', file=log_fh)
print(log_outprefix, line, sep='\t', file=sys.stderr)

print(log_outprefix, 'WARNING: ... this message has also been written to the circularise_details.log file.', sep='\t', file=sys.stderr)
called_as_circular_by_spades = self.reassembly.circular_contigs()

if len(called_as_circular_by_spades):
circular_string = ','.join(sorted(called_as_circular_by_spades))
Expand Down Expand Up @@ -667,12 +649,11 @@ def _write_act_files(self, ref_fasta, qry_fasta, coords_file, outprefix):
def _iterative_bridged_contig_pair_merge(self, outprefix):
'''Iteratively merges contig pairs using bridging contigs from reassembly, until no more can be merged'''
if self.reads is None:
return self.original_fasta, self.reassembly_fasta, None, None
return self.original_fasta, self.reassembly, None, None

log_file = outprefix + '.iterations.log'
log_fh = pyfastaq.utils.open_file_write(log_file)
genome_fasta = self.original_fasta
reassembly_fasta = self.reassembly_fasta
nucmer_coords = outprefix + '.iter.1.coords'
reads_to_map = self.reads
act_prefix = None
Expand All @@ -682,10 +663,10 @@ def _iterative_bridged_contig_pair_merge(self, outprefix):
while made_a_join:
this_log_prefix = '[' + self.log_prefix + ' iterative_merge ' + str(iteration) + ']'
print(this_log_prefix, '\tUsing nucmer matches from ', nucmer_coords, sep='', file=log_fh)
self._run_nucmer(genome_fasta, reassembly_fasta, nucmer_coords)
self._run_nucmer(genome_fasta, self.reassembly.contigs_fasta, nucmer_coords)
act_prefix = outprefix + '.iter.' + str(iteration)
print(this_log_prefix, '\tYou can view the nucmer matches with ACT using: ./', act_prefix, '.start_act.sh', sep='', file=log_fh)
self._write_act_files(genome_fasta, reassembly_fasta, nucmer_coords, act_prefix)
self._write_act_files(genome_fasta, self.reassembly.contigs_fasta, nucmer_coords, act_prefix)
nucmer_hits_by_ref = self._load_nucmer_hits(nucmer_coords)
made_a_join = self._merge_all_bridged_contigs(nucmer_hits_by_ref, self.original_contigs, self.reassembly_contigs, log_fh, this_log_prefix)
iteration += 1
Expand All @@ -694,10 +675,7 @@ def _iterative_bridged_contig_pair_merge(self, outprefix):
print(this_log_prefix, '\tMade at least one merge. Remapping reads and reassembling',sep='', file=log_fh)
nucmer_coords = outprefix + '.iter.' + str(iteration) + '.coords'
genome_fasta = outprefix + '.iter.' + str(iteration) + '.merged.fasta'
reassembly_fasta = outprefix + '.iter.' + str(iteration) + '.reassembly.fasta'
reassembly_fastg = outprefix + '.iter.' + str(iteration) + '.reassembly.fastg'
self._contigs_dict_to_file(self.original_contigs, genome_fasta)
self._contigs_dict_to_file(self.reassembly_contigs, reassembly_fasta)
bam = outprefix + '.iter.' + str(iteration) + '.bam'

circlator.mapping.bwa_mem(
Expand All @@ -722,18 +700,13 @@ def _iterative_bridged_contig_pair_merge(self, outprefix):
spades_use_first_success=self.spades_use_first_success,
)
a.run()
os.rename(os.path.join(assembler_dir, 'contigs.fasta'), reassembly_fasta)
contigs_fastg = os.path.join(assembler_dir, 'contigs.fastg')
if os.path.exists(contigs_fastg):
os.rename(contigs_fastg, reassembly_fastg)

shutil.rmtree(assembler_dir)
pyfastaq.tasks.file_to_dict(reassembly_fasta, self.reassembly_contigs)
self.reassembly = circlator.assembly.Assembly(assembler_dir)
self.reassembly_contigs = self.reassembly.get_contigs()
elif iteration <= 2:
print(this_log_prefix, '\tNo contig merges were made',sep='', file=log_fh)

pyfastaq.utils.close(log_fh)
return genome_fasta, reassembly_fasta, nucmer_coords, act_prefix + '.start_act.sh'
return genome_fasta, nucmer_coords, act_prefix + '.start_act.sh'


def _contigs_dict_to_file(self, contigs, fname):
Expand Down Expand Up @@ -817,13 +790,13 @@ def _write_merge_log(self, filename):

def run(self):
out_log = os.path.abspath(self.outprefix + '.merge.log')
self.original_fasta, self.reassembly_fasta, nucmer_coords_file, act_script = self._iterative_bridged_contig_pair_merge(self.outprefix + '.merge')
self.original_fasta, nucmer_coords_file, act_script = self._iterative_bridged_contig_pair_merge(self.outprefix + '.merge')
self._write_merge_log(self.outprefix + '.merge.log')
nucmer_circularise_coords = os.path.abspath(self.outprefix + '.circularise.coords')

if nucmer_coords_file is None:
self._run_nucmer(self.original_fasta, self.reassembly_fasta, nucmer_circularise_coords)
self._write_act_files(self.original_fasta, self.reassembly_fasta, nucmer_circularise_coords, self.outprefix + '.circularise')
self._run_nucmer(self.original_fasta, self.reassembly.contigs_fasta, nucmer_circularise_coords)
self._write_act_files(self.original_fasta, self.reassembly.contigs_fasta, nucmer_circularise_coords, self.outprefix + '.circularise')
else:
os.symlink(nucmer_coords_file, nucmer_circularise_coords)
os.symlink(act_script, self.outprefix + '.circularise.start_act.sh')
Expand Down
Loading

0 comments on commit 9d517da

Please sign in to comment.