From 9eb00395bf5b8aeffb10fa0cbb2666b1652e624e Mon Sep 17 00:00:00 2001 From: BuysDB Date: Thu, 28 Jan 2021 14:22:11 +0100 Subject: [PATCH 01/17] Add ligation demux --- .../demultiplexModules/scCHIC.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py index a3ecb41b..06edbcdb 100644 --- a/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py +++ b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py @@ -69,6 +69,71 @@ def demultiplex(self, records, **kwargs): #taggedRecords[0].qualities = taggedRecords[0].qualities[1:] return taggedRecords +class SCCHIC_384w_c8_u3_direct_ligation(UmiBarcodeDemuxMethod): + def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs): + self.barcodeFileAlias = 'maya_384NLA' + UmiBarcodeDemuxMethod.__init__( + self, + umiRead=0, + umiStart=0, + umiLength=3, + barcodeRead=0, + barcodeStart=3, + barcodeLength=8, + random_primer_read=random_primer_read, + random_primer_length=random_primer_length, + barcodeFileAlias=self.barcodeFileAlias, + barcodeFileParser=barcodeFileParser, + **kwargs) + self.shortName = 'scCHIC384C8U3l' + self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, no RP' + self.autoDetectable = False + self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 does not contain a random primer' + + self.sequenceCapture[0] = slice( + self.barcodeLength + self.umiLength + 1, + None) # dont capture the first base + + def demultiplex(self, records, **kwargs): + + if kwargs.get( + 'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T': + raise NonMultiplexable + + + # add first 2 bases as ligation tag: + ligation_start = self.barcodeLength + self.umiLength + ligation_end = ligation_start + 2 + ligation_sequence = records[0].sequence[ligation_start:ligation_end] + ligation_qualities = records[0].qual[ligation_start:ligation_end] + + taggedRecords = UmiBarcodeDemuxMethod.demultiplex( + self, records, **kwargs) + + taggedRecords[0].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[0].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + taggedRecords[1].addTagByTag( + 'lh', + ligation_sequence, + isPhred=False, + make_safe=False) + taggedRecords[1].addTagByTag( + 'lq', + ligation_qualities, + isPhred=True, + make_safe=False) + #taggedRecords[0].sequence = taggedRecords[0].sequence[1:] + #taggedRecords[0].qualities = taggedRecords[0].qualities[1:] + return taggedRecords + From e21e6e190d47e1ea9c14efb454dd411a4af95bcb Mon Sep 17 00:00:00 2001 From: BuysDB Date: Thu, 28 Jan 2021 14:22:27 +0100 Subject: [PATCH 02/17] Load the new demux module --- .../modularDemultiplexer/demultiplexingStrategyLoader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py b/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py index 72cf4c62..72cecdc8 100644 --- a/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py +++ b/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py @@ -61,6 +61,8 @@ def __init__( dm.SCCHIC_384w_c8_u3, dm.SCCHIC_384w_c8_u3_cs2, dm.SCCHIC_384w_c8_u3_pdt, + dm.SCCHIC_384w_c8_u3_direct_ligation, + dm.MSPJI_c8_u3, dm.ScartraceR2, dm.ScartraceR1, From 250d6f4b42b771cd553fa774466a8dc15e2e6cd9 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Thu, 28 Jan 2021 14:22:49 +0100 Subject: [PATCH 03/17] Add callback function for the pysam writer --- singlecellmultiomics/molecule/molecule.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/singlecellmultiomics/molecule/molecule.py b/singlecellmultiomics/molecule/molecule.py index 478cb36b..babea7cf 100644 --- a/singlecellmultiomics/molecule/molecule.py +++ b/singlecellmultiomics/molecule/molecule.py @@ -1824,17 +1824,24 @@ def get_mean_rt_fragment_size(self): self.get_rt_reaction_fragment_sizes() ) - def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None): + def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None, consensus_read_callback=None, consensus_read_callback_kwargs=None): """Write all associated reads to the target file Args: target_file (pysam.AlignmentFile) : Target file consensus (bool) : write consensus no_source_reads (bool) : while in consensus mode, don't write original reads + consensus_read_callback (function) : this function is called with every consensus read as an arguments + consensus_read_callback_kwargs (dict) : arguments to pass to the callback function """ if consensus: reads = self.deduplicate_majority(target_file, f'molecule_{uuid4()}' if consensus_name is None else consensus_name) + if consensus_read_callback is not None: + if consensus_read_callback_kwargs is not None: + consensus_read_callback(reads, **consensus_read_callback_kwargs) + else: + consensus_read_callback(reads) for read in reads: target_file.write(read) From 702feb3f5636deb00b6d086a184b43341550a6c6 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Thu, 28 Jan 2021 14:23:02 +0100 Subject: [PATCH 04/17] Added complete match property --- singlecellmultiomics/molecule/molecule.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/singlecellmultiomics/molecule/molecule.py b/singlecellmultiomics/molecule/molecule.py index babea7cf..3b2cd6be 100644 --- a/singlecellmultiomics/molecule/molecule.py +++ b/singlecellmultiomics/molecule/molecule.py @@ -1672,6 +1672,22 @@ def aligned_length(self) -> int: else: return None + @property + def is_completely_matching(self) -> bool: + """ + Checks if all associated reads are completely mapped: + checks if all cigar operations are M, + Returns True when all cigar operations are M, False otherwise + """ + + return all( + ( + all( + [ (operation==0) + for operation, amount in read.cigartuples] ) + for read in self.iter_reads())) + + @property def estimated_max_length(self) -> int: """ From 24cf4f15dc61f802497d029a9b1dcaa5405ef387 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Thu, 28 Jan 2021 15:53:25 +0100 Subject: [PATCH 05/17] Version bump --- singlecellmultiomics/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecellmultiomics/version.py b/singlecellmultiomics/version.py index b04cffbb..60eb1afd 100644 --- a/singlecellmultiomics/version.py +++ b/singlecellmultiomics/version.py @@ -1 +1 @@ -__version__ = '0.1.17' +__version__ = '0.1.18' From 6bdc7ffe6fc5069393cbea75e0dd737f3ad39d8f Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 11:04:37 +0100 Subject: [PATCH 06/17] Added test cases for random primer autodetect --- tests/test_fragment.py | 111 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/tests/test_fragment.py b/tests/test_fragment.py index f4677eb5..9ce9e0b2 100644 --- a/tests/test_fragment.py +++ b/tests/test_fragment.py @@ -5,11 +5,120 @@ import singlecellmultiomics.fragment import pysam import pysamiterators.iterators +import os +from singlecellmultiomics.fragment import Fragment, CHICFragment +from singlecellmultiomics.utils import create_MD_tag """ These tests check if the Molecule module is working correctly """ +class TestTAPs(unittest.TestCase): -class TestFragment(unittest.TestCase): + + def test_random_priming(self): + temp_folder = 'data' + + enable_ref_write=False + ref_path = f'{temp_folder}/chic_ref.fa' + alignments_path = f'{temp_folder}/chic_test_alignments.bam' + + if not os.path.exists(temp_folder): + os.makedirs(temp_folder) + # Create reference bam file + + refseq = 'TTAATCATGAAACCGTGGAGGCAAATCGGAGTGTAAGGCTTGACTGGATTCCTACGTTGCGTAGGTTCATGGGGGG' + if enable_ref_write: + with open(ref_path, 'w') as f: + f.write(f">chr1\n{refseq}\n>chr2\n{complement(refseq)}\n""") + + # This command needs to finish, which is not working properly during testing + pysam.faidx(ref_path) + + # CATG at base 5 + # Create BAM file with NLA fragment: + + def get_reads(): + with pysam.AlignmentFile(alignments_path_unsorted,'wb',reference_names=['chr1'],reference_lengths=[len(refseq)]) as bam: + + + ### Nla III mate pair example, containing 2 CpGs and 1 call on the wrong strand + read_A = pysam.AlignedSegment(bam.header) + read_A.reference_name = 'chr1' + read_A.reference_start = 5 + # Before last A is a bogus G>A conversion to test strandness: + read_A.query_sequence = 'CATGAAACCGTGGAGGCAAATTGGAGTAT' + read_A.cigarstring = f'{len(read_A.query_sequence)}M' + read_A.qual = 'A'*len(read_A.query_sequence) + read_A.mapping_quality = 60 + read_A.query_name = 'EX1_GA_CONV_2x_CpG_TAPS' + read_A.set_tag('SM', 'Cell_A') + read_A.is_read1 = True + read_A.is_read2 = False + read_A.set_tag('lh','TG') + # Set substitution tag: + read_A.set_tag('MD', + create_MD_tag( + refseq[read_A.reference_start:read_A.reference_end], read_A.query_sequence)) + read_A.is_paired = True + read_A.is_proper_pair = True + + # Create a second read which is a mate of the previous + read_B = pysam.AlignedSegment(bam.header) + read_B.reference_name = 'chr1' + read_B.reference_start = 25 + read_B.query_sequence = refseq[25:60].replace('TGT','TAT').replace('CG', 'TG') + read_B.cigarstring = f'{len(read_B.query_sequence)}M' + read_B.qual = 'A'*len(read_B.query_sequence) + read_B.mapping_quality = 60 + read_B.is_read2 = True + read_B.is_read1 = False + read_B.is_reverse = True + read_B.query_name = 'EX1_GA_CONV_2x_CpG_TAPS' + read_B.set_tag('SM', 'Cell_A') + read_B.set_tag('lh','TG') + read_B.set_tag('MD', + create_MD_tag(refseq[read_B.reference_start:read_B.reference_end], + read_B.query_sequence, + )) + read_B.is_paired = True + read_B.is_proper_pair = True + + read_A.next_reference_id = read_B.reference_id + read_A.next_reference_start = read_B.reference_start + read_B.next_reference_id = read_A.reference_id + read_B.next_reference_start = read_A.reference_start + + read_A.mate_is_reverse = read_B.is_reverse + read_B.mate_is_reverse = read_A.is_reverse + + return read_A, read_B + + + + alignments_path_unsorted = f'{alignments_path}.unsorted.bam' + + read_A, read_B = get_reads() + read_B.set_tag('rS','AATTAA') # Set random primer sequence + ## Act on reads with random primer set: + frag = Fragment([read_A, read_B]) + self.assertTrue(frag.R2_primer_length==0) + self.assertTrue(frag.unsafe_trimmed) + + ## Act on reads without random primer + read_A, read_B = get_reads() + frag = Fragment([read_A, read_B], R2_primer_length=10) + self.assertTrue(frag.R2_primer_length==10) + self.assertFalse(frag.unsafe_trimmed) + + read_A, read_B = get_reads() + frag = CHICFragment([read_A, read_B], R2_primer_length=0) + self.assertTrue(frag.R2_primer_length==0) + self.assertFalse(frag.unsafe_trimmed) + + read_A, read_B = get_reads() + read_B.set_tag('rS','AATTAA') # Set random primer sequence + frag = CHICFragment([read_A, read_B], R2_primer_length=0) + self.assertTrue(frag.R2_primer_length==0) + self.assertTrue(frag.unsafe_trimmed) def test_init(self): """Test if the fragment can be initialised""" From f58b73e76e60b1ced4f24b0de0c12467c12e6585 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 11:04:49 +0100 Subject: [PATCH 07/17] Added some helper methods --- singlecellmultiomics/utils/sequtils.py | 28 ++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/singlecellmultiomics/utils/sequtils.py b/singlecellmultiomics/utils/sequtils.py index 36e15405..c7a10e32 100644 --- a/singlecellmultiomics/utils/sequtils.py +++ b/singlecellmultiomics/utils/sequtils.py @@ -25,8 +25,32 @@ def prefetch(self, contig, start, end): +def get_chromosome_number(chrom: str) -> int: + """ + Get chromosome number (index) of the supplied chromosome: + '1' -> 1, chr1 -> 1, returns -1 when not available, chrM -> -1 + """ + try: + return int(chrom.replace('chr','')) + except Exception as e: + return -1 + +def is_autosome(chrom: str) -> bool: + """ Returns True when the chromsome is an autosomal chromsome, + not an alternative allele, mitochrondrial or sex chromosome + + Args: + chrom(str) : chromosome name + + Returns: + is_main(bool) : True when the chromsome is an autosome + + """ + return is_main_chromosome(chrom) and get_chromosome_number(chrom)!=-1 + + -def is_main_chromosome(chrom): +def is_main_chromosome(chrom: str) -> bool: """ Returns True when the chromsome is a main chromsome, not an alternative or other @@ -42,7 +66,7 @@ def is_main_chromosome(chrom): return False return True -def get_contig_list_from_fasta(fasta_path, with_length=False): +def get_contig_list_from_fasta(fasta_path: str, with_length: bool=False) -> list: """Obtain list of contigs froma fasta file, all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO From 6006ee0e4ca1c2f00945bdd1118efbf7f0531dc5 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 11:05:20 +0100 Subject: [PATCH 08/17] Rp autodetect --- singlecellmultiomics/fragment/chic.py | 11 ++++++++++- singlecellmultiomics/fragment/fragment.py | 7 ++++++- singlecellmultiomics/molecule/molecule.py | 3 ++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/singlecellmultiomics/fragment/chic.py b/singlecellmultiomics/fragment/chic.py index ad9593e4..e4ea4b98 100644 --- a/singlecellmultiomics/fragment/chic.py +++ b/singlecellmultiomics/fragment/chic.py @@ -6,7 +6,7 @@ class CHICFragment(Fragment): def __init__(self, reads, R1_primer_length=4, - R2_primer_length=6, + R2_primer_length=0, # Is automatically detected now assignment_radius=0, umi_hamming_distance=1, invert_strand=False, @@ -14,6 +14,15 @@ def __init__(self, **kwargs ): self.invert_strand = invert_strand + + # Perform random primer autodect, + # When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer + for read in reads: + if read is not None and read.has_tag('MX'): + if read.get_tag('MX')[-1]=='l': + R2_primer_length = 0 + break + Fragment.__init__(self, reads, assignment_radius=assignment_radius, diff --git a/singlecellmultiomics/fragment/fragment.py b/singlecellmultiomics/fragment/fragment.py index f4e6ba55..d36048a2 100644 --- a/singlecellmultiomics/fragment/fragment.py +++ b/singlecellmultiomics/fragment/fragment.py @@ -74,7 +74,8 @@ def __init__(self, reads, length of R1 primer, these bases are not taken into account when calculating a consensus R2_primer_length(int): - length of R2 primer, these bases are not taken into account when calculating a consensus + length of R2 primer, these bases are not taken into account when calculating a consensus, this length is auto-detected/ overwritten when the rS tag is set + tag_definitions(list): sam TagDefinitions @@ -405,6 +406,10 @@ def get_random_primer_hash(self): if R2 is None or R2.query_sequence is None: return None, None, None + + if self.R2_primer_length == 0 and self.random_primer_sequence is None: + return None, None, None + # The read was not mapped if R2.is_unmapped: # Guess the orientation does not matter diff --git a/singlecellmultiomics/molecule/molecule.py b/singlecellmultiomics/molecule/molecule.py index 3b2cd6be..91969aa5 100644 --- a/singlecellmultiomics/molecule/molecule.py +++ b/singlecellmultiomics/molecule/molecule.py @@ -547,6 +547,7 @@ def write_tags(self): read.is_duplicate = True # Write RT reaction tags (rt: rt reaction index, rd rt duplicate index) + # This is only required for fragments which have defined random primers rt_reaction_index = None for rt_reaction_index, ( (contig, random_primer_start, random_primer_sequence), frags) in enumerate( self.get_rt_reactions().items()): @@ -1793,7 +1794,7 @@ def can_be_yielded(self, chromosome, position): self.cache_size * 0.5) - def get_rt_reactions(self): + def get_rt_reactions(self) -> dict: """Obtain RT reaction dictionary returns: From 78cf1321502623b725db299a9648ba2de29337b5 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 11:11:07 +0100 Subject: [PATCH 09/17] Final fixes for autodetect --- singlecellmultiomics/fragment/chic.py | 2 +- tests/test_fragment.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/singlecellmultiomics/fragment/chic.py b/singlecellmultiomics/fragment/chic.py index e4ea4b98..c92bed69 100644 --- a/singlecellmultiomics/fragment/chic.py +++ b/singlecellmultiomics/fragment/chic.py @@ -6,7 +6,7 @@ class CHICFragment(Fragment): def __init__(self, reads, R1_primer_length=4, - R2_primer_length=0, # Is automatically detected now + R2_primer_length=6, # Is automatically detected now assignment_radius=0, umi_hamming_distance=1, invert_strand=False, diff --git a/tests/test_fragment.py b/tests/test_fragment.py index 9ce9e0b2..7971ef59 100644 --- a/tests/test_fragment.py +++ b/tests/test_fragment.py @@ -106,20 +106,32 @@ def get_reads(): ## Act on reads without random primer read_A, read_B = get_reads() frag = Fragment([read_A, read_B], R2_primer_length=10) - self.assertTrue(frag.R2_primer_length==10) + self.assertEqual(frag.R2_primer_length, 10) self.assertFalse(frag.unsafe_trimmed) read_A, read_B = get_reads() frag = CHICFragment([read_A, read_B], R2_primer_length=0) - self.assertTrue(frag.R2_primer_length==0) + self.assertEqual(frag.R2_primer_length, 0) self.assertFalse(frag.unsafe_trimmed) read_A, read_B = get_reads() read_B.set_tag('rS','AATTAA') # Set random primer sequence frag = CHICFragment([read_A, read_B], R2_primer_length=0) - self.assertTrue(frag.R2_primer_length==0) + self.assertEqual(frag.R2_primer_length, 0) self.assertTrue(frag.unsafe_trimmed) + read_A, read_B = get_reads() + read_B.set_tag('MX','scCHIC384C8U3l') + frag = CHICFragment([read_A, read_B]) + self.assertEqual(frag.R2_primer_length, 0) + self.assertFalse(frag.unsafe_trimmed) + + read_A, read_B = get_reads() + read_B.set_tag('MX','scCHIC384C8U3') + frag = CHICFragment([read_A, read_B]) + self.assertEqual(frag.R2_primer_length,6) + self.assertFalse(frag.unsafe_trimmed) + def test_init(self): """Test if the fragment can be initialised""" with pysam.AlignmentFile('./data/mini_nla_test.bam') as f: From 74fa6175c8ae3d57ec1b3526163d416be0e56015 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 11:15:53 +0100 Subject: [PATCH 10/17] Test with newer python versions --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 31a9eb7e..e1814331 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,8 @@ dist: xenial # required for Python >= 3.7 language: python python: - - "3.6" - - "3.6.1" - "3.7" + - "3.8" # command to install dependencies install: From 653c6dba0df1dacf5370c59fd0bcd89b918c2842 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 11:21:36 +0100 Subject: [PATCH 11/17] define unused variable --- tests/test_fragment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_fragment.py b/tests/test_fragment.py index 7971ef59..5e1bdc78 100644 --- a/tests/test_fragment.py +++ b/tests/test_fragment.py @@ -8,6 +8,7 @@ import os from singlecellmultiomics.fragment import Fragment, CHICFragment from singlecellmultiomics.utils import create_MD_tag +from singlecellmultiomics.utils import complement """ These tests check if the Molecule module is working correctly """ From 0f5bdf6f6920b22fa08be128b372d30b37bfbd25 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 13:49:59 +0100 Subject: [PATCH 12/17] Updated dependencies --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6d4daf1f..0d7c3dec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ pysam>=0.15.3 -numpy>=1.16.4 +numpy>=1.16.5 pandas>=0.25.0 colorama pysamiterators>=1.8 diff --git a/setup.py b/setup.py index d9895712..fb17a11b 100644 --- a/setup.py +++ b/setup.py @@ -138,7 +138,7 @@ ], install_requires=[ - 'pysam>=0.15.3','numpy>=1.16.1','pandas>=0.25.0','colorama','pyBigWig', + 'pysam>=0.15.3','numpy>=1.16.5','pandas>=0.25.0','colorama','pyBigWig', 'cutadapt>=2.9', 'pysamiterators>=1.8','more-itertools','matplotlib','tabulate', 'wheel','setuptools>=40.8.0','scikit-learn>=0.21.3','seaborn>=0.11.0', 'statsmodels', 'cached_property', From 7ad45165d55cd338180589af7487065c50c2f272 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Fri, 29 Jan 2021 21:49:26 +0100 Subject: [PATCH 13/17] Use computed fragment length as fS tag value --- singlecellmultiomics/fragment/fragment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecellmultiomics/fragment/fragment.py b/singlecellmultiomics/fragment/fragment.py index d36048a2..865f7f7d 100644 --- a/singlecellmultiomics/fragment/fragment.py +++ b/singlecellmultiomics/fragment/fragment.py @@ -349,7 +349,7 @@ def write_tags(self): if self.has_valid_span(): # Write fragment size: if self.safe_span: - self.set_meta('fS', self.get_fragment_size()) + self.set_meta('fS', self.estimated_length) self.set_meta('fe', self.span[1]) self.set_meta('fs', self.span[2]) else: From 43cc432bdffbfcdd4ca52629c02aeeaacc8f5446 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Sat, 30 Jan 2021 22:48:38 +0100 Subject: [PATCH 14/17] Added --allow-single-end flag --- .../universalBamTagger/tapsTabulator.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/singlecellmultiomics/universalBamTagger/tapsTabulator.py b/singlecellmultiomics/universalBamTagger/tapsTabulator.py index f529100b..a5a94a86 100644 --- a/singlecellmultiomics/universalBamTagger/tapsTabulator.py +++ b/singlecellmultiomics/universalBamTagger/tapsTabulator.py @@ -124,6 +124,11 @@ def finish_bam(output, args, temp_out): '-bamout', type=str, help="optional (tagged) output BAM path") + argparser.add_argument( + '--allow_single_end', + type=str, + help='Allow single end reads') + args = argparser.parse_args() alignments = pysam.AlignmentFile(args.alignmentfile) @@ -180,6 +185,12 @@ def finish_bam(output, args, temp_out): 'min_phred_score':args.min_phred_score } + + if args.allow_single_end: + # Single end base calls are "unsafe", allow them : + molecule_class_args['allow_unsafe_base_calls'] = True + + s = args.moleculeNameSep try: for i, molecule in enumerate(singlecellmultiomics.molecule.MoleculeIterator( From 11ad20968235646553de5c936fbe20b7b04743f8 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Sat, 30 Jan 2021 22:53:02 +0100 Subject: [PATCH 15/17] With correct action --- singlecellmultiomics/universalBamTagger/tapsTabulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecellmultiomics/universalBamTagger/tapsTabulator.py b/singlecellmultiomics/universalBamTagger/tapsTabulator.py index a5a94a86..00bff11b 100644 --- a/singlecellmultiomics/universalBamTagger/tapsTabulator.py +++ b/singlecellmultiomics/universalBamTagger/tapsTabulator.py @@ -126,7 +126,7 @@ def finish_bam(output, args, temp_out): help="optional (tagged) output BAM path") argparser.add_argument( '--allow_single_end', - type=str, + action='store_true', help='Allow single end reads') args = argparser.parse_args() From 97888179737bbf0565136f87c5705585ff67576a Mon Sep 17 00:00:00 2001 From: BuysDB Date: Sat, 30 Jan 2021 23:05:40 +0100 Subject: [PATCH 16/17] Allow fragment length calculation on single end reads --- singlecellmultiomics/fragment/fragment.py | 11 +++++++++-- .../universalBamTagger/tapsTabulator.py | 16 +++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/singlecellmultiomics/fragment/fragment.py b/singlecellmultiomics/fragment/fragment.py index 865f7f7d..5dc9250b 100644 --- a/singlecellmultiomics/fragment/fragment.py +++ b/singlecellmultiomics/fragment/fragment.py @@ -55,7 +55,8 @@ def __init__(self, reads, mapping_dir=(False, True), max_NUC_stretch: int = None, read_group_format: int = 0, # R1 forward, R2 reverse - library_name: str = None # Overwrites the library name + library_name: str = None, # Overwrites the library name + single_end: bool = False ): """ Initialise Fragment @@ -114,6 +115,7 @@ def __init__(self, reads, self.read_group_format = read_group_format self.max_NUC_stretch = max_NUC_stretch self.qcfail = False + self.single_end = single_end # Span:\ self.span = [None, None, None] @@ -533,8 +535,13 @@ def estimated_length(self) -> int: Obtain the estimated size of the fragment, returns None when estimation is not possible Takes into account removed bases (R2) - Assumes inwards sequencing orientation + Assumes inwards sequencing orientation, except when self.single_end is set """ + if self.single_end: + if self[0] is None: + return None + return self[0].reference_end - self[0].reference_start + if self.has_R1() and self.has_R2(): diff --git a/singlecellmultiomics/universalBamTagger/tapsTabulator.py b/singlecellmultiomics/universalBamTagger/tapsTabulator.py index 00bff11b..c6d75b27 100644 --- a/singlecellmultiomics/universalBamTagger/tapsTabulator.py +++ b/singlecellmultiomics/universalBamTagger/tapsTabulator.py @@ -157,6 +157,11 @@ def finish_bam(output, args, temp_out): else: features = None + fragment_class_args={'umi_hamming_distance': 1, + 'no_umi_cigar_processing':False} + + + molecule_class_args = { 'reference': reference, 'taps': taps, @@ -189,7 +194,7 @@ def finish_bam(output, args, temp_out): if args.allow_single_end: # Single end base calls are "unsafe", allow them : molecule_class_args['allow_unsafe_base_calls'] = True - + fragment_class_args['single_end'] = True s = args.moleculeNameSep try: @@ -199,13 +204,10 @@ def finish_bam(output, args, temp_out): yield_invalid=(output is not None), every_fragment_as_molecule=args.every_fragment_as_molecule, fragment_class=fragment_class, - fragment_class_args={'umi_hamming_distance': 1, - 'no_umi_cigar_processing':False, - - }, + fragment_class_args=fragment_class_args, - molecule_class_args=molecule_class_args, - contig=args.contig)): + molecule_class_args=molecule_class_args, + contig=args.contig)): molecule.set_meta('mi',i) if args.head and (i - 1) >= args.head: From 5186748c9412131d07d4bf2f6d8e585fcf9f7842 Mon Sep 17 00:00:00 2001 From: BuysDB Date: Sat, 30 Jan 2021 23:09:28 +0100 Subject: [PATCH 17/17] Fix MatplotlibDeprecationWarning, make a copy of modified cmap --- singlecellmultiomics/molecule/taps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/singlecellmultiomics/molecule/taps.py b/singlecellmultiomics/molecule/taps.py index 04163ed9..60f94440 100644 --- a/singlecellmultiomics/molecule/taps.py +++ b/singlecellmultiomics/molecule/taps.py @@ -8,6 +8,7 @@ from singlecellmultiomics.utils.sequtils import complement from itertools import product from matplotlib.pyplot import get_cmap +from copy import copy complement_trans = str.maketrans('ATGC', 'TACG') @@ -58,7 +59,7 @@ def __init__(self, reference=None, reference_variants=None, taps_strand='F', **k self.context_mapping[True][''.join(['C'] + list(x))] = 'H' self.context_mapping[False][''.join(['C'] + list(x))] = 'h' - self.colormap = get_cmap('RdYlBu_r') + self.colormap = copy(get_cmap('RdYlBu_r')) # Make a copy self.colormap.set_bad((0,0,0)) # For reads without C's def position_to_context(