Merge pull request #216 from BuysDB/fs_additions

Fs additions
BuysDB · Jan 30, 2021 · b8bf8fe · b8bf8fe
2 parents 591f6ef + 5186748
commit b8bf8fe
Show file tree

Hide file tree

Showing 13 changed files with 293 additions and 22 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,8 @@
 dist: xenial   # required for Python >= 3.7
 language: python
 python:
-  - "3.6"
-  - "3.6.1"
   - "3.7"
+  - "3.8"
 
 # command to install dependencies
 install:

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 pysam>=0.15.3
-numpy>=1.16.4
+numpy>=1.16.5
 pandas>=0.25.0
 colorama
 pysamiterators>=1.8

diff --git a/setup.py b/setup.py
@@ -138,7 +138,7 @@
         ],
 
   install_requires=[
-       'pysam>=0.15.3','numpy>=1.16.1','pandas>=0.25.0','colorama','pyBigWig',
+       'pysam>=0.15.3','numpy>=1.16.5','pandas>=0.25.0','colorama','pyBigWig',
        'cutadapt>=2.9',
        'pysamiterators>=1.8','more-itertools','matplotlib','tabulate',
        'wheel','setuptools>=40.8.0','scikit-learn>=0.21.3','seaborn>=0.11.0', 'statsmodels', 'cached_property',

diff --git a/singlecellmultiomics/fragment/chic.py b/singlecellmultiomics/fragment/chic.py
@@ -6,14 +6,23 @@ class CHICFragment(Fragment):
     def __init__(self,
                  reads,
                  R1_primer_length=4,
-                 R2_primer_length=6,
+                 R2_primer_length=6, # Is automatically detected now
                  assignment_radius=0,
                  umi_hamming_distance=1,
                  invert_strand=False,
                  no_umi_cigar_processing=False,
                  **kwargs
                  ):
         self.invert_strand = invert_strand
+
+        # Perform random primer autodect,
+        # When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer
+        for read in reads:
+            if read is not None and read.has_tag('MX'):
+                if read.get_tag('MX')[-1]=='l':
+                    R2_primer_length = 0
+                break
+
         Fragment.__init__(self,
                           reads,
                           assignment_radius=assignment_radius,

diff --git a/singlecellmultiomics/fragment/fragment.py b/singlecellmultiomics/fragment/fragment.py
@@ -55,7 +55,8 @@ def __init__(self, reads,
                  mapping_dir=(False, True),
                  max_NUC_stretch: int = None,
                  read_group_format: int = 0,  # R1 forward, R2 reverse
-                 library_name: str = None # Overwrites the library name
+                 library_name: str = None, # Overwrites the library name
+                 single_end: bool = False
                  ):
         """
         Initialise Fragment
@@ -74,7 +75,8 @@ def __init__(self, reads,
                     length of R1 primer, these bases are not taken into account when calculating a consensus
 
                 R2_primer_length(int):
-                    length of R2 primer, these bases are not taken into account when calculating a consensus
+                    length of R2 primer, these bases are not taken into account when calculating a consensus, this length is auto-detected/ overwritten when the rS tag is set
+
 
                 tag_definitions(list):
                     sam TagDefinitions
@@ -113,6 +115,7 @@ def __init__(self, reads,
         self.read_group_format = read_group_format
         self.max_NUC_stretch = max_NUC_stretch
         self.qcfail = False
+        self.single_end = single_end
 
         # Span:\
         self.span = [None, None, None]
@@ -348,7 +351,7 @@ def write_tags(self):
         if self.has_valid_span():
             # Write fragment size:
             if self.safe_span:
-                self.set_meta('fS', self.get_fragment_size())
+                self.set_meta('fS', self.estimated_length)
                 self.set_meta('fe', self.span[1])
                 self.set_meta('fs', self.span[2])
             else:
@@ -405,6 +408,10 @@ def get_random_primer_hash(self):
 
         if R2 is None or R2.query_sequence is None:
             return None, None, None
+
+        if self.R2_primer_length == 0 and self.random_primer_sequence is None:
+            return None, None, None
+
         # The read was not mapped
         if R2.is_unmapped:
             # Guess the orientation does not matter
@@ -528,8 +535,13 @@ def estimated_length(self) -> int:
         Obtain the estimated size of the fragment,
         returns None when estimation is not possible
         Takes into account removed bases (R2)
-        Assumes inwards sequencing orientation
+        Assumes inwards sequencing orientation, except when self.single_end is set
         """
+        if self.single_end:
+            if self[0] is None:
+                return None
+            return self[0].reference_end - self[0].reference_start
+
 
         if self.has_R1() and self.has_R2():
 

diff --git a/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py b/singlecellmultiomics/modularDemultiplexer/demultiplexModules/scCHIC.py
@@ -69,6 +69,71 @@ def demultiplex(self, records, **kwargs):
         #taggedRecords[0].qualities = taggedRecords[0].qualities[1:]
         return taggedRecords
 
+class SCCHIC_384w_c8_u3_direct_ligation(UmiBarcodeDemuxMethod):
+    def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs):
+        self.barcodeFileAlias = 'maya_384NLA'
+        UmiBarcodeDemuxMethod.__init__(
+            self,
+            umiRead=0,
+            umiStart=0,
+            umiLength=3,
+            barcodeRead=0,
+            barcodeStart=3,
+            barcodeLength=8,
+            random_primer_read=random_primer_read,
+            random_primer_length=random_primer_length,
+            barcodeFileAlias=self.barcodeFileAlias,
+            barcodeFileParser=barcodeFileParser,
+            **kwargs)
+        self.shortName = 'scCHIC384C8U3l'
+        self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, no RP'
+        self.autoDetectable = False
+        self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 does not contain a random primer'
+
+        self.sequenceCapture[0] = slice(
+            self.barcodeLength + self.umiLength + 1,
+            None)  # dont capture the first base
+
+    def demultiplex(self, records, **kwargs):
+
+        if kwargs.get(
+                'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T':
+            raise NonMultiplexable
+
+
+        # add first 2 bases as ligation tag:
+        ligation_start = self.barcodeLength + self.umiLength
+        ligation_end = ligation_start + 2
+        ligation_sequence = records[0].sequence[ligation_start:ligation_end]
+        ligation_qualities = records[0].qual[ligation_start:ligation_end]
+
+        taggedRecords = UmiBarcodeDemuxMethod.demultiplex(
+            self, records, **kwargs)
+
+        taggedRecords[0].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[0].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+        taggedRecords[1].addTagByTag(
+            'lh',
+            ligation_sequence,
+            isPhred=False,
+            make_safe=False)
+        taggedRecords[1].addTagByTag(
+            'lq',
+            ligation_qualities,
+            isPhred=True,
+            make_safe=False)
+        #taggedRecords[0].sequence = taggedRecords[0].sequence[1:]
+        #taggedRecords[0].qualities = taggedRecords[0].qualities[1:]
+        return taggedRecords
+
 
 
 

diff --git a/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py b/singlecellmultiomics/modularDemultiplexer/demultiplexingStrategyLoader.py
@@ -61,6 +61,8 @@ def __init__(
             dm.SCCHIC_384w_c8_u3,
             dm.SCCHIC_384w_c8_u3_cs2,
             dm.SCCHIC_384w_c8_u3_pdt,
+            dm.SCCHIC_384w_c8_u3_direct_ligation,
+
             dm.MSPJI_c8_u3,
             dm.ScartraceR2,
             dm.ScartraceR1,

diff --git a/singlecellmultiomics/molecule/molecule.py b/singlecellmultiomics/molecule/molecule.py
@@ -547,6 +547,7 @@ def write_tags(self):
                         read.is_duplicate = True
 
         # Write RT reaction tags (rt: rt reaction index, rd rt duplicate index)
+        # This is only required for fragments which have defined random primers
         rt_reaction_index = None
         for rt_reaction_index, ( (contig, random_primer_start, random_primer_sequence), frags) in enumerate(
                 self.get_rt_reactions().items()):
@@ -1672,6 +1673,22 @@ def aligned_length(self) -> int:
         else:
             return None
 
+    @property
+    def is_completely_matching(self) -> bool:
+        """
+        Checks if all associated reads are completely mapped:
+        checks if all cigar operations are M,
+        Returns True when all cigar operations are M, False otherwise
+        """
+
+        return all(
+                (
+                     all(
+                     [ (operation==0)
+                        for operation, amount in read.cigartuples] )
+                for read in self.iter_reads()))
+
+
     @property
     def estimated_max_length(self) -> int:
         """
@@ -1777,7 +1794,7 @@ def can_be_yielded(self, chromosome, position):
                        self.cache_size *
                        0.5)
 
-    def get_rt_reactions(self):
+    def get_rt_reactions(self) -> dict:
         """Obtain RT reaction dictionary
 
         returns:
@@ -1824,17 +1841,24 @@ def get_mean_rt_fragment_size(self):
             self.get_rt_reaction_fragment_sizes()
         )
 
-    def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None):
+    def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None, consensus_read_callback=None, consensus_read_callback_kwargs=None):
         """Write all associated reads to the target file
 
         Args:
             target_file (pysam.AlignmentFile) : Target file
             consensus (bool) : write consensus
             no_source_reads (bool) : while in consensus mode, don't write original reads
+            consensus_read_callback (function) : this function is called with every consensus read as an arguments
+            consensus_read_callback_kwargs (dict) : arguments to pass to the callback function
         """
         if consensus:
             reads = self.deduplicate_majority(target_file,
                                               f'molecule_{uuid4()}' if consensus_name is None else consensus_name)
+            if consensus_read_callback is not None:
+                if consensus_read_callback_kwargs is not None:
+                    consensus_read_callback(reads, **consensus_read_callback_kwargs)
+                else:
+                    consensus_read_callback(reads)
 
             for read in reads:
                 target_file.write(read)

diff --git a/singlecellmultiomics/molecule/taps.py b/singlecellmultiomics/molecule/taps.py
@@ -8,6 +8,7 @@
 from singlecellmultiomics.utils.sequtils import complement
 from itertools import product
 from matplotlib.pyplot import get_cmap
+from copy import copy
 complement_trans = str.maketrans('ATGC', 'TACG')
 
 
@@ -58,7 +59,7 @@ def __init__(self, reference=None, reference_variants=None, taps_strand='F', **k
             self.context_mapping[True][''.join(['C'] + list(x))] = 'H'
             self.context_mapping[False][''.join(['C'] + list(x))] = 'h'
 
-        self.colormap = get_cmap('RdYlBu_r')
+        self.colormap = copy(get_cmap('RdYlBu_r')) # Make a copy 
         self.colormap.set_bad((0,0,0)) # For reads without C's
 
     def position_to_context(

diff --git a/singlecellmultiomics/universalBamTagger/tapsTabulator.py b/singlecellmultiomics/universalBamTagger/tapsTabulator.py
@@ -124,6 +124,11 @@ def finish_bam(output, args, temp_out):
         '-bamout',
         type=str,
         help="optional (tagged) output BAM path")
+    argparser.add_argument(
+        '--allow_single_end',
+        action='store_true',
+        help='Allow single end reads')
+
     args = argparser.parse_args()
     alignments = pysam.AlignmentFile(args.alignmentfile)
 
@@ -152,6 +157,11 @@ def finish_bam(output, args, temp_out):
     else:
         features = None
 
+    fragment_class_args={'umi_hamming_distance': 1,
+                         'no_umi_cigar_processing':False}
+
+
+
     molecule_class_args = {
         'reference': reference,
         'taps': taps,
@@ -180,6 +190,12 @@ def finish_bam(output, args, temp_out):
         'min_phred_score':args.min_phred_score
         }
 
+
+    if args.allow_single_end:
+        # Single end base calls are "unsafe", allow them :
+        molecule_class_args['allow_unsafe_base_calls'] = True
+        fragment_class_args['single_end'] = True
+
     s = args.moleculeNameSep
     try:
         for i, molecule in enumerate(singlecellmultiomics.molecule.MoleculeIterator(
@@ -188,13 +204,10 @@ def finish_bam(output, args, temp_out):
             yield_invalid=(output is not None),
             every_fragment_as_molecule=args.every_fragment_as_molecule,
             fragment_class=fragment_class,
-            fragment_class_args={'umi_hamming_distance': 1,
-                                 'no_umi_cigar_processing':False,
-
-                                 },
+            fragment_class_args=fragment_class_args,
 
-                molecule_class_args=molecule_class_args,
-                contig=args.contig)):
+            molecule_class_args=molecule_class_args,
+            contig=args.contig)):
 
             molecule.set_meta('mi',i)
             if args.head and (i - 1) >= args.head:

diff --git a/singlecellmultiomics/utils/sequtils.py b/singlecellmultiomics/utils/sequtils.py
@@ -25,8 +25,32 @@ def prefetch(self, contig, start, end):
 
 
 
+def get_chromosome_number(chrom: str) -> int:
+    """
+    Get chromosome number (index) of the supplied chromosome:
+        '1' -> 1, chr1 -> 1, returns -1 when not available, chrM -> -1
+    """
+    try:
+        return int(chrom.replace('chr',''))
+    except Exception as e:
+        return -1
+
+def is_autosome(chrom: str) -> bool:
+    """ Returns True when the chromsome is an autosomal chromsome,
+    not an alternative allele, mitochrondrial or sex chromosome
+
+    Args:
+        chrom(str) : chromosome name
+
+    Returns:
+        is_main(bool) : True when the chromsome is an autosome
+
+    """
+    return is_main_chromosome(chrom) and get_chromosome_number(chrom)!=-1
+
+
 
-def is_main_chromosome(chrom):
+def is_main_chromosome(chrom: str) -> bool:
     """ Returns True when the chromsome is a main chromsome,
     not an alternative or other
 
@@ -42,7 +66,7 @@ def is_main_chromosome(chrom):
         return False
     return True
 
-def get_contig_list_from_fasta(fasta_path, with_length=False):
+def get_contig_list_from_fasta(fasta_path: str, with_length: bool=False) -> list:
     """Obtain list of contigs froma  fasta file,
         all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO
 

diff --git a/singlecellmultiomics/version.py b/singlecellmultiomics/version.py
@@ -1 +1 @@
-__version__ = '0.1.17'
+__version__ = '0.1.18'