Merge pull request #250 from BuysDB/bamtagmultiome_unmapped
March 2022 update
BuysDB authored Mar 22, 2022
2 parents 71df811 + ba775ac commit 208a718
Showing 18 changed files with 865 additions and 51 deletions.
18 changes: 13 additions & 5 deletions TAGS.MD
@@ -6,7 +6,8 @@
 | AQ | Read2Clipped3primeBasesKmerQuality | Yes | No |
 | AS | alignmentScore | No | No |
 | BC | barcode | No | No |
-| BI | barcodeIndex | No | No |
+| BD | gatk_insert_deletion_base_qualities | No | No |
+| BI | gatk_insert_deletion_base_qualities | No | No |
 | BK | isBulk | No | No |
 | BX | umiCorrected | No | No |
 | BZ | umiCorrectedQuality | Yes | No |
@@ -65,7 +66,9 @@
 | SM | sampleName | No | No |
 | SP | IsSpliced | No | No |
 | SQ | meanBaseQuality | No | No |
+| SS | sampleSequence | No | No |
+| TF | totalAssociatedFragmentsIncludingOverlflow | No | No |
-| TR | totalRTreactions | No | No |
+| TR | totalRTreactions | No | No |
 | Ti | tile | No | No |
 | Us | undigestedSiteCount | No | No |
 | XA | alternativeAlignmentHits | No | No |
@@ -78,8 +81,12 @@
 | aQ | Read1Clipped3primeBasesKmerQuality | Yes | No |
 | aa | rawIlluminaIndexSequence | No | No |
 | af | associatedFragmentCount | No | No |
+| af | totalAssociatedFragmentsInBam | No | No |
+| ah | hammingDistanceToIndex | No | No |
 | ap | phasedAllelicSNVs | No | No |
+| au | phasednUnknownAllelicOriginSNVs | No | No |
 | bc | rawBarcode | No | No |
+| bi | barcodeIndex | No | No |
 | dt | assingedDataType | No | No |
 | e1 | Read1Clipped5primeBasesKmer | No | No |
 | eB | Read1Clipped5primeCycle | No | No |
@@ -95,8 +102,8 @@
 | lh | ligationOverhangSequence | No | No |
 | lq | ligationOverhangQuality | Yes | No |
 | nM | editDistanceToReference | No | No |
-| rP | randomPrimerSequence | No | No |
-| rS | randomPrimerStart | No | No |
+| rP | randomPrimerStart | No | No |
+| rS | randomPrimerSequence | No | No |
 | rd | rtDuplicateIndex | No | No |
 | rt | rtReactionIndex | No | No |
 | sH | Total CHH methylated | No | No |
@@ -105,5 +112,6 @@
 | sh | Total CHH unmethylated | No | No |
 | sx | Total CHG unmethylated | No | No |
 | sz | Total CPG unmethylated | No | No |
+| tu | second_umi | No | No |
 | uC | totalUnmethylated | No | No |
-| uL | umiLength | No | No |
+| uL | umiLength | No | No |
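Note: the tags above are ordinary two-letter SAM auxiliary tags, so they can be read back with pysam. A minimal sketch ('tagged.bam' is a placeholder, not a file from this repository):

```python
import pysam

# 'tagged.bam' stands in for any SCMO-tagged BAM file.
with pysam.AlignmentFile('tagged.bam') as alignments:
    for read in alignments:
        if read.has_tag('SM'):
            print('sample:', read.get_tag('SM'))        # SM: sampleName
        if read.has_tag('rP'):
            print('primer start:', read.get_tag('rP'))  # rP: randomPrimerStart (after this commit)
        break  # only inspect the first read
```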
3 changes: 3 additions & 0 deletions setup.py
@@ -116,8 +116,11 @@

        # Library processing:
        'singlecellmultiomics/libraryProcessing/libraryStatistics.py',
+        'singlecellmultiomics/libraryProcessing/scsortchicstats.py',
        'singlecellmultiomics/libraryDetection/archivestats.py',
        'singlecellmultiomics/alleleTools/heterozygousSNPedit.py',
+        'singlecellmultiomics/libraryProcessing/scsortchicfeaturedensitytable.py',
+        'singlecellmultiomics/libraryProcessing/scsortchicqc.py',

        # Feature conversion:
        'singlecellmultiomics/features/exonGTFtoIntronGTF.py',
15 changes: 8 additions & 7 deletions singlecellmultiomics/bamProcessing/bamAnalyzeCutDistances.py
@@ -62,7 +62,7 @@ def generate_prefix(prefix, prefix_with_region, contig, start, end ):
    return prefix


-def get_sc_cut_dictionary(bam_path: str, filter_function=None, strand_specific=False, prefix_with_bam=False, regions=None, prefix_with_region=False):
+def get_sc_cut_dictionary(bam_path: str, filter_function=None, strand_specific=False, prefix_with_bam=False, regions=None, prefix_with_region=False, n_threads=None):
    """
    Generates cut distribution dictionary (contig)->sample->position->obs
@@ -79,7 +79,7 @@ def get_sc_cut_dictionary(...)

-    with Pool() as workers:
+    with Pool(n_threads) as workers:
        for bam_path in bam_paths:
            if prefix_with_bam:
                prefix = bam_path.split('/')[-1].replace('.bam','')
@@ -99,7 +99,7 @@ def get_sc_cut_dictionary(...)
                        strand_specific,
                        filter_function,
                        generate_prefix(prefix,prefix_with_region,contig,start,end)
-                        , start, end)
+                        , start, end, n_threads)
                    for contig, start, end in regions )):
                # Perform merge:
                if not contig in cut_sites:
@@ -204,7 +204,7 @@ def strict_read_counts_function(read):

def _get_sc_cut_dictionary(args):

-    bam, contig, strand_specific, filter_function, prefix, start, end = args
+    bam, contig, strand_specific, filter_function, prefix, start, end, n_threads = args
    cut_positions = defaultdict(Counter)
    with pysam.AlignmentFile(bam) as alignments:
        for read in alignments.fetch(contig, start, end):
@@ -416,6 +416,7 @@
    argparser.add_argument('-min_region_len', type=int, default=1000)
    argparser.add_argument('--legacy', action='store_true', help='Create legacy unstranded analysis plots and files')
    argparser.add_argument('-max_distance', type=int,default=2000, help='Maximum distance in both plots and output tables')
+    argparser.add_argument('-t', type=int,default=None, help='Max processes')
    args = argparser.parse_args()

    if args.regions is not None:
@@ -460,7 +461,7 @@
        analyse(args.alignmentfiles[0], args.o, create_plot=True, verbose=True,strand_specific=False,max_distance=args.max_distance)

    # Stranded analysis:
-    sc_cut_dict_stranded = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=True,filter_function=strict_read_counts_function, regions=regions)
+    sc_cut_dict_stranded = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=True,filter_function=strict_read_counts_function, regions=regions, n_threads=args.t)
    distance_counter_fwd_above, distance_counter_fwd_below, distance_counter_rev_above, distance_counter_rev_below = get_stranded_pairwise_counts(sc_cut_dict_stranded)

    # Write tables:
@@ -477,7 +478,7 @@
    #################
    # Unstranded density analysis:
    prefix_with_bam=False if len(args.alignmentfiles)==1 else True
-    sc_cut_dict = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=False,filter_function=strict_read_counts_function, prefix_with_bam=prefix_with_bam, regions=regions)
+    sc_cut_dict = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=False,filter_function=strict_read_counts_function, prefix_with_bam=prefix_with_bam, regions=regions, n_threads=args.t)
    cpr = get_r1_counts_per_cell(args.alignmentfiles, prefix_with_bam=prefix_with_bam)
    counts = pd.Series(cpr).sort_values()
    print(counts)
@@ -505,7 +506,7 @@ def get_commands(one_contig=None):
    # This is a histogram of the amount of observed fragments at distances x:
    obs = defaultdict(lambda: np.zeros(n_bins, dtype=np.int64))
    total_tests = Counter() # cell -> tests
-    with Pool() as workers:
+    with Pool(args.t) as workers:
        for cell, cell_obs, n_tests in workers.imap_unordered(
            _cuts_to_observation_vector,
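Note: `multiprocessing.Pool(None)` falls back to `os.cpu_count()`, so threading the new `-t`/`n_threads` argument through as `None` keeps the previous use-all-cores behaviour. A minimal standalone sketch of the pattern:

```python
from multiprocessing import Pool

def _square(x):
    return x * x

def run(n_threads=None):
    # Pool(None) spawns os.cpu_count() worker processes,
    # so the default preserves the old behaviour.
    with Pool(n_threads) as workers:
        return list(workers.imap_unordered(_square, range(10)))

print(sorted(run(n_threads=2)))
```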
@@ -171,8 +171,9 @@ def create_dinuc_plot(s=-250, e=170):
    atax.set_ylim(at.mean()-extra, at.mean()+extra)
    gcax.set_ylim(gc.mean()-extra, gc.mean()+extra)

-    for x in range(s,e,10):
-        atax.axvline(x+5,c='grey',lw=0.5, alpha=0.5)
+    if e-s < 2000:
+        for x in range(s,e,10):
+            atax.axvline(x+5,c='grey',lw=0.5, alpha=0.5)

    gcax.set_ylabel('Fraction CC+GG+GC')
    atax.set_ylabel('Fraction AA+AT+TA')
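Note: the new guard only draws the 10-bp helper gridlines when the plotted window is narrower than 2 kb, where they are still legible. A standalone sketch of the same idea (window bounds are made up):

```python
import matplotlib.pyplot as plt

# Skip per-10-bp gridlines for wide windows, where hundreds of
# axvline calls would swamp the plot.
s, e = -250, 170
fig, ax = plt.subplots()
if e - s < 2000:
    for x in range(s, e, 10):
        ax.axvline(x + 5, c='grey', lw=0.5, alpha=0.5)
fig.savefig('dinuc_grid_demo.png')
```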
39 changes: 34 additions & 5 deletions singlecellmultiomics/bamProcessing/bamExtractSamples.py
@@ -8,7 +8,7 @@
import collections
from singlecellmultiomics.utils.path import get_valid_filename

-def extract_samples( input_bam_path, output_path, capture_samples, head=None ):
+def extract_samples( input_bam_path, output_path, capture_samples, head=None, write_group_rg=False, rg_group_prefix=None ):
    """
    Extract samples from a bam file
@@ -23,29 +23,56 @@ def extract_samples( input_bam_path, output_path, capture_samples, head=None ):
    """
    bamFile = pysam.AlignmentFile(input_bam_path, "rb")
-    header = bamFile.header.copy()

    # Write to multiple files:
    output_handles = {}
    print('Groups:')
    for group in capture_samples:

+        header = bamFile.header.copy()
+
+        if write_group_rg:
+
+            if rg_group_prefix:
+                rg = rg_group_prefix + group
+            else:
+                rg = group
+            header = header.to_dict()
+            header['RG'] = [{'ID': rg,
+                             'LB': rg,
+                             'PL': 'ILLUMINA',
+                             'SM': rg,
+                             'PU': '1'}] # reset read group header
+
        output_handles[group] = pysam.AlignmentFile(output_path.replace('.bam',f'{group}.bam'), "wb", header=header)
        print(f'\t{group}\t{output_handles[group].filename.decode()}')

    sample2handle = {}
+    sample2group = {}
    for group,samples in capture_samples.items():
        for sample in samples:
            if sample in sample2handle:
                raise ValueError(f'The sample {sample} is assigned to more than one group at least:{group} and {sample2group[sample]}')
            print(f'\t{sample}\t->{group}')
            sample2handle[sample] = output_handles[group]
+            sample2group[sample] = group

    written = 0
    for r in bamFile:

+        sm = r.get_tag('SM')
        try:
-            sample2handle[r.get_tag('SM')].write(r)
+            handle = sample2handle[sm]
        except KeyError:
            continue

+        if write_group_rg:
+            if rg_group_prefix:
+                r.set_tag('RG', rg_group_prefix + sample2group[sm] )
+            else:
+                r.set_tag('RG', sample2group[sm] )
+
+        handle.write(r)
        written += 1
        if head is not None and written >= head:
            break
@@ -77,6 +104,8 @@
                           action='store_true',
                           help='Force overwrite of existing files')
    argparser.add_argument('-head', type=int)
+    argparser.add_argument('--write_group_rg', action='store_true', help='Set the group label as read-group, requires second GROUP column in the input file')
+    argparser.add_argument('-rg_group_prefix', type=str, help='Prepend this string to the read group. Requires --write_group_rg')
    args = argparser.parse_args()

    if os.path.exists(args.o) and not args.f:
@@ -103,4 +132,4 @@
                raise ValueError("Please supply a file with one or two columns: [SAMPLE], or [SAMPLE]{tab}[GROUP]")
            capture_samples[group].add( parts[0] )

-    extract_samples(args.bamfile, args.o, capture_samples, head=args.head )
+    extract_samples(args.bamfile, args.o, capture_samples, head=args.head, write_group_rg=args.write_group_rg, rg_group_prefix=args.rg_group_prefix )
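Note: pysam accepts the header as a plain dict, which is what the `write_group_rg` branch above relies on. A minimal sketch of the same reset-and-retag pattern (file names and the group label are placeholders, not part of the script):

```python
import pysam

group = 'treated'  # placeholder group label
with pysam.AlignmentFile('in.bam', 'rb') as bam:
    header = bam.header.to_dict()
    # Replace whatever read groups existed with a single one per output file.
    header['RG'] = [{'ID': group, 'LB': group, 'PL': 'ILLUMINA',
                     'SM': group, 'PU': '1'}]
    with pysam.AlignmentFile('out.bam', 'wb', header=header) as out:
        for r in bam:
            r.set_tag('RG', group)  # point each read at the new read group
            out.write(r)
```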
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
3 changes: 3 additions & 0 deletions singlecellmultiomics/bamProcessing/bamFunctions.py
@@ -24,6 +24,9 @@ def get_index_path(bam_path: str):
    for p in [bam_path+'.bai', bam_path.replace('.bam','.bai')]:
        if os.path.exists(p):
            return p
+    for p in [bam_path+'.csi', bam_path.replace('.bam','.csi')]:
+        if os.path.exists(p):
+            return p
    return None


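Note: with this change both BAI and CSI indexes are detected. A usage sketch (the BAM path is a placeholder):

```python
from singlecellmultiomics.bamProcessing.bamFunctions import get_index_path

# Returns the path to an existing .bai or .csi index,
# or None when the BAM has not been indexed yet.
idx = get_index_path('sample.bam')
if idx is None:
    print('sample.bam needs indexing first')
```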
7 changes: 5 additions & 2 deletions singlecellmultiomics/features/features.py
@@ -327,7 +327,7 @@ def loadBED(self, path, ignChr=False, parseBlocks=True):
        # chr1 14662 187832 calJac3:ENSCJAT00000061222.1-1.1 943 -
        #      187832 187832 0 9 5,129,69,110,42,23,133,202,78,
        #      0,38,307,1133,1243,1944,1970,172713,173092,
-        for line in f:
+        for line_idx,line in enumerate(f):
            if line.split()[0] == "track":
                continue
            blockAvail = False
@@ -347,8 +347,11 @@ def loadBED(self, path, ignChr=False, parseBlocks=True):
            elif len(parts)>=4:
                chrom, chromStart, chromEnd, value = parts[:4]
                name = value
+            elif len(parts)==3:
+                chrom, chromStart, chromEnd = parts[:3]
+                name = str(line_idx)
            else:
-                raise ValueError('Could not read the supplied bed file, it has too few columns, expecting at least 4 columns: contig,start,end,value')
+                raise ValueError('Could not read the supplied bed file, it has too few columns, expecting at least 3 columns: contig,start,end[,value]')

            chrom = self.remapKeys.get(chrom, chrom)
            chrom = chrom if ignChr == False else chrom.replace('chr', '')
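Note: `loadBED` now also accepts minimal three-column BED records, falling back to the line index as the feature name. A standalone sketch of that fallback (not the library's full parser):

```python
# 3-column records get an auto-generated name based on their line index.
bed_lines = ["chr1\t100\t200\tmyFeature", "chr1\t500\t900"]

for line_idx, line in enumerate(bed_lines):
    parts = line.split()
    if len(parts) >= 4:
        chrom, start, end, name = parts[:4]
    elif len(parts) == 3:
        chrom, start, end = parts[:3]
        name = str(line_idx)  # fallback feature name
    else:
        raise ValueError('expecting at least 3 columns: contig,start,end[,value]')
    print(chrom, int(start), int(end), name)
```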
1 change: 1 addition & 0 deletions singlecellmultiomics/libraryProcessing/__init__.py
@@ -0,0 +1 @@
+from .sample_sheet import SampleSheet
100 changes: 100 additions & 0 deletions singlecellmultiomics/libraryProcessing/sample_sheet.py
@@ -0,0 +1,100 @@
import json

def mark_autodetect(libname):
    lib = libname.lower()
    if 'k9me3' in lib:
        return 'H3K9me3'
    if 'k4me1' in lib:
        return 'H3K4me1'
    if 'k4me3' in lib:
        return 'H3K4me3'
    if 'k27me3' in lib:
        return 'H3K27me3'
    if 'k36me3' in lib:
        return 'H3K36me3'

    return 'NA'


class SampleSheet:
    """
    Class for reading and writing sample sheet information
    """
    def __init__(self, path=None):
        self.data = {}
        if path is not None:
            self.load_json(path)

    def load_json(self,sample_sheet_location,merge=False):
        """
        Load the data from the specified json file.
        When merge=True the data will be added to the current data (used for merging sheets)
        Duplicates will be overwritten by the last loaded file
        """
        with open(sample_sheet_location) as f:
            if merge:
                for k,v in json.load(f).items():
                    if k not in self.data:
                        self.data[k] = v
                    else:
                        self.data[k].update(v)
            else:
                self.data = json.load(f)

    def get(self,key,default):
        return self.data.get(key,default)

    def __getitem__(self,k):
        return self.data[k]

    def layout_well2index(self, layout_name):
        """ Get the well to index mapping for the supplied layout """
        return {k:tuple(v) for k,v in self['well2coord'][self['layout_format'][layout_name]].items()}

    def index_to_condition(self, layout_name):
        well_to_condition = self['layouts'][layout_name]
        return {self['well2index'][self['layout_format'][layout_name]][k]:v for k,v in well_to_condition.items()}


    def index2well(self, layout_name: str) -> dict:
        """
        Returns a dictionary index->well
        {1: 'A1',
         2: 'A2',
         3: 'A3',
         4: 'A4',
         5: 'A5',
         ...}
        """
        return {v:k for k,v in self['well2index'][self['layout_format'][layout_name]].items() }

    @property
    def libraries(self):

        libraries = set()
        for k in ['libtype','marks']:
            libraries.update( set(self.get(k,dict()).keys()))
        return libraries


    def drop_library(self, library):
        if type(library) is list:
            for l in library:
                self.drop_library(l)
            return
        for main_key in ['marks','condition','libtype','library_layout']:
            if main_key not in self.data:
                continue
            if library in self[main_key]:
                del self[main_key][library]

    def drop_mark(self, mark):
        """ Remove all libraries with the given mark """
        if type(mark) is list:
            for m in mark:
                self.drop_mark(m)
            return
        to_prune = []
        for sample, _mark in self['marks'].items():
            if mark==_mark:
                to_prune.append(sample)

        self.drop_library(to_prune)
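Note: a usage sketch for the new class; the JSON paths and mark names are made-up examples, and the expected keys ('marks', 'libtype', 'well2index', ...) follow the methods above:

```python
from singlecellmultiomics.libraryProcessing import SampleSheet
from singlecellmultiomics.libraryProcessing.sample_sheet import mark_autodetect

# 'sheet.json' / 'extra.json' are placeholder paths.
sheet = SampleSheet('sheet.json')
sheet.load_json('extra.json', merge=True)       # merge a second sheet

print(sheet.libraries)                          # union of 'libtype' and 'marks' keys
print(mark_autodetect('scCHIC-K27me3-plate1'))  # -> 'H3K27me3'

sheet.drop_mark('H3K9me3')                      # prune every H3K9me3 library
```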
