Merge pull request #250 from BuysDB/bamtagmultiome_unmapped
March 2022 update
BuysDB authored Mar 22, 2022
2 parents 71df811 + ba775ac commit 208a718
Showing 18 changed files with 865 additions and 51 deletions.
18 changes: 13 additions & 5 deletions TAGS.MD
@@ -6,7 +6,8 @@
 | AQ | Read2Clipped3primeBasesKmerQuality | Yes | No |
 | AS | alignmentScore | No | No |
 | BC | barcode | No | No |
-| BI | barcodeIndex | No | No |
+| BD | gatk_insert_deletion_base_qualities | No | No |
+| BI | gatk_insert_deletion_base_qualities | No | No |
 | BK | isBulk | No | No |
 | BX | umiCorrected | No | No |
 | BZ | umiCorrectedQuality | Yes | No |
@@ -65,7 +66,9 @@
 | SM | sampleName | No | No |
 | SP | IsSpliced | No | No |
 | SQ | meanBaseQuality | No | No |
+| SS | sampleSequence | No | No |
+| TF | totalAssociatedFragmentsIncludingOverlflow | No | No |
-| TR | totalRTreactions | No | No |
+| TR | totalRTreactions | No | No |
 | Ti | tile | No | No |
 | Us | undigestedSiteCount | No | No |
 | XA | alternativeAlignmentHits | No | No |
@@ -78,8 +81,12 @@
 | aQ | Read1Clipped3primeBasesKmerQuality | Yes | No |
 | aa | rawIlluminaIndexSequence | No | No |
 | af | associatedFragmentCount | No | No |
+| af | totalAssociatedFragmentsInBam | No | No |
+| ah | hammingDistanceToIndex | No | No |
 | ap | phasedAllelicSNVs | No | No |
+| au | phasednUnknownAllelicOriginSNVs | No | No |
 | bc | rawBarcode | No | No |
+| bi | barcodeIndex | No | No |
 | dt | assingedDataType | No | No |
 | e1 | Read1Clipped5primeBasesKmer | No | No |
 | eB | Read1Clipped5primeCycle | No | No |
@@ -95,8 +102,8 @@
 | lh | ligationOverhangSequence | No | No |
 | lq | ligationOverhangQuality | Yes | No |
 | nM | editDistanceToReference | No | No |
-| rP | randomPrimerSequence | No | No |
-| rS | randomPrimerStart | No | No |
+| rP | randomPrimerStart | No | No |
+| rS | randomPrimerSequence | No | No |
 | rd | rtDuplicateIndex | No | No |
 | rt | rtReactionIndex | No | No |
 | sH | Total CHH methylated | No | No |
@@ -105,5 +112,6 @@
 | sh | Total CHH unmethylated | No | No |
 | sx | Total CHG unmethylated | No | No |
 | sz | Total CPG unmethylated | No | No |
+| tu | second_umi | No | No |
 | uC | totalUnmethylated | No | No |
-| uL | umiLength | No | No |
+| uL | umiLength | No | No |
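Note: the tags above are ordinary two-letter SAM auxiliary tags, so they can be read back with pysam. A minimal sketch ('tagged.bam' is a placeholder, not a file from this repository):

```python
import pysam

# 'tagged.bam' stands in for any SCMO-tagged BAM file.
with pysam.AlignmentFile('tagged.bam') as alignments:
    for read in alignments:
        if read.has_tag('SM'):
            print('sample:', read.get_tag('SM'))        # SM: sampleName
        if read.has_tag('rP'):
            print('primer start:', read.get_tag('rP'))  # rP: randomPrimerStart (after this commit)
        break  # only inspect the first read
```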
3 changes: 3 additions & 0 deletions setup.py
@@ -116,8 +116,11 @@

        # Library processing:
        'singlecellmultiomics/libraryProcessing/libraryStatistics.py',
+        'singlecellmultiomics/libraryProcessing/scsortchicstats.py',
        'singlecellmultiomics/libraryDetection/archivestats.py',
        'singlecellmultiomics/alleleTools/heterozygousSNPedit.py',
+        'singlecellmultiomics/libraryProcessing/scsortchicfeaturedensitytable.py',
+        'singlecellmultiomics/libraryProcessing/scsortchicqc.py',

        # Feature conversion:
        'singlecellmultiomics/features/exonGTFtoIntronGTF.py',
15 changes: 8 additions & 7 deletions singlecellmultiomics/bamProcessing/bamAnalyzeCutDistances.py
@@ -62,7 +62,7 @@ def generate_prefix(prefix, prefix_with_region, contig, start, end ):
    return prefix


-def get_sc_cut_dictionary(bam_path: str, filter_function=None, strand_specific=False, prefix_with_bam=False, regions=None, prefix_with_region=False):
+def get_sc_cut_dictionary(bam_path: str, filter_function=None, strand_specific=False, prefix_with_bam=False, regions=None, prefix_with_region=False, n_threads=None):
    """
    Generates cut distribution dictionary (contig)->sample->position->obs
@@ -79,7 +79,7 @@ def get_sc_cut_dictionary(...)

-    with Pool() as workers:
+    with Pool(n_threads) as workers:
        for bam_path in bam_paths:
            if prefix_with_bam:
                prefix = bam_path.split('/')[-1].replace('.bam','')
@@ -99,7 +99,7 @@ def get_sc_cut_dictionary(...)
                        strand_specific,
                        filter_function,
                        generate_prefix(prefix,prefix_with_region,contig,start,end)
-                        , start, end)
+                        , start, end, n_threads)
                    for contig, start, end in regions )):
                # Perform merge:
                if not contig in cut_sites:
@@ -204,7 +204,7 @@ def strict_read_counts_function(read):

def _get_sc_cut_dictionary(args):

-    bam, contig, strand_specific, filter_function, prefix, start, end = args
+    bam, contig, strand_specific, filter_function, prefix, start, end, n_threads = args
    cut_positions = defaultdict(Counter)
    with pysam.AlignmentFile(bam) as alignments:
        for read in alignments.fetch(contig, start, end):
@@ -416,6 +416,7 @@
    argparser.add_argument('-min_region_len', type=int, default=1000)
    argparser.add_argument('--legacy', action='store_true', help='Create legacy unstranded analysis plots and files')
    argparser.add_argument('-max_distance', type=int,default=2000, help='Maximum distance in both plots and output tables')
+    argparser.add_argument('-t', type=int,default=None, help='Max processes')
    args = argparser.parse_args()

    if args.regions is not None:
@@ -460,7 +461,7 @@
        analyse(args.alignmentfiles[0], args.o, create_plot=True, verbose=True,strand_specific=False,max_distance=args.max_distance)

    # Stranded analysis:
-    sc_cut_dict_stranded = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=True,filter_function=strict_read_counts_function, regions=regions)
+    sc_cut_dict_stranded = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=True,filter_function=strict_read_counts_function, regions=regions, n_threads=args.t)
    distance_counter_fwd_above, distance_counter_fwd_below, distance_counter_rev_above, distance_counter_rev_below = get_stranded_pairwise_counts(sc_cut_dict_stranded)

    # Write tables:
@@ -477,7 +478,7 @@
    #################
    # Unstranded density analysis:
    prefix_with_bam=False if len(args.alignmentfiles)==1 else True
-    sc_cut_dict = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=False,filter_function=strict_read_counts_function, prefix_with_bam=prefix_with_bam, regions=regions)
+    sc_cut_dict = get_sc_cut_dictionary( args.alignmentfiles,strand_specific=False,filter_function=strict_read_counts_function, prefix_with_bam=prefix_with_bam, regions=regions, n_threads=args.t)
    cpr = get_r1_counts_per_cell(args.alignmentfiles, prefix_with_bam=prefix_with_bam)
    counts = pd.Series(cpr).sort_values()
    print(counts)
@@ -505,7 +506,7 @@ def get_commands(one_contig=None):
    # This is a histogram of the amount of observed fragments at distances x:
    obs = defaultdict(lambda: np.zeros(n_bins, dtype=np.int64))
    total_tests = Counter() # cell -> tests
-    with Pool() as workers:
+    with Pool(args.t) as workers:
        for cell, cell_obs, n_tests in workers.imap_unordered(
            _cuts_to_observation_vector,
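Note: `multiprocessing.Pool(None)` falls back to `os.cpu_count()`, so threading the new `-t`/`n_threads` argument through as `None` keeps the previous use-all-cores behaviour. A minimal standalone sketch of the pattern:

```python
from multiprocessing import Pool

def _square(x):
    return x * x

def run(n_threads=None):
    # Pool(None) spawns os.cpu_count() worker processes,
    # so the default preserves the old behaviour.
    with Pool(n_threads) as workers:
        return list(workers.imap_unordered(_square, range(10)))

print(sorted(run(n_threads=2)))
```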
@@ -171,8 +171,9 @@ def create_dinuc_plot(s=-250, e=170):
    atax.set_ylim(at.mean()-extra, at.mean()+extra)
    gcax.set_ylim(gc.mean()-extra, gc.mean()+extra)

-    for x in range(s,e,10):
-        atax.axvline(x+5,c='grey',lw=0.5, alpha=0.5)
+    if e-s < 2000:
+        for x in range(s,e,10):
+            atax.axvline(x+5,c='grey',lw=0.5, alpha=0.5)

    gcax.set_ylabel('Fraction CC+GG+GC')
    atax.set_ylabel('Fraction AA+AT+TA')
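Note: the new guard only draws the 10-bp helper gridlines when the plotted window is narrower than 2 kb, where they are still legible. A standalone sketch of the same idea (window bounds are made up):

```python
import matplotlib.pyplot as plt

# Skip per-10-bp gridlines for wide windows, where hundreds of
# axvline calls would swamp the plot.
s, e = -250, 170
fig, ax = plt.subplots()
if e - s < 2000:
    for x in range(s, e, 10):
        ax.axvline(x + 5, c='grey', lw=0.5, alpha=0.5)
fig.savefig('dinuc_grid_demo.png')
```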
39 changes: 34 additions & 5 deletions singlecellmultiomics/bamProcessing/bamExtractSamples.py
@@ -8,7 +8,7 @@
import collections
from singlecellmultiomics.utils.path import get_valid_filename

-def extract_samples( input_bam_path, output_path, capture_samples, head=None ):
+def extract_samples( input_bam_path, output_path, capture_samples, head=None, write_group_rg=False, rg_group_prefix=None ):
    """
    Extract samples from a bam file
@@ -23,29 +23,56 @@ def extract_samples( input_bam_path, output_path, capture_samples, head=None ):
    """
    bamFile = pysam.AlignmentFile(input_bam_path, "rb")
-    header = bamFile.header.copy()

    # Write to multiple files:
    output_handles = {}
    print('Groups:')
    for group in capture_samples:

+        header = bamFile.header.copy()
+
+        if write_group_rg:
+
+            if rg_group_prefix:
+                rg = rg_group_prefix + group
+            else:
+                rg = group
+            header = header.to_dict()
+            header['RG'] = [{'ID': rg,
+                             'LB': rg,
+                             'PL': 'ILLUMINA',
+                             'SM': rg,
+                             'PU': '1'}] # reset read group header
+
        output_handles[group] = pysam.AlignmentFile(output_path.replace('.bam',f'{group}.bam'), "wb", header=header)
        print(f'\t{group}\t{output_handles[group].filename.decode()}')

    sample2handle = {}
+    sample2group = {}
    for group,samples in capture_samples.items():
        for sample in samples:
            if sample in sample2handle:
                raise ValueError(f'The sample {sample} is assigned to more than one group at least:{group} and {sample2group[sample]}')
            print(f'\t{sample}\t->{group}')
            sample2handle[sample] = output_handles[group]
+            sample2group[sample] = group

    written = 0
    for r in bamFile:

+        sm = r.get_tag('SM')
        try:
-            sample2handle[r.get_tag('SM')].write(r)
+            handle = sample2handle[sm]
        except KeyError:
            continue

+        if write_group_rg:
+            if rg_group_prefix:
+                r.set_tag('RG', rg_group_prefix + sample2group[sm] )
+            else:
+                r.set_tag('RG', sample2group[sm] )
+
+        handle.write(r)
        written += 1
        if head is not None and written >= head:
            break
@@ -77,6 +104,8 @@
                           action='store_true',
                           help='Force overwrite of existing files')
    argparser.add_argument('-head', type=int)
+    argparser.add_argument('--write_group_rg', action='store_true', help='Set the group label as read-group, requires second GROUP column in the input file')
+    argparser.add_argument('-rg_group_prefix', type=str, help='Prepend this string to the read group. Requires --write_group_rg')
    args = argparser.parse_args()

    if os.path.exists(args.o) and not args.f:
@@ -103,4 +132,4 @@
                raise ValueError("Please supply a file with one or two columns: [SAMPLE], or [SAMPLE]{tab}[GROUP]")
            capture_samples[group].add( parts[0] )

-    extract_samples(args.bamfile, args.o, capture_samples, head=args.head )
+    extract_samples(args.bamfile, args.o, capture_samples, head=args.head, write_group_rg=args.write_group_rg, rg_group_prefix=args.rg_group_prefix )
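Note: pysam accepts the header as a plain dict, which is what the `write_group_rg` branch above relies on. A minimal sketch of the same reset-and-retag pattern (file names and the group label are placeholders, not part of the script):

```python
import pysam

group = 'treated'  # placeholder group label
with pysam.AlignmentFile('in.bam', 'rb') as bam:
    header = bam.header.to_dict()
    # Replace whatever read groups existed with a single one per output file.
    header['RG'] = [{'ID': group, 'LB': group, 'PL': 'ILLUMINA',
                     'SM': group, 'PU': '1'}]
    with pysam.AlignmentFile('out.bam', 'wb', header=header) as out:
        for r in bam:
            r.set_tag('RG', group)  # point each read at the new read group
            out.write(r)
```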
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
3 changes: 3 additions & 0 deletions singlecellmultiomics/bamProcessing/bamFunctions.py
@@ -24,6 +24,9 @@ def get_index_path(bam_path: str):
    for p in [bam_path+'.bai', bam_path.replace('.bam','.bai')]:
        if os.path.exists(p):
            return p
+    for p in [bam_path+'.csi', bam_path.replace('.bam','.csi')]:
+        if os.path.exists(p):
+            return p
    return None


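Note: with this change both BAI and CSI indexes are detected. A usage sketch (the BAM path is a placeholder):

```python
from singlecellmultiomics.bamProcessing.bamFunctions import get_index_path

# Returns the path to an existing .bai or .csi index,
# or None when the BAM has not been indexed yet.
idx = get_index_path('sample.bam')
if idx is None:
    print('sample.bam needs indexing first')
```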
7 changes: 5 additions & 2 deletions singlecellmultiomics/features/features.py
@@ -327,7 +327,7 @@ def loadBED(self, path, ignChr=False, parseBlocks=True):
        # chr1 14662 187832 calJac3:ENSCJAT00000061222.1-1.1 943 -
        #      187832 187832 0 9 5,129,69,110,42,23,133,202,78,
        #      0,38,307,1133,1243,1944,1970,172713,173092,
-        for line in f:
+        for line_idx,line in enumerate(f):
            if line.split()[0] == "track":
                continue
            blockAvail = False
@@ -347,8 +347,11 @@ def loadBED(self, path, ignChr=False, parseBlocks=True):
            elif len(parts)>=4:
                chrom, chromStart, chromEnd, value = parts[:4]
                name = value
+            elif len(parts)==3:
+                chrom, chromStart, chromEnd = parts[:3]
+                name = str(line_idx)
            else:
-                raise ValueError('Could not read the supplied bed file, it has too few columns, expecting at least 4 columns: contig,start,end,value')
+                raise ValueError('Could not read the supplied bed file, it has too few columns, expecting at least 3 columns: contig,start,end[,value]')

            chrom = self.remapKeys.get(chrom, chrom)
            chrom = chrom if ignChr == False else chrom.replace('chr', '')
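Note: `loadBED` now also accepts minimal three-column BED records, falling back to the line index as the feature name. A standalone sketch of that fallback (not the library's full parser):

```python
# 3-column records get an auto-generated name based on their line index.
bed_lines = ["chr1\t100\t200\tmyFeature", "chr1\t500\t900"]

for line_idx, line in enumerate(bed_lines):
    parts = line.split()
    if len(parts) >= 4:
        chrom, start, end, name = parts[:4]
    elif len(parts) == 3:
        chrom, start, end = parts[:3]
        name = str(line_idx)  # fallback feature name
    else:
        raise ValueError('expecting at least 3 columns: contig,start,end[,value]')
    print(chrom, int(start), int(end), name)
```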
1 change: 1 addition & 0 deletions singlecellmultiomics/libraryProcessing/__init__.py
@@ -0,0 +1 @@
+from .sample_sheet import SampleSheet
100 changes: 100 additions & 0 deletions singlecellmultiomics/libraryProcessing/sample_sheet.py
@@ -0,0 +1,100 @@
import json

def mark_autodetect(libname):
    lib = libname.lower()
    if 'k9me3' in lib:
        return 'H3K9me3'
    if 'k4me1' in lib:
        return 'H3K4me1'
    if 'k4me3' in lib:
        return 'H3K4me3'
    if 'k27me3' in lib:
        return 'H3K27me3'
    if 'k36me3' in lib:
        return 'H3K36me3'

    return 'NA'


class SampleSheet:
    """
    Class for reading and writing sample sheet information
    """
    def __init__(self, path=None):
        self.data = {}
        if path is not None:
            self.load_json(path)

    def load_json(self,sample_sheet_location,merge=False):
        """
        Load the data from the specified json file.
        When merge=True the data will be added to the current data (used for merging sheets)
        Duplicates will be overwritten by the last loaded file
        """
        with open(sample_sheet_location) as f:
            if merge:
                for k,v in json.load(f).items():
                    if k not in self.data:
                        self.data[k] = v
                    else:
                        self.data[k].update(v)
            else:
                self.data = json.load(f)

    def get(self,key,default):
        return self.data.get(key,default)

    def __getitem__(self,k):
        return self.data[k]

    def layout_well2index(self, layout_name):
        """ Get the well to index mapping for the supplied layout """
        return {k:tuple(v) for k,v in self['well2coord'][self['layout_format'][layout_name]].items()}

    def index_to_condition(self, layout_name):
        well_to_condition = self['layouts'][layout_name]
        return {self['well2index'][self['layout_format'][layout_name]][k]:v for k,v in well_to_condition.items()}


    def index2well(self, layout_name: str) -> dict:
        """
        Returns a dictionary index->well
        {1: 'A1',
         2: 'A2',
         3: 'A3',
         4: 'A4',
         5: 'A5',
         ...}
        """
        return {v:k for k,v in self['well2index'][self['layout_format'][layout_name]].items() }

    @property
    def libraries(self):

        libraries = set()
        for k in ['libtype','marks']:
            libraries.update( set(self.get(k,dict()).keys()))
        return libraries


    def drop_library(self, library):
        if type(library) is list:
            for l in library:
                self.drop_library(l)
            return
        for main_key in ['marks','condition','libtype','library_layout']:
            if main_key not in self.data:
                continue
            if library in self[main_key]:
                del self[main_key][library]

    def drop_mark(self, mark):
        """ Remove all libraries with the given mark """
        if type(mark) is list:
            for m in mark:
                self.drop_mark(m)
            return
        to_prune = []
        for sample, _mark in self['marks'].items():
            if mark==_mark:
                to_prune.append(sample)

        self.drop_library(to_prune)
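Note: a usage sketch for the new class; the JSON paths and mark names are made-up examples, and the expected keys ('marks', 'libtype', 'well2index', ...) follow the methods above:

```python
from singlecellmultiomics.libraryProcessing import SampleSheet
from singlecellmultiomics.libraryProcessing.sample_sheet import mark_autodetect

# 'sheet.json' / 'extra.json' are placeholder paths.
sheet = SampleSheet('sheet.json')
sheet.load_json('extra.json', merge=True)       # merge a second sheet

print(sheet.libraries)                          # union of 'libtype' and 'marks' keys
print(mark_autodetect('scCHIC-K27me3-plate1'))  # -> 'H3K27me3'

sheet.drop_mark('H3K9me3')                      # prune every H3K9me3 library
```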
