Skip to content

Commit

Permalink
Merge pull request #243 from BuysDB/hd_lazy_load_fux
Browse files Browse the repository at this point in the history
Hd lazy load fix
  • Loading branch information
BuysDB authored Aug 10, 2021
2 parents 0107ef3 + 5cc61aa commit 7dee680
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 11 deletions.
3 changes: 2 additions & 1 deletion singlecellmultiomics/bamProcessing/bamFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
import contextlib
from shutil import which, move
from singlecellmultiomics.utils import BlockZip, Prefetcher
from singlecellmultiomics.utils import BlockZip, Prefetcher, get_file_type
import uuid
import os
from collections import defaultdict, Counter
Expand All @@ -13,6 +13,7 @@
from typing import Generator
from multiprocessing import Pool


def get_index_path(bam_path: str):
"""
Obtain path to bam index
Expand Down
6 changes: 4 additions & 2 deletions singlecellmultiomics/bamProcessing/bamToBigWig.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import argparse
from singlecellmultiomics.bamProcessing import get_contig_sizes
import numpy as np
from singlecellmultiomics.utils.path import get_valid_filename

def bam_to_wig(bam_paths, write_path, bin_size, method='sum', verbose=False, n_threads=None, sample_mapping_function=None):
if verbose:
Expand Down Expand Up @@ -88,15 +89,16 @@ def bam_to_wig(bam_paths, write_path, bin_size, method='sum', verbose=False, n_t
type=str,
help="""Path to a CSV file which contains for every barcode index (SM tag) to what group it belongs.
The CSV file has no header and two columns, the first column contains the sample name,
the second the target sample name. Multiple barcode indices can share the same sample name, this will create a pseudobulk signal"""
the second the target sample name. Multiple barcode indices can share the same sample name, this will create a pseudobulk signal.
Expects a comma as delimiter."""
)

args = argparser.parse_args()
assert args.o.endswith('.bw')

sample_mapping_function = None
if args.pseudobulk_SM_csv is not None:
sm_sample_map = {str(sm):str(sample)
sm_sample_map = {str(sm):get_valid_filename(str(sample))
for sm, sample in pd.read_csv(args.pseudobulk_SM_csv,header=None,index_col=0).iloc[:,0].to_dict().items() }
def sample_mapping_function(s):
return sm_sample_map.get(s)
Expand Down
7 changes: 3 additions & 4 deletions singlecellmultiomics/barcodeFileParser/barcodeFileParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def parse_pending_barcode_file_of_alias(self, alias):

logging.info(f"Loading promised barcode file {alias}")
self.parse_barcode_file( self.pending_files[alias] )
logging.info(f"Performing hamming extension for {alias}")
self.expand(self.hammingDistanceExpansion, alias=alias)
del self.pending_files[alias]

Expand Down Expand Up @@ -135,10 +136,8 @@ def __init__(
logging.info(f"Parsing {barcodeFile}, alias {barcodeFileAlias}")
self.parse_barcode_file(barcodeFile)

if hammingDistanceExpansion > 0:
for alias in list(self.barcodes.keys()):
# print(alias)
self.expand(hammingDistanceExpansion, alias=alias)
if hammingDistanceExpansion > 0:
self.expand(hammingDistanceExpansion, alias=barcodeFileAlias)

def getTargetCount(self, barcodeFileAlias):
return(len(self.barcodes[barcodeFileAlias]), len(self.extendedBarcodes[barcodeFileAlias]))
Expand Down
2 changes: 1 addition & 1 deletion singlecellmultiomics/modularDemultiplexer/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@
barcodeParser = barcodeFileParser.BarcodeParser(
hammingDistanceExpansion=args.hd,
barcodeDirectory=args.barcodeDir,
lazyLoad=("10x_3M-february-2018.bc.gz",)
lazyLoad=("10x_3M-february-2018",)
)

# Setup the index parser
Expand Down
31 changes: 31 additions & 0 deletions singlecellmultiomics/utils/sequtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from array import array
import os


class Reference(Prefetcher):
""" This is a picklable wrapper to pass reference handles """

Expand All @@ -31,7 +32,37 @@ def invert_strand_f(s):
elif s=='-':
return '+'
return '.'

def get_contig_lengths_from_resource(resource ) -> dict:
"""
Extract contig lengts from the supplied resouce (Fasta file or Bam/Cram/Sam )
Returns:
lengths(dict)
"""
if type(resource) is AlignmentFile:
return dict(zip(resource.references, resource.lengths))

elif type(resource) is str:
est_type = get_file_type(resource)

if est_type in (AlignmentFile,FastaFile):
with est_type(resource) as f:
lens = dict(zip(f.references, f.lengths))

return lens


raise NotImplementedError('Unable to extract contig lengths from this resource')


def get_file_type(s: str):
"""Guess the file type of the input string, returns None when the file type can not be determined"""
if s.endswith('.bam') or s.endswith('.cram') or s.endswith('.sam'):
return AlignmentFile
if s.endswith('.fa') or s.endswith('.fasta') or s.endswith('.fa.gz') or s.endswith('.fasta.gz'):
return FastaFile

return None

def create_fasta_dict_file(refpath: str, skip_if_exists=True):
"""Create index dict file for the reference fasta at refpath
Expand Down
2 changes: 1 addition & 1 deletion singlecellmultiomics/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.1.22'
__version__ = '0.1.23'
39 changes: 37 additions & 2 deletions tests/test_demultiplexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def test_UmiBarcodeDemuxMethod_matching_barcode(self):
self.assertEqual( demultiplexed_record[0].tags['bi'], 1) # 1 from version 0.1.12



def test_CS2_NH_matching_barcode(self):

barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/')
Expand Down Expand Up @@ -83,7 +84,7 @@ def test_CHICT(self):

barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/')
index_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/indices/')
barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*')
barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*',)
index_parser = BarcodeParser(index_folder, lazyLoad='*')

seq = 'TATGAGCAATCACACACTATAGTCATTCAGGAGCAGGTTCTTCAGGTTCCCTGTAGTTGTGT'
Expand Down Expand Up @@ -133,14 +134,48 @@ def test_CHICT(self):
indexFileParser=index_parser)

demultiplexed_record = demux.demultiplex([r1,r2])
# The barcode sequence is ACACACTA (first barcode)
# The barcode sequence is AGAGCGCG (26th barcode)
self.assertEqual( demultiplexed_record[0].tags['BC'], 'AGAGCGCG')
self.assertEqual( demultiplexed_record[0].tags['bi'], 26)
self.assertEqual( demultiplexed_record[0].tags['MX'], 'scCHIC384C8U3')
self.assertEqual( demultiplexed_record[0].tags['RX'], 'AAA')
self.assertEqual( demultiplexed_record[0].sequence, seq)



## test the hamming distance expansion
R1_seq = f'AAAAGAGCGGGT{seq}'
r1 = FastqRecord(
'@NS500414:628:H7YVNBGXC:1:11101:15963:1046 1:N:0:GTGAAA',
R1_seq,
'+',
'E'*len(R1_seq)
)
r2 = FastqRecord(
'@NS500414:628:H7YVNBGXC:1:11101:15963:1046 2:N:0:GTGAAA',
'ACCCCAGATCAACGTTGGACNTCNNCNTTNTNCTCNGCACCNNNNCNNNCTTATNCNNNANNNNNNNNNNTNNGN',
'+',
'6AAAAEEAEE/AEEEEEEEE#EE##<#6E#A#EEE#EAEEA####A###EE6EE#E###E##########E##A#'
)

barcode_parser = BarcodeParser(barcode_folder, lazyLoad='*',hammingDistanceExpansion=1)

demux = SCCHIC_384w_c8_u3_cs2(
barcodeFileParser=barcode_parser,
indexFileParser=index_parser
)

demultiplexed_record = demux.demultiplex([r1,r2])
# The barcode sequence is AGAGCGCG (26th barcode)
self.assertEqual( demultiplexed_record[0].tags['BC'], 'AGAGCGCG')
self.assertEqual( demultiplexed_record[0].tags['bi'], 26)
self.assertEqual( demultiplexed_record[0].tags['MX'], 'scCHIC384C8U3')
self.assertEqual( demultiplexed_record[0].tags['RX'], 'AAA')
self.assertEqual( demultiplexed_record[0].sequence, seq)




def test_3DEC_UmiBarcodeDemuxMethod_matching_barcode(self):

barcode_folder = pkg_resources.resource_filename('singlecellmultiomics','modularDemultiplexer/barcodes/')
Expand Down

0 comments on commit 7dee680

Please sign in to comment.