Skip to content

Commit

Permalink
Merge pull request #216 from BuysDB/fs_additions
Browse files Browse the repository at this point in the history
Fs additions
  • Loading branch information
BuysDB authored Jan 30, 2021
2 parents 591f6ef + 5186748 commit b8bf8fe
Show file tree
Hide file tree
Showing 13 changed files with 293 additions and 22 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
dist: xenial # required for Python >= 3.7
language: python
python:
- "3.6"
- "3.6.1"
- "3.7"
- "3.8"

# command to install dependencies
install:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pysam>=0.15.3
numpy>=1.16.4
numpy>=1.16.5
pandas>=0.25.0
colorama
pysamiterators>=1.8
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@
],

install_requires=[
'pysam>=0.15.3','numpy>=1.16.1','pandas>=0.25.0','colorama','pyBigWig',
'pysam>=0.15.3','numpy>=1.16.5','pandas>=0.25.0','colorama','pyBigWig',
'cutadapt>=2.9',
'pysamiterators>=1.8','more-itertools','matplotlib','tabulate',
'wheel','setuptools>=40.8.0','scikit-learn>=0.21.3','seaborn>=0.11.0', 'statsmodels', 'cached_property',
Expand Down
11 changes: 10 additions & 1 deletion singlecellmultiomics/fragment/chic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,23 @@ class CHICFragment(Fragment):
def __init__(self,
reads,
R1_primer_length=4,
R2_primer_length=6,
R2_primer_length=6, # Is automatically detected now
assignment_radius=0,
umi_hamming_distance=1,
invert_strand=False,
no_umi_cigar_processing=False,
**kwargs
):
self.invert_strand = invert_strand

# Perform random primer autodect,
# When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer
for read in reads:
if read is not None and read.has_tag('MX'):
if read.get_tag('MX')[-1]=='l':
R2_primer_length = 0
break

Fragment.__init__(self,
reads,
assignment_radius=assignment_radius,
Expand Down
20 changes: 16 additions & 4 deletions singlecellmultiomics/fragment/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def __init__(self, reads,
mapping_dir=(False, True),
max_NUC_stretch: int = None,
read_group_format: int = 0, # R1 forward, R2 reverse
library_name: str = None # Overwrites the library name
library_name: str = None, # Overwrites the library name
single_end: bool = False
):
"""
Initialise Fragment
Expand All @@ -74,7 +75,8 @@ def __init__(self, reads,
length of R1 primer, these bases are not taken into account when calculating a consensus
R2_primer_length(int):
length of R2 primer, these bases are not taken into account when calculating a consensus
length of R2 primer, these bases are not taken into account when calculating a consensus, this length is auto-detected/ overwritten when the rS tag is set
tag_definitions(list):
sam TagDefinitions
Expand Down Expand Up @@ -113,6 +115,7 @@ def __init__(self, reads,
self.read_group_format = read_group_format
self.max_NUC_stretch = max_NUC_stretch
self.qcfail = False
self.single_end = single_end

# Span:\
self.span = [None, None, None]
Expand Down Expand Up @@ -348,7 +351,7 @@ def write_tags(self):
if self.has_valid_span():
# Write fragment size:
if self.safe_span:
self.set_meta('fS', self.get_fragment_size())
self.set_meta('fS', self.estimated_length)
self.set_meta('fe', self.span[1])
self.set_meta('fs', self.span[2])
else:
Expand Down Expand Up @@ -405,6 +408,10 @@ def get_random_primer_hash(self):

if R2 is None or R2.query_sequence is None:
return None, None, None

if self.R2_primer_length == 0 and self.random_primer_sequence is None:
return None, None, None

# The read was not mapped
if R2.is_unmapped:
# Guess the orientation does not matter
Expand Down Expand Up @@ -528,8 +535,13 @@ def estimated_length(self) -> int:
Obtain the estimated size of the fragment,
returns None when estimation is not possible
Takes into account removed bases (R2)
Assumes inwards sequencing orientation
Assumes inwards sequencing orientation, except when self.single_end is set
"""
if self.single_end:
if self[0] is None:
return None
return self[0].reference_end - self[0].reference_start


if self.has_R1() and self.has_R2():

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,71 @@ def demultiplex(self, records, **kwargs):
#taggedRecords[0].qualities = taggedRecords[0].qualities[1:]
return taggedRecords

class SCCHIC_384w_c8_u3_direct_ligation(UmiBarcodeDemuxMethod):
def __init__(self, barcodeFileParser, random_primer_read=None,random_primer_length=None, **kwargs):
self.barcodeFileAlias = 'maya_384NLA'
UmiBarcodeDemuxMethod.__init__(
self,
umiRead=0,
umiStart=0,
umiLength=3,
barcodeRead=0,
barcodeStart=3,
barcodeLength=8,
random_primer_read=random_primer_read,
random_primer_length=random_primer_length,
barcodeFileAlias=self.barcodeFileAlias,
barcodeFileParser=barcodeFileParser,
**kwargs)
self.shortName = 'scCHIC384C8U3l'
self.longName = 'Single cell CHIC, 384well CB: 8bp UMI: 3bp, no RP'
self.autoDetectable = False
self.description = '384 well format. 3bp umi followed by 8bp barcode and a single A. R2 does not contain a random primer'

self.sequenceCapture[0] = slice(
self.barcodeLength + self.umiLength + 1,
None) # dont capture the first base

def demultiplex(self, records, **kwargs):

if kwargs.get(
'probe') and records[0].sequence[self.barcodeLength + self.umiLength] != 'T':
raise NonMultiplexable


# add first 2 bases as ligation tag:
ligation_start = self.barcodeLength + self.umiLength
ligation_end = ligation_start + 2
ligation_sequence = records[0].sequence[ligation_start:ligation_end]
ligation_qualities = records[0].qual[ligation_start:ligation_end]

taggedRecords = UmiBarcodeDemuxMethod.demultiplex(
self, records, **kwargs)

taggedRecords[0].addTagByTag(
'lh',
ligation_sequence,
isPhred=False,
make_safe=False)
taggedRecords[0].addTagByTag(
'lq',
ligation_qualities,
isPhred=True,
make_safe=False)
taggedRecords[1].addTagByTag(
'lh',
ligation_sequence,
isPhred=False,
make_safe=False)
taggedRecords[1].addTagByTag(
'lq',
ligation_qualities,
isPhred=True,
make_safe=False)
#taggedRecords[0].sequence = taggedRecords[0].sequence[1:]
#taggedRecords[0].qualities = taggedRecords[0].qualities[1:]
return taggedRecords




Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def __init__(
dm.SCCHIC_384w_c8_u3,
dm.SCCHIC_384w_c8_u3_cs2,
dm.SCCHIC_384w_c8_u3_pdt,
dm.SCCHIC_384w_c8_u3_direct_ligation,

dm.MSPJI_c8_u3,
dm.ScartraceR2,
dm.ScartraceR1,
Expand Down
28 changes: 26 additions & 2 deletions singlecellmultiomics/molecule/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ def write_tags(self):
read.is_duplicate = True

# Write RT reaction tags (rt: rt reaction index, rd rt duplicate index)
# This is only required for fragments which have defined random primers
rt_reaction_index = None
for rt_reaction_index, ( (contig, random_primer_start, random_primer_sequence), frags) in enumerate(
self.get_rt_reactions().items()):
Expand Down Expand Up @@ -1672,6 +1673,22 @@ def aligned_length(self) -> int:
else:
return None

@property
def is_completely_matching(self) -> bool:
"""
Checks if all associated reads are completely mapped:
checks if all cigar operations are M,
Returns True when all cigar operations are M, False otherwise
"""

return all(
(
all(
[ (operation==0)
for operation, amount in read.cigartuples] )
for read in self.iter_reads()))


@property
def estimated_max_length(self) -> int:
"""
Expand Down Expand Up @@ -1777,7 +1794,7 @@ def can_be_yielded(self, chromosome, position):
self.cache_size *
0.5)

def get_rt_reactions(self):
def get_rt_reactions(self) -> dict:
"""Obtain RT reaction dictionary
returns:
Expand Down Expand Up @@ -1824,17 +1841,24 @@ def get_mean_rt_fragment_size(self):
self.get_rt_reaction_fragment_sizes()
)

def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None):
def write_pysam(self, target_file, consensus=False, no_source_reads=False, consensus_name=None, consensus_read_callback=None, consensus_read_callback_kwargs=None):
"""Write all associated reads to the target file
Args:
target_file (pysam.AlignmentFile) : Target file
consensus (bool) : write consensus
no_source_reads (bool) : while in consensus mode, don't write original reads
consensus_read_callback (function) : this function is called with every consensus read as an arguments
consensus_read_callback_kwargs (dict) : arguments to pass to the callback function
"""
if consensus:
reads = self.deduplicate_majority(target_file,
f'molecule_{uuid4()}' if consensus_name is None else consensus_name)
if consensus_read_callback is not None:
if consensus_read_callback_kwargs is not None:
consensus_read_callback(reads, **consensus_read_callback_kwargs)
else:
consensus_read_callback(reads)

for read in reads:
target_file.write(read)
Expand Down
3 changes: 2 additions & 1 deletion singlecellmultiomics/molecule/taps.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from singlecellmultiomics.utils.sequtils import complement
from itertools import product
from matplotlib.pyplot import get_cmap
from copy import copy
complement_trans = str.maketrans('ATGC', 'TACG')


Expand Down Expand Up @@ -58,7 +59,7 @@ def __init__(self, reference=None, reference_variants=None, taps_strand='F', **k
self.context_mapping[True][''.join(['C'] + list(x))] = 'H'
self.context_mapping[False][''.join(['C'] + list(x))] = 'h'

self.colormap = get_cmap('RdYlBu_r')
self.colormap = copy(get_cmap('RdYlBu_r')) # Make a copy
self.colormap.set_bad((0,0,0)) # For reads without C's

def position_to_context(
Expand Down
25 changes: 19 additions & 6 deletions singlecellmultiomics/universalBamTagger/tapsTabulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ def finish_bam(output, args, temp_out):
'-bamout',
type=str,
help="optional (tagged) output BAM path")
argparser.add_argument(
'--allow_single_end',
action='store_true',
help='Allow single end reads')

args = argparser.parse_args()
alignments = pysam.AlignmentFile(args.alignmentfile)

Expand Down Expand Up @@ -152,6 +157,11 @@ def finish_bam(output, args, temp_out):
else:
features = None

fragment_class_args={'umi_hamming_distance': 1,
'no_umi_cigar_processing':False}



molecule_class_args = {
'reference': reference,
'taps': taps,
Expand Down Expand Up @@ -180,6 +190,12 @@ def finish_bam(output, args, temp_out):
'min_phred_score':args.min_phred_score
}


if args.allow_single_end:
# Single end base calls are "unsafe", allow them :
molecule_class_args['allow_unsafe_base_calls'] = True
fragment_class_args['single_end'] = True

s = args.moleculeNameSep
try:
for i, molecule in enumerate(singlecellmultiomics.molecule.MoleculeIterator(
Expand All @@ -188,13 +204,10 @@ def finish_bam(output, args, temp_out):
yield_invalid=(output is not None),
every_fragment_as_molecule=args.every_fragment_as_molecule,
fragment_class=fragment_class,
fragment_class_args={'umi_hamming_distance': 1,
'no_umi_cigar_processing':False,

},
fragment_class_args=fragment_class_args,

molecule_class_args=molecule_class_args,
contig=args.contig)):
molecule_class_args=molecule_class_args,
contig=args.contig)):

molecule.set_meta('mi',i)
if args.head and (i - 1) >= args.head:
Expand Down
28 changes: 26 additions & 2 deletions singlecellmultiomics/utils/sequtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,32 @@ def prefetch(self, contig, start, end):



def get_chromosome_number(chrom: str) -> int:
"""
Get chromosome number (index) of the supplied chromosome:
'1' -> 1, chr1 -> 1, returns -1 when not available, chrM -> -1
"""
try:
return int(chrom.replace('chr',''))
except Exception as e:
return -1

def is_autosome(chrom: str) -> bool:
""" Returns True when the chromsome is an autosomal chromsome,
not an alternative allele, mitochrondrial or sex chromosome
Args:
chrom(str) : chromosome name
Returns:
is_main(bool) : True when the chromsome is an autosome
"""
return is_main_chromosome(chrom) and get_chromosome_number(chrom)!=-1



def is_main_chromosome(chrom):
def is_main_chromosome(chrom: str) -> bool:
""" Returns True when the chromsome is a main chromsome,
not an alternative or other
Expand All @@ -42,7 +66,7 @@ def is_main_chromosome(chrom):
return False
return True

def get_contig_list_from_fasta(fasta_path, with_length=False):
def get_contig_list_from_fasta(fasta_path: str, with_length: bool=False) -> list:
"""Obtain list of contigs froma fasta file,
all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO
Expand Down
2 changes: 1 addition & 1 deletion singlecellmultiomics/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.1.17'
__version__ = '0.1.18'
Loading

0 comments on commit b8bf8fe

Please sign in to comment.