Skip to content

Commit

Permalink
Merge pull request #14 from mskcc/feature/vector-gen-no-cython
Browse files Browse the repository at this point in the history
Feature/vector gen no cython
  • Loading branch information
john-ziegler authored Jul 20, 2020
2 parents c03848f + a1cb2cf commit bad3910
Show file tree
Hide file tree
Showing 9 changed files with 1,859 additions and 112 deletions.
110 changes: 54 additions & 56 deletions data/generate_vectors/bam2tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
@author: John Ziegler
Memorial Sloan Kettering Cancer Center
Nov. 2018
May 2020
zieglerj@mskcc.org
Expand All @@ -15,76 +15,69 @@

import pysam
import numpy as np
from collections import deque
import traceback


class Bam2Tensor(object):
chromosome = '1'
start = 1
end = 100


def __init__(self, bam_file, normal_bam_file, chromosome, start, end, coverage=50):
def __init__(self, bam_file, normal_bam_file, coverage=50):
self.bam_file = bam_file
self.normal_bam_file = normal_bam_file
self.chromosome = chromosome
self.start = start
self.end = end
self.coverage = coverage

def createIndividualBamTensor(self, read_iterator, cols):
cols = self.end - self.start
result_array = np.zeros((0, cols, 3))
required_coverage = int(self.coverage)

def create_cigar_array(self, cigar_tuples):
    """Expand pysam CIGAR tuples into a flat per-base array of operation codes.

    cigar_tuples is a list of (operation, length) pairs as returned by
    pysam's AlignedSegment.cigartuples, e.g. [(0, 5), (1, 2)] becomes
    array([0, 0, 0, 0, 0, 1, 1]).

    Returns a 1-D numpy ndarray (always, fixing the original's mixed
    return types: it returned an ndarray for a single tuple but an
    np.flatiter for several). An empty tuple list yields an empty array
    instead of the ValueError np.concatenate([]) would raise.
    """
    if not cigar_tuples:
        return np.array([], dtype=int)
    # np.repeat handles the single-tuple and multi-tuple cases uniformly.
    ops, lengths = zip(*cigar_tuples)
    return np.repeat(ops, lengths)



def createIndividualBamTensor(self, read_iterator, start, end, cols):
required_coverage = int(self.coverage)
results = list()
row_counter = 0

try:
for read in read_iterator:
#skip the read if it doesn't totally overlap the region
if(read.reference_start > self.start or read.reference_end < self.end):

if read.reference_end is None:
continue

read_start = read.reference_start
read_end = read.reference_end
is_reverse = int(read.is_reverse)
mapping_q = read.mapping_quality

if(read_start > start or read_end < end):
continue

# get the cigar tuples and convert to a list of cigar values
cigar_tuples = read.cigartuples
cigar_vals_for_read = []
if cigar_tuples is None:
continue

for cigar_tuple in cigar_tuples:
temp = [cigar_tuple[0]]*cigar_tuple[1]
cigar_vals_for_read += temp


cigar_vals_for_read = np.array(cigar_vals_for_read)

cigar_vals_for_read = self.create_cigar_array(cigar_tuples)

# pull out the cigar values for the region we're interested in
# for example, if this read aligns to positions 100 - 110, and our region
# of interest(roi) is 105 - 109 we need to get cigar_vals_for_read[5:9]... to get
# the start and end index (5 and 9 in our example) we do:
# start = max(0, roi_start - read_start)
# end = len(cigar_vals_for_read) - 1 - max(0, read_end - roi_end)
read_start = read.reference_start
read_end = read.reference_end
cigar_start = max(0, self.start - read_start)
cigar_end = len(cigar_vals_for_read) - max(0, read_end - self.end)


ref_locations = list(range(self.start, self.end))
aligned_pairs = read.get_aligned_pairs()
aligned_cigar_pairs = [x for x in aligned_pairs if x[1] in ref_locations]
aligned_cigar_indicies = [x[0] for x in aligned_cigar_pairs]
aligned_cigar_indicies = [-1 if x is None else x for x in aligned_cigar_indicies]

aligned_cigar_indicies = np.array(aligned_cigar_indicies)
cigar_vals_to_fill = cigar_vals_for_read[aligned_cigar_indicies]
# of interest(roi) is 105 - 109 we need to get cigar_vals_for_read[5:9]... =
cigar_start = start - read_start
cigar_end = cigar_start + cols

cigar_vals_to_fill = cigar_vals_for_read[cigar_start:cigar_end]

# fill and add new row
new_row = np.zeros((1, cols, 3))
for i in range(0, len(cigar_vals_to_fill)-1):
new_row[0,i,:] = [cigar_vals_to_fill[i], read.mapping_quality, int(read.is_reverse)]

result_array = np.vstack((result_array, new_row))
for i in range(0, len(cigar_vals_to_fill)):
new_row[0,i,:] = [cigar_vals_to_fill[i], mapping_q, is_reverse]
results.append(new_row)
row_counter += 1

except Exception as e:
Expand All @@ -95,19 +88,24 @@ def createIndividualBamTensor(self, read_iterator, cols):
if row_counter < required_coverage:
return None

return result_array

def createTensor(self):
result_array = np.vstack(results)

read_iterator = self.bam_file.fetch(self.chromosome, self.start, self.end)
cols = self.end - self.start
norm_read_iterator = self.normal_bam_file.fetch(self.chromosome, self.start, self.end)

tumor_result = self.createIndividualBamTensor(read_iterator, cols)
normal_result = self.createIndividualBamTensor(norm_read_iterator, cols)

if tumor_result is None or normal_result is None:
return None
return result_array

return (tumor_result, normal_result)
def createTensor(self, chromosome, start, end):
    """Build the (tumor, normal) tensor pair for one genomic region.

    Fetches reads overlapping [start, end) on `chromosome` from both the
    tumor and normal BAM files and converts each read set with
    createIndividualBamTensor.

    Returns a (tumor_tensor, normal_tensor) tuple, or None when either
    conversion fails, coverage is insufficient, or fetch() raises
    (e.g. malformed region / contig missing from the BAM header).

    Bug fix: the original fell through to `return (tumor_result,
    normal_result)` after an exception, yielding (None, None) — which is
    not None, so the caller's `if result is not None` check accepted it
    as a valid instance. All failure paths now return None.
    """
    try:
        read_iterator = self.bam_file.fetch(chromosome, start, end)
        norm_read_iterator = self.normal_bam_file.fetch(chromosome, start, end)
        cols = end - start
        tumor_result = self.createIndividualBamTensor(read_iterator, start, end, cols)
        normal_result = self.createIndividualBamTensor(norm_read_iterator, start, end, cols)
    except Exception:
        # Keep the full stack instead of the original bare print(e);
        # `traceback` is already imported at the top of this file.
        traceback.print_exc()
        return None

    if tumor_result is None or normal_result is None:
        return None
    return (tumor_result, normal_result)
32 changes: 13 additions & 19 deletions data/generate_vectors/create_data.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
"""
MiMSI Vector Generation Utility
Used to create vectors utilized by the MiMSI model during training and evaluation of MSI status in
Next-Gen Sequencing Results. Reads a list of microsatellite regions provided via command line argument
and creates an instance vector for every region from a tumor/normal pair of bam files.
@author: John Ziegler
Memorial Sloan Kettering Cancer Center
Nov. 2018
May 2020
zieglerj@mskcc.org
(c) 2018 Memorial Sloan Kettering Cancer Center. This program is free software: you may use, redistribute,
and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
either version 3 or later. See the LICENSE file for details
"""

import argparse
Expand All @@ -36,7 +33,7 @@
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))


def process(line, bamfile, normalbamfile, covg):
def process(line, bam2tensor):
"""
Process is the most granular conversion function. It converts a single
microsatellite into a 100 x L x 3 vector, where 100 is the downsampled
Expand All @@ -63,10 +60,8 @@ def process(line, bamfile, normalbamfile, covg):
total_len = end - int(start)
if total_len < 5 or total_len > 40:
return (None, None)
tumor_test = Bam2Tensor(
bamfile, normalbamfile, str(chrom), int(start), int(end), covg
)
return (tumor_test.createTensor(), [str(chrom), int(start), int(end)])

return (bam2tensor.createTensor(str(chrom), int(start), int(end)), [str(chrom), int(start), int(end)])


def process_wrapper(bam_filename, norm_filename, m_list, chunkStart, chunkSize, covg):
Expand All @@ -90,17 +85,18 @@ def process_wrapper(bam_filename, norm_filename, m_list, chunkStart, chunkSize,

bamfile = pysam.AlignmentFile(bam_filename, "rb")
normalbamfile = pysam.AlignmentFile(norm_filename, "rb")
bam2tensor = Bam2Tensor(bamfile, normalbamfile, covg)

for line in lines:
result, loc = process(line, bamfile, normalbamfile, covg)
result, loc = process(line, bam2tensor)
if result is not None:
# We got one!!!
results.append(result)
locations.append(loc)
return (results, locations)


def chunkify(fname, size=10 * 1024 * 1024):
def chunkify(fname, size):
"""
Generic helper method to break up a file into many smaller chunks,
Here we use it to break up the microsatellites list so that we can
Expand All @@ -124,7 +120,6 @@ def convert_bam(bamfile, norm_filename, m_list, covg, cores):
This is the top level function that converts an entire tumor/normal pair of bam files
into a vector collection that MiMSI will utilize in subsequent steps. It is setup to run
in a parallel processing environment, with cores specified as a command line arg in main
The steps it goes through are as follows:
1. chunk the list of microsatellites were interested in so that they can be executed
in parallel
Expand All @@ -138,10 +133,11 @@ def convert_bam(bamfile, norm_filename, m_list, covg, cores):

pool = mp.Pool(int(cores))
jobs = []

file_size = os.path.getsize(m_list)
chunk_size = int( file_size / (cores ) )
try:
# create jobs
for chunkStart, chunkSize in chunkify(m_list):
for chunkStart, chunkSize in chunkify(m_list, chunk_size):
jobs.append(
pool.apply_async(
process_wrapper,
Expand Down Expand Up @@ -169,14 +165,11 @@ def save_bag(tumor_sample, label, data, locations, save_loc, normal_sample=None)
"""
Save the final collection of microsatellite instances to disk to be used
in later stages of MiMSI. Saves each sample in the following manner:
{sample}_{label}_data.npy
{sample}_{label}_locations.npy
This final filename format is important for later stages of the pipeline,
as the data loader will parse the underscore deliminated filename to determine
the sample id and the label. Sample id does need to be unique.
"""
# if no instances that met the coverage threshold were found, return
if len(data) == 0:
Expand Down Expand Up @@ -378,8 +371,8 @@ def main():
# Common arguments
parser.add_argument(
"--microsatellites-list",
default=ROOT_DIR + "/../../utils/microsatellites.list",
help="The list of microsatellites to check in the tumor/normal pair (default: utils/microsatellites.list)",
default=ROOT_DIR + "/../../tests/microsatellites_impact_only.list",
help="The list of microsatellites to check in the tumor/normal pair (default: tests/microsatellites_impact_only.list)",
)
parser.add_argument(
"--save-location",
Expand Down Expand Up @@ -429,3 +422,4 @@ def main():

if __name__ == "__main__":
main()

Loading

0 comments on commit bad3910

Please sign in to comment.