Skip to content

Commit

Permalink
Merge pull request #14 from mskcc/feature/vector-gen-no-cython
Browse files Browse the repository at this point in the history
Feature/vector gen no cython
  • Loading branch information
john-ziegler authored Jul 20, 2020
2 parents c03848f + a1cb2cf commit bad3910
Show file tree
Hide file tree
Showing 9 changed files with 1,859 additions and 112 deletions.
110 changes: 54 additions & 56 deletions data/generate_vectors/bam2tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
@author: John Ziegler
Memorial Sloan Kettering Cancer Center
Nov. 2018
May 2020
zieglerj@mskcc.org
Expand All @@ -15,76 +15,69 @@

import pysam
import numpy as np
from collections import deque
import traceback


class Bam2Tensor(object):
chromosome = '1'
start = 1
end = 100


def __init__(self, bam_file, normal_bam_file, chromosome, start, end, coverage=50):
def __init__(self, bam_file, normal_bam_file, coverage=50):
self.bam_file = bam_file
self.normal_bam_file = normal_bam_file
self.chromosome = chromosome
self.start = start
self.end = end
self.coverage = coverage

def createIndividualBamTensor(self, read_iterator, cols):
cols = self.end - self.start
result_array = np.zeros((0, cols, 3))
required_coverage = int(self.coverage)

def create_cigar_array(self, cigar_tuples):
    """Expand pysam CIGAR tuples into a flat per-base array of operation codes.

    cigar_tuples is a list of (operation, length) pairs as returned by
    pysam's AlignedSegment.cigartuples, e.g. [(0, 5), (1, 2)] becomes
    array([0, 0, 0, 0, 0, 1, 1]).

    Returns a 1-D numpy ndarray (always, fixing the original's mixed
    return types: it returned an ndarray for a single tuple but an
    np.flatiter for several). An empty tuple list yields an empty array
    instead of the ValueError np.concatenate([]) would raise.
    """
    if not cigar_tuples:
        return np.array([], dtype=int)
    # np.repeat handles the single-tuple and multi-tuple cases uniformly.
    ops, lengths = zip(*cigar_tuples)
    return np.repeat(ops, lengths)



def createIndividualBamTensor(self, read_iterator, start, end, cols):
required_coverage = int(self.coverage)
results = list()
row_counter = 0

try:
for read in read_iterator:
#skip the read if it doesn't totally overlap the region
if(read.reference_start > self.start or read.reference_end < self.end):

if read.reference_end is None:
continue

read_start = read.reference_start
read_end = read.reference_end
is_reverse = int(read.is_reverse)
mapping_q = read.mapping_quality

if(read_start > start or read_end < end):
continue

# get the cigar tuples and convert to a list of cigar values
cigar_tuples = read.cigartuples
cigar_vals_for_read = []
if cigar_tuples is None:
continue

for cigar_tuple in cigar_tuples:
temp = [cigar_tuple[0]]*cigar_tuple[1]
cigar_vals_for_read += temp


cigar_vals_for_read = np.array(cigar_vals_for_read)

cigar_vals_for_read = self.create_cigar_array(cigar_tuples)

# pull out the cigar values for the region we're interested in
# for example, if this read aligns to positions 100 - 110, and our region
# of interest(roi) is 105 - 109 we need to get cigar_vals_for_read[5:9]... to get
# the start and end index (5 and 9 in our example) we do:
# start = max(0, roi_start - read_start)
# end = len(cigar_vals_for_read) - 1 - max(0, read_end - roi_end)
read_start = read.reference_start
read_end = read.reference_end
cigar_start = max(0, self.start - read_start)
cigar_end = len(cigar_vals_for_read) - max(0, read_end - self.end)


ref_locations = list(range(self.start, self.end))
aligned_pairs = read.get_aligned_pairs()
aligned_cigar_pairs = [x for x in aligned_pairs if x[1] in ref_locations]
aligned_cigar_indicies = [x[0] for x in aligned_cigar_pairs]
aligned_cigar_indicies = [-1 if x is None else x for x in aligned_cigar_indicies]

aligned_cigar_indicies = np.array(aligned_cigar_indicies)
cigar_vals_to_fill = cigar_vals_for_read[aligned_cigar_indicies]
# of interest(roi) is 105 - 109 we need to get cigar_vals_for_read[5:9]... =
cigar_start = start - read_start
cigar_end = cigar_start + cols

cigar_vals_to_fill = cigar_vals_for_read[cigar_start:cigar_end]

# fill and add new row
new_row = np.zeros((1, cols, 3))
for i in range(0, len(cigar_vals_to_fill)-1):
new_row[0,i,:] = [cigar_vals_to_fill[i], read.mapping_quality, int(read.is_reverse)]

result_array = np.vstack((result_array, new_row))
for i in range(0, len(cigar_vals_to_fill)):
new_row[0,i,:] = [cigar_vals_to_fill[i], mapping_q, is_reverse]
results.append(new_row)
row_counter += 1

except Exception as e:
Expand All @@ -95,19 +88,24 @@ def createIndividualBamTensor(self, read_iterator, cols):
if row_counter < required_coverage:
return None

return result_array

def createTensor(self):
result_array = np.vstack(results)

read_iterator = self.bam_file.fetch(self.chromosome, self.start, self.end)
cols = self.end - self.start
norm_read_iterator = self.normal_bam_file.fetch(self.chromosome, self.start, self.end)

tumor_result = self.createIndividualBamTensor(read_iterator, cols)
normal_result = self.createIndividualBamTensor(norm_read_iterator, cols)

if tumor_result is None or normal_result is None:
return None
return result_array

return (tumor_result, normal_result)
def createTensor(self, chromosome, start, end):
    """Build the (tumor, normal) tensor pair for one genomic region.

    Fetches reads overlapping [start, end) on `chromosome` from both the
    tumor and normal BAM files and converts each read set with
    createIndividualBamTensor.

    Returns a (tumor_tensor, normal_tensor) tuple, or None when either
    conversion fails, coverage is insufficient, or fetch() raises
    (e.g. malformed region / contig missing from the BAM header).

    Bug fix: the original fell through to `return (tumor_result,
    normal_result)` after an exception, yielding (None, None) — which is
    not None, so the caller's `if result is not None` check accepted it
    as a valid instance. All failure paths now return None.
    """
    try:
        read_iterator = self.bam_file.fetch(chromosome, start, end)
        norm_read_iterator = self.normal_bam_file.fetch(chromosome, start, end)
        cols = end - start
        tumor_result = self.createIndividualBamTensor(read_iterator, start, end, cols)
        normal_result = self.createIndividualBamTensor(norm_read_iterator, start, end, cols)
    except Exception:
        # Keep the full stack instead of the original bare print(e);
        # `traceback` is already imported at the top of this file.
        traceback.print_exc()
        return None

    if tumor_result is None or normal_result is None:
        return None
    return (tumor_result, normal_result)
32 changes: 13 additions & 19 deletions data/generate_vectors/create_data.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
"""
MiMSI Vector Generation Utility
Used to create vectors utilized by the MiMSI model during training and evaluation of MSI status in
Next-Gen Sequencing Results. Reads a list of microsatellite regions provided via command line argument
and creates an instance vector for every region from a tumor/normal pair of bam files.
@author: John Ziegler
Memorial Sloan Kettering Cancer Center
Nov. 2018
May 2020
zieglerj@mskcc.org
(c) 2018 Memorial Sloan Kettering Cancer Center. This program is free software: you may use, redistribute,
and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
either version 3 or later. See the LICENSE file for details
"""

import argparse
Expand All @@ -36,7 +33,7 @@
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))


def process(line, bamfile, normalbamfile, covg):
def process(line, bam2tensor):
"""
Process is the most granular conversion function. It converts a single
microsatellite into a 100 x L x 3 vector, where 100 is the downsampled
Expand All @@ -63,10 +60,8 @@ def process(line, bamfile, normalbamfile, covg):
total_len = end - int(start)
if total_len < 5 or total_len > 40:
return (None, None)
tumor_test = Bam2Tensor(
bamfile, normalbamfile, str(chrom), int(start), int(end), covg
)
return (tumor_test.createTensor(), [str(chrom), int(start), int(end)])

return (bam2tensor.createTensor(str(chrom), int(start), int(end)), [str(chrom), int(start), int(end)])


def process_wrapper(bam_filename, norm_filename, m_list, chunkStart, chunkSize, covg):
Expand All @@ -90,17 +85,18 @@ def process_wrapper(bam_filename, norm_filename, m_list, chunkStart, chunkSize,

bamfile = pysam.AlignmentFile(bam_filename, "rb")
normalbamfile = pysam.AlignmentFile(norm_filename, "rb")
bam2tensor = Bam2Tensor(bamfile, normalbamfile, covg)

for line in lines:
result, loc = process(line, bamfile, normalbamfile, covg)
result, loc = process(line, bam2tensor)
if result is not None:
# We got one!!!
results.append(result)
locations.append(loc)
return (results, locations)


def chunkify(fname, size=10 * 1024 * 1024):
def chunkify(fname, size):
"""
Generic helper method to break up a file into many smaller chunks,
Here we use it to break up the microsatellites list so that we can
Expand All @@ -124,7 +120,6 @@ def convert_bam(bamfile, norm_filename, m_list, covg, cores):
This is the top level function that converts an entire tumor/normal pair of bam files
into a vector collection that MiMSI will utilize in subsequent steps. It is setup to run
in a parallel processing environment, with cores specified as a command line arg in main
The steps it goes through are as follows:
1. chunk the list of microsatellites were interested in so that they can be executed
in parallel
Expand All @@ -138,10 +133,11 @@ def convert_bam(bamfile, norm_filename, m_list, covg, cores):

pool = mp.Pool(int(cores))
jobs = []

file_size = os.path.getsize(m_list)
chunk_size = int( file_size / (cores ) )
try:
# create jobs
for chunkStart, chunkSize in chunkify(m_list):
for chunkStart, chunkSize in chunkify(m_list, chunk_size):
jobs.append(
pool.apply_async(
process_wrapper,
Expand Down Expand Up @@ -169,14 +165,11 @@ def save_bag(tumor_sample, label, data, locations, save_loc, normal_sample=None)
"""
Save the final collection of microsatellite instances to disk to be used
in later stages of MiMSI. Saves each sample in the following manner:
{sample}_{label}_data.npy
{sample}_{label}_locations.npy
This final filename format is important for later stages of the pipeline,
as the data loader will parse the underscore deliminated filename to determine
the sample id and the label. Sample id does need to be unique.
"""
# if no instances that met the coverage threshold were found, return
if len(data) == 0:
Expand Down Expand Up @@ -378,8 +371,8 @@ def main():
# Common arguments
parser.add_argument(
"--microsatellites-list",
default=ROOT_DIR + "/../../utils/microsatellites.list",
help="The list of microsatellites to check in the tumor/normal pair (default: utils/microsatellites.list)",
default=ROOT_DIR + "/../../tests/microsatellites_impact_only.list",
help="The list of microsatellites to check in the tumor/normal pair (default: tests/microsatellites_impact_only.list)",
)
parser.add_argument(
"--save-location",
Expand Down Expand Up @@ -429,3 +422,4 @@ def main():

if __name__ == "__main__":
main()

Loading

0 comments on commit bad3910

Please sign in to comment.