From e9e1e6b49e81d5c44b221dc3ec154ebf1134e155 Mon Sep 17 00:00:00 2001 From: Tobias Krause Date: Tue, 22 Apr 2025 10:11:48 -0400 Subject: [PATCH 1/6] Enforce cell order to comply with pandas > 1.5 --- cite_seq_count/__main__.py | 12 ++++++++---- cite_seq_count/io.py | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index bf1c3ba..52b93c1 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -571,7 +571,7 @@ def main(): else: # Select top cells based on total umis per cell top_cells_tuple = umis_per_cell.most_common(args.expected_cells) - top_cells = set([pair[0] for pair in top_cells_tuple]) + top_cells_set = set([pair[0] for pair in top_cells_tuple]) # UMI correction @@ -581,16 +581,20 @@ def main(): aberrant_cells = set() else: # Correct UMIS - (final_results, umis_corrected, aberrant_cells) = processing.correct_umis( + (final_results, umis_corrected, aberrant_cells_set) = processing.correct_umis( final_results=final_results, collapsing_threshold=args.umi_threshold, - top_cells=top_cells, + top_cells=top_cells_set, max_umis=20000, ) # Remove aberrant cells from the top cells for cell_barcode in aberrant_cells: - top_cells.remove(cell_barcode) + top_cells_set.remove(cell_barcode) + + # Ensure cell order (required for pandas>=2.0.0) + top_cells = list(top_cells_set) + aberrant_cells = list(aberrant_cells_set) # Create sparse aberrant cells matrix (umi_aberrant_matrix, read_aberrant_matrix) = processing.generate_sparse_matrices( diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py index 2dc04f0..c13bd88 100644 --- a/cite_seq_count/io.py +++ b/cite_seq_count/io.py @@ -12,7 +12,7 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol Args: sparse_matrix (dok_matrix): Results in a sparse matrix. - top_cells (set): Set of cells that are selected for output. + top_cells (list): List of cells that are selected for output. 
ordered_tags_map (dict): Tags in order with indexes as values. data_type (string): A string definning if the data is umi or read based. outfolder (string): Path to the output folder. @@ -35,11 +35,11 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol def write_dense(sparse_matrix, index, columns, outfolder, filename): """ Writes a dense matrix in a csv format - + Args: sparse_matrix (dok_matrix): Results in a sparse matrix. index (list): List of TAGS - columns (set): List of cells + columns (list): List of cells outfolder (str): Output folder filename (str): Filename """ @@ -59,7 +59,7 @@ def write_unmapped(merged_no_match, top_unknowns, outfolder, filename): outfolder (string): Path of the output folder filename (string): Name of the output file """ - + top_unmapped = merged_no_match.most_common(top_unknowns) with open(os.path.join(outfolder, filename),'w') as unknown_file: From e5fbcd254ab4eb4725658ca699dcd598b0fff866 Mon Sep 17 00:00:00 2001 From: Tobias Krause Date: Tue, 22 Apr 2025 10:12:19 -0400 Subject: [PATCH 2/6] Update related docstrings and package requirements to support tests on python >3.9 --- cite_seq_count/processing.py | 15 +++++++-------- setup.py | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py index 57b35b7..d498b40 100644 --- a/cite_seq_count/processing.py +++ b/cite_seq_count/processing.py @@ -105,7 +105,7 @@ def map_reads( Args: read1_path (string): Path to R1.fastq.gz read2_path (string): Path to R2.fastq.gz - chunk_size (int): The number of lines to process + chunk_size (int): The number of lines to process tags (dict): A dictionary with the TAGs + TAG Names. barcode_slice (slice): A slice for extracting the Barcode portion from the sequence. @@ -234,13 +234,13 @@ def merge_results(parallel_results): def correct_umis(final_results, collapsing_threshold, top_cells, max_umis): """ Corrects umi barcodes within same cell/tag groups. 
- + Args: final_results (dict): Dict of dict of Counters with mapping results. collapsing_threshold (int): Max distance between umis. top_cells (set): Set of cells to go through. max_umis (int): Maximum UMIs to consider for one cluster. - + Returns: final_results (dict): Same as input but with corrected umis. corrected_umis (int): How many umis have been corrected. @@ -339,14 +339,14 @@ def correct_cells( ): """ Corrects cell barcodes. - + Args: final_results (dict): Dict of dict of Counters with mapping results. umis_per_cell (Counter): Counter of number of umis per cell. collapsing_threshold (int): Max distance between umis. expected_cells (int): Number of expected cells. ab_map (dict): Dict of the TAGS. - + Returns: final_results (dict): Same as input but with corrected umis. umis_per_cell (Counter): Counter of umis per cell after cell barcode correction @@ -375,7 +375,7 @@ def correct_cells_whitelist( ): """ Corrects cell barcodes. - + Args: final_results (dict): Dict of dict of Counters with mapping results. umis_per_cell (Counter): Counter of UMIs per cell. @@ -383,7 +383,7 @@ def correct_cells_whitelist( collapsing_threshold (int): Max distance between umis. ab_map (OrederedDict): Tags in an ordered dict. - + Returns: final_results (dict): Same as input but with corrected umis. umis_per_cell (Counter): Updated UMI counts after correction. 
@@ -479,4 +479,3 @@ def generate_sparse_matrices(final_results, ordered_tags_map, top_cells): final_results[cell_barcode][TAG].values() ) return (umi_results_matrix, read_results_matrix) - diff --git a/setup.py b/setup.py index 6423bc3..bfb38e8 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ "scipy>=1.1.0", "multiprocess>=0.70.6.1", "umi_tools==1.0.0", - "pytest==4.1.0", + "pytest>=8.1.0", "pytest-dependency==0.4.0", "pandas>=0.23.4", "pybktree==1.1", From 059e2d0695d50e9551dbfed54ace502f8095965d Mon Sep 17 00:00:00 2001 From: Tobias Krause Date: Tue, 22 Apr 2025 10:22:36 -0400 Subject: [PATCH 3/6] version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bfb38e8..1fa4278 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="CITE-seq-Count", - version="1.4.5", + version="1.4.5.pr.197", author="Roelli Patrick", author_email="patrick.roelli@gmail.com", description="A python package to map reads from CITE-seq or hashing data for single cell experiments", From 6fae16d331b958fff133d71e47ff2139f43989c4 Mon Sep 17 00:00:00 2001 From: Tobias Krause Date: Tue, 22 Apr 2025 11:50:55 -0400 Subject: [PATCH 4/6] Bugfix to support custom whitelist with new top_cells_set --- cite_seq_count/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index 52b93c1..1589b8f 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -558,7 +558,7 @@ def main(): # If given, use whitelist for top cells if whitelist: - top_cells = whitelist + top_cells_set = whitelist # Add potential missing cell barcodes. 
for missing_cell in whitelist: if missing_cell in final_results: @@ -567,7 +567,7 @@ def main(): final_results[missing_cell] = dict() for TAG in ordered_tags_map: final_results[missing_cell][TAG] = Counter() - top_cells.add(missing_cell) + top_cells_set.add(missing_cell) else: # Select top cells based on total umis per cell top_cells_tuple = umis_per_cell.most_common(args.expected_cells) From 489f831fb420b5cc904cbf844e87c9f0b73a511b Mon Sep 17 00:00:00 2001 From: Tobias Krause Date: Tue, 22 Apr 2025 11:51:19 -0400 Subject: [PATCH 5/6] Update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1fa4278..bfb38e8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="CITE-seq-Count", - version="1.4.5.pr.197", + version="1.4.5", author="Roelli Patrick", author_email="patrick.roelli@gmail.com", description="A python package to map reads from CITE-seq or hashing data for single cell experiments", From f1be95cf1192d99fdb8a2ad68240af44ab1ecfa8 Mon Sep 17 00:00:00 2001 From: Tobias Krause Date: Tue, 22 Apr 2025 13:26:19 -0400 Subject: [PATCH 6/6] Debug variable assignment --- cite_seq_count/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index 1589b8f..274ff9e 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -578,7 +578,7 @@ def main(): if args.no_umi_correction: # Don't correct umis_corrected = 0 - aberrant_cells = set() + aberrant_cells_set = set() else: # Correct UMIS (final_results, umis_corrected, aberrant_cells_set) = processing.correct_umis( @@ -589,7 +589,7 @@ def main(): ) # Remove aberrant cells from the top cells - for cell_barcode in aberrant_cells: + for cell_barcode in aberrant_cells_set: top_cells_set.remove(cell_barcode) # Ensure cell order (required for pandas>=2.0.0)