Hoohm · Hoohm · Apr 23, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025
diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
@@ -558,7 +558,7 @@ def main():
 
     # If given, use whitelist for top cells
     if whitelist:
-        top_cells = whitelist
+        top_cells_set = whitelist
         # Add potential missing cell barcodes.
         for missing_cell in whitelist:
             if missing_cell in final_results:
@@ -567,30 +567,34 @@ def main():
                 final_results[missing_cell] = dict()
                 for TAG in ordered_tags_map:
                     final_results[missing_cell][TAG] = Counter()
-                top_cells.add(missing_cell)
+                top_cells_set.add(missing_cell)
     else:
         # Select top cells based on total umis per cell
         top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
-        top_cells = set([pair[0] for pair in top_cells_tuple])
+        top_cells_set = set([pair[0] for pair in top_cells_tuple])
 
     # UMI correction
 
     if args.no_umi_correction:
         # Don't correct
         umis_corrected = 0
-        aberrant_cells = set()
+        aberrant_cells_set = set()
     else:
         # Correct UMIS
-        (final_results, umis_corrected, aberrant_cells) = processing.correct_umis(
+        (final_results, umis_corrected, aberrant_cells_set) = processing.correct_umis(
             final_results=final_results,
             collapsing_threshold=args.umi_threshold,
-            top_cells=top_cells,
+            top_cells=top_cells_set,
             max_umis=20000,
         )
 
     # Remove aberrant cells from the top cells
-    for cell_barcode in aberrant_cells:
-        top_cells.remove(cell_barcode)
+    for cell_barcode in aberrant_cells_set:
+        top_cells_set.remove(cell_barcode)
+
+    # Convert the sets to lists so the cell order is fixed (required for pandas>=2.0.0)
+    top_cells = list(top_cells_set)
+    aberrant_cells = list(aberrant_cells_set)
 
     # Create sparse aberrant cells matrix
     (umi_aberrant_matrix, read_aberrant_matrix) = processing.generate_sparse_matrices(

diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py
@@ -12,7 +12,7 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol
 
     Args:
         sparse_matrix (dok_matrix): Results in a sparse matrix.
-        top_cells (set): Set of cells that are selected for output.
+        top_cells (list): List of cells that are selected for output.
         ordered_tags_map (dict): Tags in order with indexes as values.
         data_type (string): A string definning if the data is umi or read based.
         outfolder (string): Path to the output folder.
@@ -35,11 +35,11 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol
 def write_dense(sparse_matrix, index, columns, outfolder, filename):
     """
     Writes a dense matrix in a csv format
-    
+
     Args:
        sparse_matrix (dok_matrix): Results in a sparse matrix.
        index (list): List of TAGS
-       columns (set): List of cells
+       columns (list): List of cells
        outfolder (str): Output folder
        filename (str): Filename
     """
@@ -59,7 +59,7 @@ def write_unmapped(merged_no_match, top_unknowns, outfolder, filename):
         outfolder (string): Path of the output folder
         filename (string): Name of the output file
     """
-    
+
     top_unmapped = merged_no_match.most_common(top_unknowns)
 
     with open(os.path.join(outfolder, filename),'w') as unknown_file:

diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
@@ -105,7 +105,7 @@ def map_reads(
     Args:
         read1_path (string): Path to R1.fastq.gz
         read2_path (string): Path to R2.fastq.gz
-        chunk_size (int): The number of lines to process 
+        chunk_size (int): The number of lines to process
         tags (dict): A dictionary with the TAGs + TAG Names.
         barcode_slice (slice): A slice for extracting the Barcode portion from the
             sequence.
@@ -234,13 +234,13 @@ def merge_results(parallel_results):
 def correct_umis(final_results, collapsing_threshold, top_cells, max_umis):
     """
     Corrects umi barcodes within same cell/tag groups.
-    
+
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
         collapsing_threshold (int): Max distance between umis.
         top_cells (set): Set of cells to go through.
         max_umis (int): Maximum UMIs to consider for one cluster.
-    
+
     Returns:
         final_results (dict): Same as input but with corrected umis.
         corrected_umis (int): How many umis have been corrected.
@@ -339,14 +339,14 @@ def correct_cells(
 ):
     """
     Corrects cell barcodes.
-    
+
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
         umis_per_cell (Counter): Counter of number of umis per cell.
         collapsing_threshold (int): Max distance between umis.
         expected_cells (int): Number of expected cells.
         ab_map (dict): Dict of the TAGS.
-    
+
     Returns:
         final_results (dict): Same as input but with corrected umis.
         umis_per_cell (Counter): Counter of umis per cell after cell barcode correction
@@ -375,15 +375,15 @@ def correct_cells_whitelist(
 ):
     """
     Corrects cell barcodes.
-    
+
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
         umis_per_cell (Counter): Counter of UMIs per cell.
         whitelist (set): The whitelist reference given by the user.
         collapsing_threshold (int): Max distance between umis.
         ab_map (OrederedDict): Tags in an ordered dict.
 
-    
+
     Returns:
         final_results (dict): Same as input but with corrected umis.
         umis_per_cell (Counter): Updated UMI counts after correction.
@@ -479,4 +479,3 @@ def generate_sparse_matrices(final_results, ordered_tags_map, top_cells):
                     final_results[cell_barcode][TAG].values()
                 )
     return (umi_results_matrix, read_results_matrix)
-
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
         "scipy>=1.1.0",
         "multiprocess>=0.70.6.1",
         "umi_tools==1.0.0",
-        "pytest==4.1.0",
+        "pytest>=8.1.0",
         "pytest-dependency==0.4.0",
         "pandas>=0.23.4",
         "pybktree==1.1",