From e9e1e6b49e81d5c44b221dc3ec154ebf1134e155 Mon Sep 17 00:00:00 2001
From: Tobias Krause <tobiaspk1@gmail.com>
Date: Tue, 22 Apr 2025 10:11:48 -0400
Subject: [PATCH 1/6] Enforce cell order to comply with pandas > 1.5

---
 cite_seq_count/__main__.py | 12 ++++++++----
 cite_seq_count/io.py       |  8 ++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index bf1c3ba..52b93c1 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -571,7 +571,7 @@ def main():
     else:
         # Select top cells based on total umis per cell
         top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
-        top_cells = set([pair[0] for pair in top_cells_tuple])
+        top_cells_set = set([pair[0] for pair in top_cells_tuple])
 
     # UMI correction
 
@@ -581,16 +581,20 @@ def main():
         aberrant_cells = set()
     else:
         # Correct UMIS
-        (final_results, umis_corrected, aberrant_cells) = processing.correct_umis(
+        (final_results, umis_corrected, aberrant_cells_set) = processing.correct_umis(
             final_results=final_results,
             collapsing_threshold=args.umi_threshold,
-            top_cells=top_cells,
+            top_cells=top_cells_set,
             max_umis=20000,
         )
 
     # Remove aberrant cells from the top cells
     for cell_barcode in aberrant_cells:
-        top_cells.remove(cell_barcode)
+        top_cells_set.remove(cell_barcode)
+
+    # Ensure cell order (required for pandas>=2.0.0)
+    top_cells = list(top_cells_set)
+    aberrant_cells = list(aberrant_cells_set)
 
     # Create sparse aberrant cells matrix
     (umi_aberrant_matrix, read_aberrant_matrix) = processing.generate_sparse_matrices(
diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py
index 2dc04f0..c13bd88 100644
--- a/cite_seq_count/io.py
+++ b/cite_seq_count/io.py
@@ -12,7 +12,7 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol
 
     Args:
         sparse_matrix (dok_matrix): Results in a sparse matrix.
-        top_cells (set): Set of cells that are selected for output.
+        top_cells (list): List of cells that are selected for output.
         ordered_tags_map (dict): Tags in order with indexes as values.
         data_type (string): A string definning if the data is umi or read based.
         outfolder (string): Path to the output folder.
@@ -35,11 +35,11 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol
 def write_dense(sparse_matrix, index, columns, outfolder, filename):
     """
     Writes a dense matrix in a csv format
-    
+
     Args:
        sparse_matrix (dok_matrix): Results in a sparse matrix.
        index (list): List of TAGS
-       columns (set): List of cells
+       columns (list): List of cells
        outfolder (str): Output folder
        filename (str): Filename
     """
@@ -59,7 +59,7 @@ def write_unmapped(merged_no_match, top_unknowns, outfolder, filename):
         outfolder (string): Path of the output folder
         filename (string): Name of the output file
     """
-    
+
     top_unmapped = merged_no_match.most_common(top_unknowns)
 
     with open(os.path.join(outfolder, filename),'w') as unknown_file:

From e5fbcd254ab4eb4725658ca699dcd598b0fff866 Mon Sep 17 00:00:00 2001
From: Tobias Krause <tobiaspk1@gmail.com>
Date: Tue, 22 Apr 2025 10:12:19 -0400
Subject: [PATCH 2/6] Update related docstrings and package requirements to
 support tests on python >3.9

---
 cite_seq_count/processing.py | 15 +++++++--------
 setup.py                     |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py
index 57b35b7..d498b40 100644
--- a/cite_seq_count/processing.py
+++ b/cite_seq_count/processing.py
@@ -105,7 +105,7 @@ def map_reads(
     Args:
         read1_path (string): Path to R1.fastq.gz
         read2_path (string): Path to R2.fastq.gz
-        chunk_size (int): The number of lines to process 
+        chunk_size (int): The number of lines to process
         tags (dict): A dictionary with the TAGs + TAG Names.
         barcode_slice (slice): A slice for extracting the Barcode portion from the
             sequence.
@@ -234,13 +234,13 @@ def merge_results(parallel_results):
 def correct_umis(final_results, collapsing_threshold, top_cells, max_umis):
     """
     Corrects umi barcodes within same cell/tag groups.
-    
+
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
         collapsing_threshold (int): Max distance between umis.
         top_cells (set): Set of cells to go through.
         max_umis (int): Maximum UMIs to consider for one cluster.
-    
+
     Returns:
         final_results (dict): Same as input but with corrected umis.
         corrected_umis (int): How many umis have been corrected.
@@ -339,14 +339,14 @@ def correct_cells(
 ):
     """
     Corrects cell barcodes.
-    
+
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
         umis_per_cell (Counter): Counter of number of umis per cell.
         collapsing_threshold (int): Max distance between umis.
         expected_cells (int): Number of expected cells.
         ab_map (dict): Dict of the TAGS.
-    
+
     Returns:
         final_results (dict): Same as input but with corrected umis.
         umis_per_cell (Counter): Counter of umis per cell after cell barcode correction
@@ -375,7 +375,7 @@ def correct_cells_whitelist(
 ):
     """
     Corrects cell barcodes.
-    
+
     Args:
         final_results (dict): Dict of dict of Counters with mapping results.
         umis_per_cell (Counter): Counter of UMIs per cell.
@@ -383,7 +383,7 @@ def correct_cells_whitelist(
         collapsing_threshold (int): Max distance between umis.
         ab_map (OrederedDict): Tags in an ordered dict.
 
-    
+
     Returns:
         final_results (dict): Same as input but with corrected umis.
         umis_per_cell (Counter): Updated UMI counts after correction.
@@ -479,4 +479,3 @@ def generate_sparse_matrices(final_results, ordered_tags_map, top_cells):
                     final_results[cell_barcode][TAG].values()
                 )
     return (umi_results_matrix, read_results_matrix)
-
diff --git a/setup.py b/setup.py
index 6423bc3..bfb38e8 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
         "scipy>=1.1.0",
         "multiprocess>=0.70.6.1",
         "umi_tools==1.0.0",
-        "pytest==4.1.0",
+        "pytest>=8.1.0",
         "pytest-dependency==0.4.0",
         "pandas>=0.23.4",
         "pybktree==1.1",

From 059e2d0695d50e9551dbfed54ace502f8095965d Mon Sep 17 00:00:00 2001
From: Tobias Krause <tobiaspk1@gmail.com>
Date: Tue, 22 Apr 2025 10:22:36 -0400
Subject: [PATCH 3/6] version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index bfb38e8..1fa4278 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="CITE-seq-Count",
-    version="1.4.5",
+    version="1.4.5.pr.197",
     author="Roelli Patrick",
     author_email="patrick.roelli@gmail.com",
     description="A python package to map reads from CITE-seq or hashing data for single cell experiments",

From 6fae16d331b958fff133d71e47ff2139f43989c4 Mon Sep 17 00:00:00 2001
From: Tobias Krause <tobiaspk1@gmail.com>
Date: Tue, 22 Apr 2025 11:50:55 -0400
Subject: [PATCH 4/6] Bugfix to support custom whitelist with new top_cells_set

---
 cite_seq_count/__main__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 52b93c1..1589b8f 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -558,7 +558,7 @@ def main():
 
     # If given, use whitelist for top cells
     if whitelist:
-        top_cells = whitelist
+        top_cells_set = whitelist
         # Add potential missing cell barcodes.
         for missing_cell in whitelist:
             if missing_cell in final_results:
@@ -567,7 +567,7 @@ def main():
                 final_results[missing_cell] = dict()
                 for TAG in ordered_tags_map:
                     final_results[missing_cell][TAG] = Counter()
-                top_cells.add(missing_cell)
+                top_cells_set.add(missing_cell)
     else:
         # Select top cells based on total umis per cell
         top_cells_tuple = umis_per_cell.most_common(args.expected_cells)

From 489f831fb420b5cc904cbf844e87c9f0b73a511b Mon Sep 17 00:00:00 2001
From: Tobias Krause <tobiaspk1@gmail.com>
Date: Tue, 22 Apr 2025 11:51:19 -0400
Subject: [PATCH 5/6] Update version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1fa4278..bfb38e8 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="CITE-seq-Count",
-    version="1.4.5.pr.197",
+    version="1.4.5",
     author="Roelli Patrick",
     author_email="patrick.roelli@gmail.com",
     description="A python package to map reads from CITE-seq or hashing data for single cell experiments",

From f1be95cf1192d99fdb8a2ad68240af44ab1ecfa8 Mon Sep 17 00:00:00 2001
From: Tobias Krause <tobiaspk1@gmail.com>
Date: Tue, 22 Apr 2025 13:26:19 -0400
Subject: [PATCH 6/6] Debug variable assignment

---
 cite_seq_count/__main__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py
index 1589b8f..274ff9e 100755
--- a/cite_seq_count/__main__.py
+++ b/cite_seq_count/__main__.py
@@ -578,7 +578,7 @@ def main():
     if args.no_umi_correction:
         # Don't correct
         umis_corrected = 0
-        aberrant_cells = set()
+        aberrant_cells_set = set()
     else:
         # Correct UMIS
         (final_results, umis_corrected, aberrant_cells_set) = processing.correct_umis(
@@ -589,7 +589,7 @@ def main():
         )
 
     # Remove aberrant cells from the top cells
-    for cell_barcode in aberrant_cells:
+    for cell_barcode in aberrant_cells_set:
         top_cells_set.remove(cell_barcode)
 
     # Ensure cell order (required for pandas>=2.0.0)