fix(cudf): more skrub upgrade fixes

lmeyerov · lmeyerov · commit afff91d1b300 · 2025-02-02T22:28:05.000-08:00
diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
@@ -960,10 +960,20 @@ def process_dirty_dataframes(
 
         logger.info(":: Encoding DataFrame might take a few minutes ------")
         
+        if 'cudf' in str(getmodule(ndf)):
+            import cudf
+            assert isinstance(ndf, cudf.DataFrame)
+            logger.debug('Coercing cudf to pandas for skrub, with feature_engine=%s', feature_engine)
+            ndf_passthrough = ndf.to_pandas()
+            coercing_to_pandas = True
+        else:
+            ndf_passthrough = ndf
+            coercing_to_pandas = False
+
         try:
-            X_enc = data_encoder.fit_transform(ndf, y)
+            X_enc = data_encoder.fit_transform(ndf_passthrough, y)
         except TypeError:
-            nndf = ndf.copy()
+            nndf = ndf_passthrough.copy()
             object_columns = nndf.select_dtypes(include=['object']).columns
             nndf[object_columns] = nndf[object_columns].astype(str)
             X_enc = data_encoder.fit_transform(nndf, y)
@@ -990,9 +1000,14 @@ def process_dirty_dataframes(
         data_encoder.get_feature_names_out = callThrough(features_transformed)
         
         X_enc = pd.DataFrame(
-            X_enc, columns=features_transformed, index=ndf.index
+            X_enc, columns=features_transformed, index=ndf_passthrough.index
         )
         X_enc = X_enc.fillna(0.0)
+
+        if coercing_to_pandas:
+            import cudf
+            X_enc = cudf.DataFrame.from_pandas(X_enc)
+
     elif not all_numeric and (not has_skrub or feature_engine in ["pandas", "none"]):
         numeric_ndf = ndf.select_dtypes(include=[np.number])  # type: ignore
         logger.warning("-*-*- DataFrame is not numeric and no skrub, dropping non-numeric")
diff --git a/graphistry/tests/layout/ring/test_ring_categorical.py b/graphistry/tests/layout/ring/test_ring_categorical.py
@@ -131,7 +131,7 @@ def test_ring_cudf(self):
         rs = (g._nodes['x'] * g._nodes['x'] + g._nodes['y'] * g._nodes['y']).apply(np.sqrt)
         assert rs.min() == 500
         assert rs.max() == 800
-        assert len(g._complex_encodings and g._complex_encodings['node_encodings']['default']['pointAxisEncoding']['rows']) == 5
+        assert len(g._complex_encodings and g._complex_encodings['node_encodings']['default']['pointAxisEncoding']['rows']) == 4
         for i, row in enumerate(g._complex_encodings['node_encodings']['default']['pointAxisEncoding']['rows']):
             assert row['r'] == 500 + 100 * i
-            assert row['label'] == str(2 + 2 * i)
+            assert row['label'] == ['a', 'bb', 'cc', 'dd'][i]
diff --git a/graphistry/text_utils.py b/graphistry/text_utils.py
@@ -1,4 +1,5 @@
 from typing import TYPE_CHECKING
+from inspect import getmodule
 from logging import getLogger
 import pandas as pd
 
@@ -40,6 +41,8 @@ def build_index(self, angular=False, n_trees=None):
         self.assert_fitted()
         self.assert_features_line_up_with_nodes()
         X = self._get_feature("nodes")
+        if 'cudf' in str(getmodule(X)):
+            X = X.to_pandas()
         self.search_index = FaissVectorSearch(
             X.values
         )  # self._build_search_index(X, angular, n_trees, faiss=False)
@@ -48,6 +51,10 @@ def _query_from_dataframe(self, qdf: pd.DataFrame, top_n: int, thresh: float):
         # Use the loaded featurizers to transform the dataframe
         vect, _ = self.transform(qdf, None, kind="nodes", return_graph=False)
 
+        nodes = self._nodes
+        if 'cudf' in str(getmodule(nodes)):  # cudf frame: hand the search a pandas copy
+            nodes = nodes.to_pandas()
+
-        results = self.search_index.search_df(vect, self._nodes, top_n)
+        results = self.search_index.search_df(vect, nodes, top_n)
         results = results.query(f"{DISTANCE} < {thresh}")
 
@@ -210,15 +217,40 @@ def search_graph(
         # print('shape of edges', edf.shape)
         rdf = df = res._nodes
         # print('shape of nodes', rdf.shape)
+
+        if 'cudf' in str(getmodule(edges)):
+            import cudf
+
+            if not isinstance(rdf, cudf.DataFrame):
+                rdf = cudf.from_pandas(rdf)
+                df = rdf
+
+            concat = cudf.concat
+            cudf_coercion = True
+        else:
+            concat = pd.concat
+            cudf_coercion = False
+
         node = res._node
         indices = rdf[node]
+        if cudf_coercion:
+            import cudf
+            if not isinstance(indices, cudf.Series):
+                indices = cudf.Series.from_pandas(indices)
         src = res._source
         dst = res._destination
         if query != "":
             # run a real query, else return entire graph
             rdf, _ = res.search(query, thresh=thresh, fuzzy=True, top_n=top_n)
             if not rdf.empty:
+                if cudf_coercion:
+                    import cudf
+                    #if not isinstance(indices, cudf.Series):
+                    #    indices = cudf.Series.from_pandas(indices)
+                    if not isinstance(rdf, cudf.DataFrame):
+                        rdf = cudf.from_pandas(rdf)
                 indices = rdf[node]
+
                 # now get edges from indices
                 if broader:  # this will make a broader graph, finding NN in src OR dst
                     edges = edf[(edf[src].isin(indices)) | (edf[dst].isin(indices))]
@@ -236,19 +268,35 @@ def search_graph(
         except:  # for explicit edges
             pass
 
-        found_indices = pd.concat([edges[src], edges[dst], indices], axis=0).unique()
+        #logger.info('type edges=%s, indices=%s', type(edges), type(indices))
+        #raise ValueError(f'stop here: {type(edges)}, {type(indices)}')
+
+        found_indices = concat([edges[src], edges[dst], indices], axis=0).unique()
         emb = None
+        node_feats = res._node_features
+        if cudf_coercion:
+            import cudf
+            if not isinstance(node_feats, cudf.DataFrame):
+                node_feats = cudf.from_pandas(node_feats)
+
+        node_emb = res._node_embedding
+        if cudf_coercion and res._umap is not None:
+            import cudf
+            node_emb = res._node_embedding
+            if not isinstance(node_emb, cudf.DataFrame):
+                node_emb = cudf.from_pandas(node_emb)
+
         try:
             tdf = rdf.iloc[found_indices]
-            feats = res._node_features.iloc[found_indices]  # type: ignore
+            feats = node_feats.iloc[found_indices]  # type: ignore
             if res._umap is not None:
-                emb = res._node_embedding.iloc[found_indices]  # type: ignore
+                emb = node_emb.iloc[found_indices]  # type: ignore
         except Exception:  # for explicit relabeled nodes
             #logger.exception(e)
             tdf = rdf[df[node].isin(found_indices)]
-            feats = res._node_features.loc[tdf.index]  # type: ignore
+            feats = node_feats.loc[tdf.index]  # type: ignore
             if res._umap is not None:
-                emb = res._node_embedding[df[node].isin(found_indices)]  # type: ignore
+                emb = node_emb[df[node].isin(found_indices)]  # type: ignore
         logger.info(f" - Returning edge dataframe of size {edges.shape[0]}")
         # get all the unique nodes
         logger.info(