diff --git a/pycytominer/consensus.py b/pycytominer/consensus.py index 7ea19714..de0386e5 100644 --- a/pycytominer/consensus.py +++ b/pycytominer/consensus.py @@ -35,7 +35,7 @@ def consensus( features : list A list of strings corresponding to feature measurement column names in the `profiles` DataFrame. All features listed must be found in `profiles`. - Defaults to "infer". If "infer", then assume cell painting features are those + Defaults to "infer". If "infer", then assume features are from CellProfiler output and prefixed with "Cells", "Nuclei", or "Cytoplasm". output_file : str, optional If provided, will write consensus profiles to file. If not specified, will diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py index 1144a9a4..1d1e4d0d 100644 --- a/pycytominer/cyto_utils/features.py +++ b/pycytominer/cyto_utils/features.py @@ -80,7 +80,7 @@ def infer_cp_features( metadata=False, image_features=False, ): - """Given a dataframe, output features that we expect to be Cell Painting features. + """Given CellProfiler output data read as a DataFrame, output feature column names as a list. Parameters ---------- @@ -90,6 +90,8 @@ def infer_cp_features( Compartments from which Cell Painting features were extracted. metadata : bool, default False Whether or not to infer metadata features. + If metadata is set to True, find column names that begin with the `Metadata_` prefix. + This convention is expected by CellProfiler defaults. image_features : bool, default False Whether or not the profiles contain image features. @@ -115,9 +117,12 @@ def infer_cp_features( population_df.columns.str.startswith("Metadata_") ].tolist() - assert ( # noqa: S101 - len(features) > 0 - ), "No CP features found. Are you sure this dataframe is from CellProfiler?" + if len(features) == 0: + raise ValueError( + "No features or metadata found. Pycytominer expects CellProfiler column names by default. 
" + "If you're using non-CellProfiler data, please do not 'infer' features. " + "Instead, check if the function has a `features` or `meta_features` parameter, and input column names manually." + ) return features @@ -150,7 +155,9 @@ def drop_outlier_features( population_df : pandas.core.frame.DataFrame DataFrame that includes metadata and observation features. features : list of str or str, default "infer" - Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_" + Features present in the population dataframe. If "infer", + then assume CellProfiler feature conventions + (start with "Cells_", "Nuclei_", or "Cytoplasm_") samples : str, default "all" List of samples to perform operation on. The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. An example is diff --git a/pycytominer/cyto_utils/modz.py b/pycytominer/cyto_utils/modz.py index 6e598ed5..6ea4c38c 100644 --- a/pycytominer/cyto_utils/modz.py +++ b/pycytominer/cyto_utils/modz.py @@ -98,9 +98,10 @@ def modz( a string or list of column(s) in the population dataframe that indicate replicate level information features : list, default "infer" - List of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that start with - "Cells_", "Nuclei_", or "Cytoplasm_". + A list of strings corresponding to feature measurement column names in the + `population_df` DataFrame. All features listed must be found in `population_df`. + Defaults to "infer". If "infer", then assume CellProfiler features are those + prefixed with "Cells", "Nuclei", or "Cytoplasm". method : str, default "spearman" indicating which correlation metric to use. 
min_weight : float, default 0.01 diff --git a/pycytominer/cyto_utils/write_gct.py b/pycytominer/cyto_utils/write_gct.py index 812811bd..1feaab38 100644 --- a/pycytominer/cyto_utils/write_gct.py +++ b/pycytominer/cyto_utils/write_gct.py @@ -32,7 +32,7 @@ def write_gct( features : list A list of strings corresponding to feature measurement column names in the `profiles` DataFrame. All features listed must be found in `profiles`. - Defaults to "infer". If "infer", then assume cell painting features are those + Defaults to "infer". If "infer", then assume features are from CellProfiler output and prefixed with "Cells", "Nuclei", or "Cytoplasm". meta_features : list A list of strings corresponding to metadata column names in the `profiles` diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py index c1fd87d1..7fc1efab 100644 --- a/pycytominer/feature_select.py +++ b/pycytominer/feature_select.py @@ -43,10 +43,10 @@ def feature_select( ---------- profiles : pandas.core.frame.DataFrame or file DataFrame or file of profiles. - features : list + features : list, default "infer" A list of strings corresponding to feature measurement column names in the `profiles` DataFrame. All features listed must be found in `profiles`. - Defaults to "infer". If "infer", then assume cell painting features are those + Defaults to "infer". If "infer", then assume CellProfiler features are those prefixed with "Cells", "Nuclei", or "Cytoplasm". image_features: bool, default False Whether the profiles contain image features. diff --git a/pycytominer/normalize.py b/pycytominer/normalize.py index 7a83ca5f..06c55fb6 100644 --- a/pycytominer/normalize.py +++ b/pycytominer/normalize.py @@ -34,14 +34,15 @@ def normalize( features : list A list of strings corresponding to feature measurement column names in the `profiles` DataFrame. All features listed must be found in `profiles`. - Defaults to "infer". If "infer", then assume cell painting features are those + Defaults to "infer". 
If "infer", then assume features are from CellProfiler output and prefixed with "Cells", "Nuclei", or "Cytoplasm". image_features: bool, default False Whether the profiles contain image features. meta_features : list A list of strings corresponding to metadata column names in the `profiles` DataFrame. All features listed must be found in `profiles`. Defaults to "infer". - If "infer", then assume metadata features are those prefixed with "Metadata" + If "infer", then assume CellProfiler metadata features, identified by + column names that begin with the `Metadata_` prefix. samples : str The metadata column values to use as a normalization reference. We often use control samples. The function uses a pd.query() function, so you should @@ -114,7 +115,7 @@ def normalize( normalized_df = normalize( profiles=data_df, features=["x", "y", "z", "zz"], - meta_features="infer", + meta_features=["Metadata_plate", "Metadata_treatment"], samples="Metadata_treatment == 'control'", method="standardize" ) diff --git a/pycytominer/operations/correlation_threshold.py b/pycytominer/operations/correlation_threshold.py index 7c4522ba..a888a012 100644 --- a/pycytominer/operations/correlation_threshold.py +++ b/pycytominer/operations/correlation_threshold.py @@ -20,9 +20,10 @@ def correlation_threshold( population_df : pandas.core.frame.DataFrame DataFrame that includes metadata and observation features. features : list, default "infer" - List of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that start with - "Cells_", "Nuclei_", or "Cytoplasm_". + A list of strings corresponding to feature measurement column names in the + `population_df` DataFrame. All features listed must be found in `population_df`. + Defaults to "infer". If "infer", then assume CellProfiler features are those + prefixed with "Cells", "Nuclei", or "Cytoplasm". samples : str, default "all" List of samples to perform operation on. 
The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. An example is diff --git a/pycytominer/operations/get_na_columns.py b/pycytominer/operations/get_na_columns.py index ad36c377..f288f2cd 100644 --- a/pycytominer/operations/get_na_columns.py +++ b/pycytominer/operations/get_na_columns.py @@ -14,9 +14,10 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05): population_df : pandas.core.frame.DataFrame DataFrame that includes metadata and observation features. features : list, default "infer" - List of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that start with - "Cells_", "Nuclei_", or "Cytoplasm_". + A list of strings corresponding to feature measurement column names in the + `population_df` DataFrame. All features listed must be found in `population_df`. + Defaults to "infer". If "infer", then assume CellProfiler features are those + prefixed with "Cells", "Nuclei", or "Cytoplasm". samples : str, default "all" List of samples to perform operation on. The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. An example is @@ -36,8 +37,8 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05): if features == "infer": features = infer_cp_features(population_df) - else: - population_df = population_df.loc[:, features] + + population_df = population_df.loc[:, features] num_rows = population_df.shape[0] na_prop_df = population_df.isna().sum() / num_rows diff --git a/pycytominer/operations/noise_removal.py b/pycytominer/operations/noise_removal.py index aba1a29e..e3e41923 100644 --- a/pycytominer/operations/noise_removal.py +++ b/pycytominer/operations/noise_removal.py @@ -22,9 +22,10 @@ def noise_removal( The list of unique perturbations corresponding to the rows in population_df. For example, perturb1_well1 and perturb1_well2 would both be "perturb1". 
features : list, default "infer" - List of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that start with - "Cells_", "Nuclei_", or "Cytoplasm_". + A list of strings corresponding to feature measurement column names in the + `population_df` DataFrame. All features listed must be found in `population_df`. + Defaults to "infer". If "infer", then assume CellProfiler features are those + prefixed with "Cells", "Nuclei", or "Cytoplasm". samples : str, default "all" List of samples to perform operation on. The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. An example is diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index 67d3b767..72da751d 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -18,9 +18,10 @@ def variance_threshold( population_df : pandas.core.frame.DataFrame DataFrame that includes metadata and observation features. features : list, default "infer" - List of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that start with - "Cells_", "Nuclei_", or "Cytoplasm_". + A list of strings corresponding to feature measurement column names in the + `population_df` DataFrame. All features listed must be found in `population_df`. + Defaults to "infer". If "infer", then assume CellProfiler features are those + prefixed with "Cells", "Nuclei", or "Cytoplasm". samples : str, default "all" List of samples to perform operation on. The function uses a pd.DataFrame.query() function, so you should structure samples in this fashion. 
An example is diff --git a/tests/test_cyto_utils/test_feature_infer.py b/tests/test_cyto_utils/test_feature_infer.py index df839858..9c8e8e2a 100644 --- a/tests/test_cyto_utils/test_feature_infer.py +++ b/tests/test_cyto_utils/test_feature_infer.py @@ -39,10 +39,10 @@ def test_feature_infer(): def test_feature_infer_nocp(): - with pytest.raises(AssertionError) as nocp: + with pytest.raises(ValueError) as nocp: infer_cp_features(population_df=non_cp_data_df) - assert "No CP features found." in str(nocp.value) + assert "No features or metadata found." in str(nocp.value) def test_metadata_feature_infer(): diff --git a/tests/test_operations/test_correlation_threshold.py b/tests/test_operations/test_correlation_threshold.py index 2ca393cc..9845b99f 100644 --- a/tests/test_operations/test_correlation_threshold.py +++ b/tests/test_operations/test_correlation_threshold.py @@ -75,7 +75,7 @@ def test_correlation_threshold_samples(): def test_correlation_threshold_featureinfer(): - with pytest.raises(AssertionError) as nocp: + with pytest.raises(ValueError) as nocp: correlation_threshold_result = correlation_threshold( population_df=data_df, features="infer", @@ -84,7 +84,7 @@ def test_correlation_threshold_featureinfer(): method="pearson", ) - assert "No CP features found." in str(nocp.value) + assert "No features or metadata found." in str(nocp.value) data_cp_df = data_df.copy() data_cp_df.columns = [f"Cells_{x}" for x in data_df.columns] diff --git a/tests/test_operations/test_get_na_columns.py b/tests/test_operations/test_get_na_columns.py index 9c8bd557..48b5ab71 100644 --- a/tests/test_operations/test_get_na_columns.py +++ b/tests/test_operations/test_get_na_columns.py @@ -67,9 +67,9 @@ def test_get_na_columns_sample(): def test_get_na_columns_featureinfer(): - with pytest.raises(AssertionError) as nocp: + with pytest.raises(ValueError) as nocp: get_na_columns( population_df=data_df, samples="all", features="infer", cutoff=0.1 ) - assert "No CP features found." 
in str(nocp.value) + assert "No features or metadata found." in str(nocp.value) diff --git a/tests/test_operations/test_variance_threshold.py b/tests/test_operations/test_variance_threshold.py index a1a19764..1d7cd481 100644 --- a/tests/test_operations/test_variance_threshold.py +++ b/tests/test_operations/test_variance_threshold.py @@ -102,12 +102,12 @@ def test_variance_threshold(): def test_variance_threshold_featureinfer(): unique_cut = 0.01 - with pytest.raises(AssertionError) as nocp: + with pytest.raises(ValueError) as nocp: excluded_features = variance_threshold( population_df=data_unique_test_df, features="infer", unique_cut=unique_cut ) - assert "No CP features found." in str(nocp.value) + assert "No features or metadata found." in str(nocp.value) data_cp_df = data_unique_test_df.copy() data_cp_df.columns = [f"Cells_{x}" for x in data_unique_test_df.columns]