Skip to content

Commit 14e793c

Browse files
authored
Merge pull request #463 from PNNL-CompBio/adding-docstrings
Adding docstrings
2 parents 3f0c671 + ff1f26d commit 14e793c

File tree

2 files changed

+257
-9
lines changed

2 files changed

+257
-9
lines changed

coderdata/dataset/dataset.py

Lines changed: 236 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def __init__(
6464
6565
Parameters
6666
----------
67-
name : str
68-
The name of the dataset that is stored in the object
67+
name : str, optional
68+
The name of the dataset that is stored in the object, by default None
6969
transcriptomics : pd.DataFrame, optional
7070
_description_, by default None
7171
proteomics : pd.DataFrame, optional
@@ -322,7 +322,43 @@ def split_train_other(
322322
random_state: Optional[Union[int,RandomState]]=None,
323323
**kwargs: dict,
324324
) -> TwoWaySplit:
325+
"""
326+
Split the dataset into training and another subset (e.g., testing or validation).
325327
328+
Parameters
329+
----------
330+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
331+
The type of split to perform, by default 'mixed-set'.
332+
- `mixed-set`: A random split, disregarding drug or cancer associations.
333+
- `drug-blind`: Ensures disjoint splits by drug ID.
334+
- `cancer-blind`: Ensures disjoint splits by sample or cancer association.
335+
ratio : tuple[int, int], optional
336+
The ratio of train to other subset sizes, by default (8, 2).
337+
For instance, (8, 2) translates to an 80%-20% split.
338+
stratify_by : str, optional
339+
The column used for stratification, if stratification is needed, by default None.
340+
balance : bool, optional
341+
Whether to balance the splits so that classes are equally represented, by default False.
342+
random_state : int | RandomState | None, optional
343+
A seed for reproducibility of the random split, by default None.
344+
**kwargs : dict
345+
Additional arguments for advanced customization of the split.
346+
347+
Returns
348+
-------
349+
TwoWaySplit
350+
An object containing the train and other subsets as separate datasets.
351+
352+
Notes
353+
-----
354+
This method is a wrapper around the `split_train_other` utility function and
355+
applies the requested split configuration to this dataset (passed as `data=self`).
356+
357+
Examples
358+
--------
359+
>>> split = dataset.split_train_other(split_type='cancer-blind', ratio=(7,3))
360+
>>> print(split.train, split.other)
361+
"""
326362
split = split_train_other(
327363
data=self,
328364
split_type=split_type,
@@ -347,6 +383,47 @@ def split_train_test_validate(
347383
random_state: Optional[Union[int,RandomState]]=None,
348384
**kwargs: dict,
349385
) -> Split:
386+
"""
387+
Split the dataset into training, testing, and validation subsets.
388+
389+
Parameters
390+
----------
391+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
392+
Defines the type of splitting to perform, by default 'mixed-set'.
393+
- `mixed-set`: Data is split randomly, disregarding drug or cancer associations.
394+
- `drug-blind`: Ensures disjoint splits by drug association.
395+
- `cancer-blind`: Ensures disjoint splits by sample or cancer association.
396+
ratio : tuple[int, int, int], optional
397+
Defines the ratio of train, test, and validate sizes, e.g., (8,1,1)
398+
means 80% train, 10% test, 10% validation.
399+
stratify_by : str, optional
400+
Column to use for stratification, if required, by default None.
401+
balance : bool, optional
402+
Whether to balance the splits (equal representation of classes), by default False.
403+
random_state : int | RandomState | None, optional
404+
A random seed for reproducibility, by default None.
405+
**kwargs : dict
406+
Additional arguments for customization of the split logic.
407+
408+
Returns
409+
-------
410+
Split
411+
A Split object containing the training, testing, and validation subsets.
412+
413+
Notes
414+
-----
415+
- This method uses the `split_train_test_validate` utility function internally.
416+
- Ensures disjoint subsets based on the specified splitting criteria, especially
417+
for `drug-blind` and `cancer-blind` splits.
418+
- Includes options for stratifying splits based on a drug response metric.
419+
420+
Examples
421+
--------
422+
>>> split = dataset.split_train_test_validate(
423+
... split_type='drug-blind', ratio=(7,2,1), stratify_by='auc'
424+
... )
425+
>>> print(split.train, split.test, split.validate)
426+
"""
350427
split = split_train_test_validate(
351428
data=self,
352429
split_type=split_type,
@@ -371,7 +448,46 @@ def train_test_validate(
371448
random_state: Optional[Union[int,RandomState]]=None,
372449
**kwargs: dict,
373450
) -> Split:
451+
"""
452+
Split the dataset into training, testing, and validation subsets.
374453
454+
Parameters
455+
----------
456+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
457+
Defines the type of splitting, by default 'mixed-set'.
458+
- `mixed-set`: Random splitting, disregarding drug or cancer associations.
459+
- `drug-blind`: Ensures disjoint splits based on drug associations.
460+
- `cancer-blind`: Ensures disjoint splits based on cancer or sample associations.
461+
ratio : tuple[int, int, int], optional
462+
The proportion of data for train, test, and validation splits
463+
(e.g., (8,1,1) means 80% train, 10% test, 10% validation), by default (8,1,1).
464+
stratify_by : str, optional
465+
The column used for stratification (e.g., a drug response metric), by default None.
466+
balance : bool, optional
467+
Whether to adjust splits to ensure balanced classes, by default False.
468+
random_state : int | RandomState | None, optional
469+
Random seed for reproducibility, by default None.
470+
**kwargs : dict
471+
Additional arguments for customization, passed to the stratification logic.
472+
473+
Returns
474+
-------
475+
Split
476+
An object containing the training, testing, and validation subsets.
477+
478+
Notes
479+
-----
480+
- This method wraps around the `split_train_test_validate` utility function.
481+
- Useful for creating disjoint and optionally stratified splits of the dataset.
482+
- Supports reproducibility through `random_state`.
483+
484+
Examples
485+
--------
486+
>>> split = dataset.train_test_validate(
487+
... split_type='cancer-blind', ratio=(6,2,2), stratify_by='fit_auc'
488+
... )
489+
>>> print(split.train, split.test, split.validate)
490+
"""
375491
split = split_train_test_validate(
376492
data=self,
377493
split_type=split_type,
@@ -386,6 +502,14 @@ def train_test_validate(
386502

387503

388504
def types(self) -> list:
505+
"""
506+
Get the data types available in the dataset.
507+
508+
Returns
509+
-------
510+
list
511+
A list of available data types (e.g., 'transcriptomics', 'proteomics').
512+
"""
389513
data_types = [
390514
'transcriptomics',
391515
'proteomics',
@@ -407,7 +531,18 @@ def types(self) -> list:
407531
return data_types_present
408532

409533
def save(self, path: Path) -> None:
534+
"""
535+
Save the dataset to a file.
536+
537+
Parameters
538+
----------
539+
path : Path
540+
The file path where the dataset will be saved.
410541
542+
Returns
543+
-------
544+
None
545+
"""
411546
with open(path, 'wb') as f_path:
412547
pickle.dump(self, file=f_path)
413548

@@ -422,28 +557,54 @@ def load(
422557
local_path: Union[str,Path]=Path.cwd(),
423558
from_pickle:bool=False
424559
) -> Dataset:
560+
425561
"""
426-
_summary_
562+
Load a dataset from local files.
563+
564+
This function allows loading either from raw data files (e.g., CSV, TSV)
565+
or from a pickled file. The raw data is parsed and indexed into a `Dataset`
566+
object based on predefined types. If pickled data is available, it can be
567+
directly loaded for faster access.
427568
428569
Parameters
429570
----------
430571
name : str
431-
_description_
432-
directory : str | Path, optional
433-
_description_, by default Path.cwd()
572+
The name of the dataset to load (used as a filename prefix).
573+
local_path : str | Path, optional
574+
The local directory where the dataset files are located, by default the current working directory.
575+
from_pickle : bool, optional
576+
If True, attempts to load the dataset from a pickled file, by default False.
434577
435578
Returns
436579
-------
437580
Dataset
438-
_description_
581+
An object containing the loaded dataset with attributes for specific data types like 'transcriptomics',
582+
'proteomics', 'mutations', etc.
439583
440584
Raises
441585
------
442586
OSError
443-
_description_
587+
If the directory specified by `local_path` does not exist.
444588
TypeError
445-
_description_
589+
If the provided path is not a valid path.
590+
FileNotFoundError
591+
If no suitable pickled file is found when `from_pickle=True`.
592+
593+
Notes
594+
-----
595+
- When loading from raw files, supported file formats are `.csv`, `.tsv`, `.csv.gz`, `.tsv.gz`.
596+
- The `genes` dataset is reduced to only those genes relevant to the other subdatasets ('transcriptomics', 'proteomics', etc.).
597+
- When loading from pickle, the function looks for files with extensions `.pkl` or `.pickle`.
598+
599+
Examples
600+
--------
601+
Load a dataset from raw files:
602+
>>> dataset = load(name='my_dataset', local_path='/data/datasets')
603+
604+
Load a dataset from a pickled file:
605+
>>> dataset = load(name='my_dataset', local_path='/data/datasets', from_pickle=True)
446606
"""
607+
447608

448609
data_types_to_load = (
449610
'transcriptomics',
@@ -563,6 +724,22 @@ def format(
563724
remove_na: bool=False,
564725
**kwargs: dict,
565726
):
727+
"""
728+
Format the dataset according to the specified type.
729+
730+
Parameters
731+
----------
732+
data_type : str
733+
The type of data to format (e.g., 'transcriptomics', 'mutations').
734+
remove_na : bool, optional
735+
Whether to remove rows with missing values, by default False.
736+
**kwargs : dict
737+
Additional arguments for customization.
738+
739+
Returns
740+
-------
741+
Formatted data based on the requested type.
742+
"""
566743

567744
if data_type == "transcriptomics":
568745
if data.transcriptomics is None:
@@ -759,6 +936,31 @@ def split_train_other(
759936
random_state: Optional[Union[int,RandomState]]=None,
760937
**kwargs: dict,
761938
):
939+
940+
"""
941+
Split the dataset into training and other subsets.
942+
943+
Parameters
944+
----------
945+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
946+
The type of splitting to perform, by default 'mixed-set'.
947+
ratio : tuple[int, int], optional
948+
Ratio of train to other split sizes, by default (8, 2).
949+
stratify_by : str, optional
950+
Column to use for stratification, if any, by default None.
951+
balance : bool, optional
952+
Whether to balance the split data, by default False.
953+
random_state : int | RandomState | None, optional
954+
Random seed for reproducibility, by default None.
955+
**kwargs : dict
956+
Additional arguments for customization.
957+
958+
Returns
959+
-------
960+
TwoWaySplit
961+
The training dataset and the other dataset produced by the split.
962+
"""
963+
762964
train, other = _split_two_way(
763965
data=data,
764966
split_type=split_type,
@@ -785,6 +987,31 @@ def split_train_test_validate(
785987
random_state: Optional[Union[int,RandomState]]=None,
786988
**kwargs: dict,
787989
) -> Split:
990+
991+
"""
992+
Split the dataset into training, testing, and validation subsets.
993+
994+
Parameters
995+
----------
996+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
997+
The type of splitting strategy to use, by default 'mixed-set'.
998+
ratio : tuple[int, int, int], optional
999+
Ratio for train, test, and validation sizes, by default (8,1,1).
1000+
stratify_by : str, optional
1001+
Column for stratification, if any, by default None.
1002+
balance : bool, optional
1003+
Whether to balance the splits, by default False.
1004+
random_state : int | RandomState | None, optional
1005+
Random seed for reproducible splits, by default None.
1006+
**kwargs : dict
1007+
Additional arguments for customization.
1008+
1009+
Returns
1010+
-------
1011+
Split
1012+
A Split object with train, test, and validation datasets.
1013+
"""
1014+
7881015
# Type checking split_type
7891016
if split_type not in [
7901017
'mixed-set', 'drug-blind', 'cancer-blind'

coderdata/utils/stats.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,27 @@ def plot_2d_respones_metric(
2222
metric2: str,
2323
**kwargs: dict
2424
) -> None:
25+
"""
26+
Plot a 2D histogram of two response metrics from a dataset.
27+
28+
Parameters
29+
----------
30+
data : cd.Dataset
31+
The dataset containing experiment data.
32+
metric1 : str
33+
The first response metric to plot on the y-axis.
34+
metric2 : str
35+
The second response metric to plot on the x-axis.
36+
**kwargs : dict
37+
Additional keyword arguments for customizing the plot:
38+
- `joint_bins` (int): Number of bins for the joint histogram. Default is 50.
39+
- `marginal_bins` (int): Number of bins for the marginal histograms. Default is 50.
40+
41+
Returns
42+
-------
43+
None
44+
Displays the 2D histogram plot.
45+
"""
2546

2647
data_plot = _prepare_2d_hist_data(
2748
data=data.experiments,

0 commit comments

Comments
 (0)