@@ -64,8 +64,8 @@ def __init__(
6464
6565 Parameters
6666 ----------
67- name : str
68- The name of the dataset that is stored in the object
67+ name : str, optional
68+ The name of the dataset that is stored in the object, by default None
6969 transcriptomics : pd.DataFrame, optional
7070 The transcriptomics data of the dataset, by default None
7171 proteomics : pd.DataFrame, optional
@@ -322,7 +322,43 @@ def split_train_other(
322322 random_state : Optional [Union [int ,RandomState ]]= None ,
323323 ** kwargs : dict ,
324324 ) -> TwoWaySplit :
325+ """
326+ Split the dataset into training and another subset (e.g., testing or validation).
325327
328+ Parameters
329+ ----------
330+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
331+ The type of split to perform, by default 'mixed-set'.
332+ - `mixed-set`: A random split, disregarding drug or cancer associations.
333+ - `drug-blind`: Ensures disjoint splits by drug ID.
334+ - `cancer-blind`: Ensures disjoint splits by sample or cancer association.
335+ ratio : tuple[int, int], optional
336+ The ratio of train to other subset sizes, by default (8, 2).
337+ For instance, (8, 2) translates to an 80%-20% split.
338+ stratify_by : str, optional
339+ The column used for stratification, if stratification is needed, by default None.
340+ balance : bool, optional
341+ Whether to adjust to balanced splits (equal representation of classes), by default False.
342+ random_state : int | RandomState | None, optional
343+ A seed for reproducibility of the random split, by default None.
344+ **kwargs : dict
345+ Additional arguments for advanced customization of the split.
346+
347+ Returns
348+ -------
349+ TwoWaySplit
350+ An object containing the train and other subsets as separate datasets.
351+
352+ Notes
353+ -----
354+ This method is a thin wrapper around the `split_train_other` utility function:
355+ it passes this dataset (`self`) as `data` and forwards the remaining arguments.
356+
357+ Examples
358+ --------
359+ >>> split = dataset.split_train_other(split_type='cancer-blind', ratio=(7,3))
360+ >>> print(split.train, split.other)
361+ """
326362 split = split_train_other (
327363 data = self ,
328364 split_type = split_type ,
@@ -347,6 +383,47 @@ def split_train_test_validate(
347383 random_state : Optional [Union [int ,RandomState ]]= None ,
348384 ** kwargs : dict ,
349385 ) -> Split :
386+ """
387+ Split the dataset into training, testing, and validation subsets.
388+
389+ Parameters
390+ ----------
391+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
392+ Defines the type of splitting to perform, by default 'mixed-set'.
393+ - `mixed-set`: Data is split randomly, disregarding drug or cancer associations.
394+ - `drug-blind`: Ensures disjoint splits by drug association.
395+ - `cancer-blind`: Ensures disjoint splits by sample or cancer association.
396+ ratio : tuple[int, int, int], optional
397+ Defines the ratio of train, test, and validate sizes, e.g., (8,1,1)
398+ means 80% train, 10% test, 10% validation.
399+ stratify_by : str, optional
400+ Column to use for stratification, if required, by default None.
401+ balance : bool, optional
402+ Whether to balance the splits (equal representation of classes), by default False.
403+ random_state : int | RandomState | None, optional
404+ A random seed for reproducibility, by default None.
405+ **kwargs : dict
406+ Additional arguments for customization of the split logic.
407+
408+ Returns
409+ -------
410+ Split
411+ A Split object containing the training, testing, and validation subsets.
412+
413+ Notes
414+ -----
415+ - This method uses the `split_train_test_validate` utility function internally.
416+ - Ensures disjoint subsets based on the specified splitting criteria, especially
417+ for `drug-blind` and `cancer-blind` splits.
418+ - Includes options for stratifying splits based on a drug response metric.
419+
420+ Examples
421+ --------
422+ >>> split = dataset.split_train_test_validate(
423+ ... split_type='drug-blind', ratio=(7,2,1), stratify_by='auc'
424+ ... )
425+ >>> print(split.train, split.test, split.validate)
426+ """
350427 split = split_train_test_validate (
351428 data = self ,
352429 split_type = split_type ,
@@ -371,7 +448,46 @@ def train_test_validate(
371448 random_state : Optional [Union [int ,RandomState ]]= None ,
372449 ** kwargs : dict ,
373450 ) -> Split :
451+ """
452+ Split the dataset into training, testing, and validation subsets.
374453
454+ Parameters
455+ ----------
456+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
457+ Defines the type of splitting, by default 'mixed-set'.
458+ - `mixed-set`: Random splitting, disregarding drug or cancer associations.
459+ - `drug-blind`: Ensures disjoint splits based on drug associations.
460+ - `cancer-blind`: Ensures disjoint splits based on cancer or sample associations.
461+ ratio : tuple[int, int, int], optional
462+ The proportion of data for train, test, and validation splits
463+ (e.g., (8,1,1) means 80% train, 10% test, 10% validation), by default (8,1,1).
464+ stratify_by : str, optional
465+ The column used for stratification (e.g., a drug response metric), by default None.
466+ balance : bool, optional
467+ Whether to adjust splits to ensure balanced classes, by default False.
468+ random_state : int | RandomState | None, optional
469+ Random seed for reproducibility, by default None.
470+ **kwargs : dict
471+ Additional arguments for customization, passed to the stratification logic.
472+
473+ Returns
474+ -------
475+ Split
476+ An object containing the training, testing, and validation subsets.
477+
478+ Notes
479+ -----
480+ - This method wraps around the `split_train_test_validate` utility function.
481+ - Useful for creating disjoint and optionally stratified splits of the dataset.
482+ - Supports reproducibility through `random_state`.
483+
484+ Examples
485+ --------
486+ >>> split = dataset.train_test_validate(
487+ ... split_type='cancer-blind', ratio=(6,2,2), stratify_by='fit_auc'
488+ ... )
489+ >>> print(split.train, split.test, split.validate)
490+ """
375491 split = split_train_test_validate (
376492 data = self ,
377493 split_type = split_type ,
@@ -386,6 +502,14 @@ def train_test_validate(
386502
387503
388504 def types (self ) -> list :
505+ """
506+ Get the data types available in the dataset.
507+
508+ Returns
509+ -------
510+ list
511+ A list of available data types (e.g., 'transcriptomics', 'proteomics').
512+ """
389513 data_types = [
390514 'transcriptomics' ,
391515 'proteomics' ,
@@ -407,7 +531,18 @@ def types(self) -> list:
407531 return data_types_present
408532
409533 def save (self , path : Path ) -> None :
534+ """
535+ Save the dataset to a file.
536+
537+ Parameters
538+ ----------
539+ path : Path
540+ The file path where the dataset will be saved.
410541
542+ Returns
543+ -------
544+ None
545+ """
411546 with open (path , 'wb' ) as f_path :
412547 pickle .dump (self , file = f_path )
413548
@@ -422,28 +557,54 @@ def load(
422557 local_path : Union [str ,Path ]= Path .cwd (),
423558 from_pickle :bool = False
424559 ) -> Dataset :
560+
425561 """
426- _summary_
562+ Load a dataset from local files.
563+
564+ This function allows loading either from raw data files (e.g., CSV, TSV)
565+ or from a pickled file. The raw data is parsed and indexed into a `Dataset`
566+ object based on predefined types. If pickled data is available, it can be
567+ directly loaded for faster access.
427568
428569 Parameters
429570 ----------
430571 name : str
431- _description_
432- directory : str | Path, optional
433- _description_, by default Path.cwd()
572+ The name of the dataset to load (used as a filename prefix).
573+ local_path : str | Path, optional
574+ The local directory where the dataset files are located, by default the current working directory.
575+ from_pickle : bool, optional
576+ If True, attempts to load the dataset from a pickled file, by default False.
434577
435578 Returns
436579 -------
437580 Dataset
438- _description_
581+ An object containing the loaded dataset with attributes for specific data types like 'transcriptomics',
582+ 'proteomics', 'mutations', etc.
439583
440584 Raises
441585 ------
442586 OSError
443- _description_
587+ If the specified directory does not exist.
444588 TypeError
445- _description_
589+ If the provided path is not a valid path.
590+ FileNotFoundError
591+ If no suitable pickled file is found when `from_pickle=True`.
592+
593+ Notes
594+ -----
595+ - When loading from raw files, supported file formats are `.csv`, `.tsv`, `.csv.gz`, `.tsv.gz`.
596+ - The `genes` dataset is restricted to the genes relevant to the other sub-datasets ('transcriptomics', 'proteomics', etc.).
597+ - When loading from pickle, the function looks for files with extensions `.pkl` or `.pickle`.
598+
599+ Examples
600+ --------
601+ Load a dataset from raw files:
602+ >>> dataset = load(name='my_dataset', local_path='/data/datasets')
603+
604+ Load a dataset from a pickled file:
605+ >>> dataset = load(name='my_dataset', local_path='/data/datasets', from_pickle=True)
446606 """
607+
447608
448609 data_types_to_load = (
449610 'transcriptomics' ,
@@ -563,6 +724,22 @@ def format(
563724 remove_na : bool = False ,
564725 ** kwargs : dict ,
565726 ):
727+ """
728+ Format the dataset according to the specified type.
729+
730+ Parameters
731+ ----------
732+ data_type : str
733+ The type of data to format (e.g., 'transcriptomics', 'mutations').
734+ remove_na : bool, optional
735+ Whether to remove rows with missing values, by default False.
736+ **kwargs : dict
737+ Additional arguments for customization.
738+
739+ Returns
740+ -------
741+ Formatted data based on the requested type.
742+ """
566743
567744 if data_type == "transcriptomics" :
568745 if data .transcriptomics is None :
@@ -759,6 +936,31 @@ def split_train_other(
759936 random_state : Optional [Union [int ,RandomState ]]= None ,
760937 ** kwargs : dict ,
761938 ):
939+
940+ """
941+ Split the dataset into training and other subsets.
942+
943+ Parameters
944+ ----------
945+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
946+ The type of splitting to perform, by default 'mixed-set'.
947+ ratio : tuple[int, int], optional
948+ Ratio of train to other split sizes, by default (8, 2).
949+ stratify_by : str, optional
950+ Column to use for stratification, if any, by default None.
951+ balance : bool, optional
952+ Whether to balance the split data, by default False.
953+ random_state : int | RandomState | None, optional
954+ Random seed for reproducibility, by default None.
955+ **kwargs : dict
956+ Additional arguments for customization.
957+
958+ Returns
959+ -------
960+ TwoWaySplit
961+ The resulting datasets in training and other split.
962+ """
963+
762964 train , other = _split_two_way (
763965 data = data ,
764966 split_type = split_type ,
@@ -785,6 +987,31 @@ def split_train_test_validate(
785987 random_state : Optional [Union [int ,RandomState ]]= None ,
786988 ** kwargs : dict ,
787989 ) -> Split :
990+
991+ """
992+ Split the dataset into training, testing, and validation subsets.
993+
994+ Parameters
995+ ----------
996+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
997+ The type of splitting strategy to use, by default 'mixed-set'.
998+ ratio : tuple[int, int, int], optional
999+ Ratio for train, test, and validation sizes, by default (8,1,1).
1000+ stratify_by : str, optional
1001+ Column for stratification, if any, by default None.
1002+ balance : bool, optional
1003+ Whether to balance the splits, by default False.
1004+ random_state : int | RandomState | None, optional
1005+ Random seed for reproducible splits, by default None.
1006+ **kwargs : dict
1007+ Additional arguments for customization.
1008+
1009+ Returns
1010+ -------
1011+ Split
1012+ A Split object with train, test, and validation datasets.
1013+ """
1014+
7881015 # Type checking split_type
7891016 if split_type not in [
7901017 'mixed-set' , 'drug-blind' , 'cancer-blind'
0 commit comments