@@ -82,6 +82,13 @@ def main():
8282 type = int ,
8383 default = 10
8484 )
85+ p_process_datasets .add_argument (
86+ '-b' , '--balance_by' , dest = 'BALANCE_BY' ,
87+ choices = ['auc' , 'fit_auc' ],
88+ default = None ,
89+ help = "Defines if and using which drug response metric the splits "
90+ "should be balanced by."
91+ )
8592 p_process_datasets .add_argument (
8693 '-r' , '--random_seeds' , dest = 'RANDOM_SEEDS' ,
8794 type = _random_seed_list ,
@@ -165,8 +172,16 @@ def process_datasets(args):
165172 experiments = []
166173 logger .debug ("creating list of datasets that contain experiment info ..." )
167174 for data_set in data_sets_names_list :
168- # sarcpdo has different drug response values
169- if data_set == 'sarcpdo' :
175+ experiments_raw = data_sets [data_set ].experiments
176+
177+ # Some datasets don't have drug response data (the experiments
178+ # table)
179+ if experiments_raw is None :
180+ logger .debug (f"NO experiment data for { data_set } " )
181+
182+
183+ # Logic for datasets containing "published_auc" but not "auc"
184+ elif experiments_raw ['dose_response_metric' ].isin (['published_auc' ]).any ():
170185 experiment = data_sets [data_set ].format (
171186 data_type = 'experiments' ,
172187 shape = 'wide' ,
@@ -176,8 +191,29 @@ def process_datasets(args):
176191 )
177192 experiment .rename (columns = {'published_auc' : 'auc' }, inplace = True )
178193 experiments .append (experiment )
179- # not all Datasets have experiments / drug response data
180- elif data_sets [data_set ].experiments is not None :
194+
195+ # Logic for PDX datasets that don't have `auc` but mRECIST (note
196+ # the typo `mRESCIST` currently in the `dose_response_metric` column).
197+ elif experiments_raw ['dose_response_metric' ].isin (['mRESCIST' ]).any ():
198+ experiment = data_sets [data_set ].format (
199+ data_type = 'experiments' ,
200+ shape = 'wide' ,
201+ metrics = [
202+ 'mRESCIST' ,
203+ ],
204+ )
205+ # conversion logic from mRECIST -> auc
206+ experiment .loc [experiment ['mRESCIST' ] == 'CR' , 'mRESCIST' ] = 0.1
207+ experiment .loc [experiment ['mRESCIST' ] == 'PR' , 'mRESCIST' ] = 0.2
208+ experiment .loc [experiment ['mRESCIST' ] == 'SD' , 'mRESCIST' ] = 0.5
209+ experiment .loc [experiment ['mRESCIST' ] == 'PD' , 'mRESCIST' ] = 1.0
210+
211+ experiment .rename (columns = {'mRESCIST' : 'auc' }, inplace = True )
212+ experiments .append (experiment )
213+
214+ # The remaining datasets should have `auc` as
215+ # drug_response_metric available in the `experiments` table
216+ else :
181217 logger .debug (f"experiment data found for { data_set } " )
182218 # formatting existing response data to wide
183219 experiment = data_sets [data_set ].format (
@@ -196,8 +232,6 @@ def process_datasets(args):
196232 ],
197233 )
198234 experiments .append (experiment .dropna ())
199- else :
200- logger .debug (f"NO experiment data for { data_set } " )
201235
202236 # concatenating existing response data and "clean up"
203237 logger .debug ("concatenating experiment data ..." )
@@ -326,7 +360,82 @@ def process_datasets(args):
326360 )
327361 )
328362
363+ #-------------------------------------------------------------------
364+ # create proteomics master table
365+ #-------------------------------------------------------------------
329366
367+ proteomics = merge_master_tables (
368+ args = args ,
369+ data_sets = data_sets ,
370+ data_type = 'proteomics'
371+ )
372+
373+ ####
374+ # Imputation step:
375+ # currently we are imputing by generating the median over all samples
376+ # in which the protein was detected across all datasets.
377+ # The missing values are then back-filled for each protein.
378+ ####
379+ proteomics = (
380+ proteomics
381+ # the proteomics table has to be transposed first (see below)
382+ # due to .fillna not working as expected with axis==1
383+ .T
384+ .fillna (
385+ # the filling of NAs with 'value' is not implemented for
386+ # axis==1, despite what is documented for pandas>2.0.0
387+ value = proteomics .median (axis = 1 , skipna = True ),
388+ axis = 0
389+ )
390+ .T # transpose back into original orientation
391+ )
392+ # merging Ensembl gene ID & gene symbol into the proteomics
393+ # data
394+ proteomics = pd .merge (
395+ proteomics ,
396+ data_gene_names [[
397+ 'entrez_id' ,
398+ 'ensembl_gene_id' ,
399+ 'gene_symbol'
400+ ]],
401+ how = 'left' ,
402+ on = 'entrez_id' ,
403+ )
404+
405+ # moving ensemble_id & gene_symbol columns to the front of the table
406+ # such that when transposing the DataFrame they are row 3 and 2
407+ # respectively
408+ proteomics .insert (
409+ 1 ,
410+ 'gene_symbol' ,
411+ proteomics .pop ('gene_symbol' )
412+ )
413+ proteomics .insert (
414+ 0 ,
415+ 'ensembl_gene_id' ,
416+ proteomics .pop ('ensembl_gene_id' )
417+ )
418+
419+ proteomics = proteomics [proteomics ['entrez_id' ] != 0 ]
420+ proteomics = proteomics .fillna (0 ).T .reset_index ()
421+ for i in range (0 ,3 ):
422+ proteomics .iloc [i ,0 ] = np .nan
423+
424+ # writing the proteomics datatable to '/x_data/*_proteomics.tsv'
425+ outfile_path = args .WORKDIR .joinpath (
426+ "data_out" ,
427+ "x_data" ,
428+ "cancer_proteomics.tsv"
429+ )
430+ (proteomics
431+ .to_csv (
432+ path_or_buf = outfile_path ,
433+ sep = '\t ' ,
434+ header = False ,
435+ index = False
436+ )
437+ )
438+
330439 #-------------------------------------------------------------------
331440 # create copynumber master table & discretized table
332441 #-------------------------------------------------------------------
@@ -688,13 +797,21 @@ def split_data_sets(
688797 args : dict ,
689798 data_sets : dict ,
690799 data_sets_names : list ,
691- response_data : pd .DataFrame
800+ response_data : pd .DataFrame ,
692801 ):
693802
694803 splits_folder = args .WORKDIR .joinpath ('data_out' , 'splits' )
695804 split_type = args .SPLIT_TYPE
696805 ratio = (8 ,1 ,1 )
697- stratify_by = None
806+ stratify_by = args .BALANCE_BY
807+ if stratify_by is not None :
808+ balance = True
809+ quantiles = False
810+ num_classes = 4
811+ else :
812+ balance = False
813+ quantiles = True
814+ num_classes = 4
698815 if args .RANDOM_SEEDS is not None :
699816 random_seeds = args .RANDOM_SEEDS
700817 else :
@@ -743,6 +860,9 @@ def split_data_sets(
743860 split_type = split_type ,
744861 ratio = ratio ,
745862 stratify_by = stratify_by ,
863+ balance = balance ,
864+ quantiles = quantiles ,
865+ num_classes = num_classes ,
746866 random_state = random_seeds [i ]
747867 )
748868 train_keys = (
@@ -869,7 +989,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
869989 for data_set in data_sets :
870990 if data_sets [data_set ].experiments is not None :
871991 if (
872- data_type in ['transcriptomics' , 'copy_number' ] and
992+ data_type in ['transcriptomics' , 'copy_number' , 'proteomics' ] and
873993 getattr (data_sets [data_set ], data_type , None ) is not None
874994 ):
875995 dfs_to_merge .append (
0 commit comments