Skip to content

Commit 6dd5857

Browse files
authored
Merge pull request #451 from PNNL-CompBio/390-update-improve-wrapper-to-output-proteomics
improvements to improve wrapper
2 parents 17392c1 + aae9160 commit 6dd5857

File tree

1 file changed

+129
-9
lines changed

1 file changed

+129
-9
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 129 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ def main():
8282
type=int,
8383
default=10
8484
)
85+
p_process_datasets.add_argument(
86+
'-b', '--balance_by', dest='BALANCE_BY',
87+
choices=['auc', 'fit_auc'],
88+
default=None,
89+
help="Defines if and using which drug response metric the splits "
90+
"should be balanced by."
91+
)
8592
p_process_datasets.add_argument(
8693
'-r', '--random_seeds', dest='RANDOM_SEEDS',
8794
type=_random_seed_list,
@@ -165,8 +172,16 @@ def process_datasets(args):
165172
experiments = []
166173
logger.debug("creating list of datasets that contain experiment info ...")
167174
for data_set in data_sets_names_list:
168-
# sarcpdo has different drug response values
169-
if data_set == 'sarcpdo':
175+
experiments_raw = data_sets[data_set].experiments
176+
177+
# Some datasets don't have drug response data (the experiments
178+
# table)
179+
if experiments_raw is None:
180+
logger.debug(f"NO experiment data for {data_set}")
181+
182+
183+
# Logic for datasets containing "published_auc" but not "auc"
184+
elif experiments_raw['dose_response_metric'].isin(['published_auc']).any():
170185
experiment = data_sets[data_set].format(
171186
data_type='experiments',
172187
shape='wide',
@@ -176,8 +191,29 @@ def process_datasets(args):
176191
)
177192
experiment.rename(columns={'published_auc': 'auc'}, inplace=True)
178193
experiments.append(experiment)
179-
# not all Datasets have experiments / drug response data
180-
elif data_sets[data_set].experiments is not None:
194+
195+
# Logic for PDX datasets that don't have `auc` but mRECIST (note
196+
# the typo `mRESCIST` currently in the `dose_response_metric` column).
197+
elif experiments_raw['dose_response_metric'].isin(['mRESCIST']).any():
198+
experiment = data_sets[data_set].format(
199+
data_type='experiments',
200+
shape='wide',
201+
metrics=[
202+
'mRESCIST',
203+
],
204+
)
205+
# conversion logic from mRECIST -> auc
206+
experiment.loc[experiment['mRESCIST'] == 'CR', 'mRESCIST'] = 0.1
207+
experiment.loc[experiment['mRESCIST'] == 'PR', 'mRESCIST'] = 0.2
208+
experiment.loc[experiment['mRESCIST'] == 'SD', 'mRESCIST'] = 0.5
209+
experiment.loc[experiment['mRESCIST'] == 'PD', 'mRESCIST'] = 1.0
210+
211+
experiment.rename(columns={'mRESCIST': 'auc'}, inplace=True)
212+
experiments.append(experiment)
213+
214+
# The remaining datasets should have `auc` as
215+
# `dose_response_metric` available in the `experiments` table
216+
else:
181217
logger.debug(f"experiment data found for {data_set}")
182218
# formatting existing response data to wide
183219
experiment = data_sets[data_set].format(
@@ -196,8 +232,6 @@ def process_datasets(args):
196232
],
197233
)
198234
experiments.append(experiment.dropna())
199-
else:
200-
logger.debug(f"NO experiment data for {data_set}")
201235

202236
# concatenating existing response data and "clean up"
203237
logger.debug("concatenating experiment data ...")
@@ -326,7 +360,82 @@ def process_datasets(args):
326360
)
327361
)
328362

363+
#-------------------------------------------------------------------
364+
# create proteomics master table
365+
#-------------------------------------------------------------------
329366

367+
proteomics = merge_master_tables(
368+
args=args,
369+
data_sets=data_sets,
370+
data_type='proteomics'
371+
)
372+
373+
####
374+
# Imputation step:
375+
# currently we are imputing with the median over all samples
376+
# in which the protein was detected across all datasets.
377+
# The missing values are then filled in for each protein.
378+
####
379+
proteomics = (
380+
proteomics
381+
# the proteomics table has to be transposed first (and back again
382+
# below) because .fillna does not work as expected with axis==1
383+
.T
384+
.fillna(
385+
# the filling of NAs with 'value' is not implemented for
386+
# axis==1, despite what is documented for pandas>2.0.0
387+
value=proteomics.median(axis=1, skipna=True),
388+
axis=0
389+
)
390+
.T # transpose back into original orientation
391+
)
392+
# merging Ensembl gene ID & gene symbol into the proteomics
393+
# data
394+
proteomics = pd.merge(
395+
proteomics,
396+
data_gene_names[[
397+
'entrez_id',
398+
'ensembl_gene_id',
399+
'gene_symbol'
400+
]],
401+
how='left',
402+
on='entrez_id',
403+
)
404+
405+
# moving ensembl_gene_id & gene_symbol columns to the front of the table
406+
# such that when transposing the DataFrame they are row 3 and 2
407+
# respectively
408+
proteomics.insert(
409+
1,
410+
'gene_symbol',
411+
proteomics.pop('gene_symbol')
412+
)
413+
proteomics.insert(
414+
0,
415+
'ensembl_gene_id',
416+
proteomics.pop('ensembl_gene_id')
417+
)
418+
419+
proteomics = proteomics[proteomics['entrez_id'] != 0]
420+
proteomics = proteomics.fillna(0).T.reset_index()
421+
for i in range(0,3):
422+
proteomics.iloc[i,0] = np.nan
423+
424+
# writing the proteomics data table to 'data_out/x_data/cancer_proteomics.tsv'
425+
outfile_path = args.WORKDIR.joinpath(
426+
"data_out",
427+
"x_data",
428+
"cancer_proteomics.tsv"
429+
)
430+
(proteomics
431+
.to_csv(
432+
path_or_buf=outfile_path,
433+
sep='\t',
434+
header=False,
435+
index=False
436+
)
437+
)
438+
330439
#-------------------------------------------------------------------
331440
# create copynumber master table & discretized table
332441
#-------------------------------------------------------------------
@@ -688,13 +797,21 @@ def split_data_sets(
688797
args: dict,
689798
data_sets: dict,
690799
data_sets_names: list,
691-
response_data: pd.DataFrame
800+
response_data: pd.DataFrame,
692801
):
693802

694803
splits_folder = args.WORKDIR.joinpath('data_out', 'splits')
695804
split_type = args.SPLIT_TYPE
696805
ratio = (8,1,1)
697-
stratify_by = None
806+
stratify_by = args.BALANCE_BY
807+
if stratify_by is not None:
808+
balance = True
809+
quantiles = False
810+
num_classes = 4
811+
else:
812+
balance = False
813+
quantiles = True
814+
num_classes = 4
698815
if args.RANDOM_SEEDS is not None:
699816
random_seeds = args.RANDOM_SEEDS
700817
else:
@@ -743,6 +860,9 @@ def split_data_sets(
743860
split_type=split_type,
744861
ratio=ratio,
745862
stratify_by=stratify_by,
863+
balance=balance,
864+
quantiles=quantiles,
865+
num_classes=num_classes,
746866
random_state=random_seeds[i]
747867
)
748868
train_keys = (
@@ -869,7 +989,7 @@ def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
869989
for data_set in data_sets:
870990
if data_sets[data_set].experiments is not None:
871991
if (
872-
data_type in ['transcriptomics', 'copy_number'] and
992+
data_type in ['transcriptomics', 'copy_number', 'proteomics'] and
873993
getattr(data_sets[data_set], data_type, None) is not None
874994
):
875995
dfs_to_merge.append(

0 commit comments

Comments
 (0)