Skip to content

Commit f07bbf9

Browse files
authored
Merge pull request #460 from PNNL-CompBio/liver_fix
Liver proteomics fix
2 parents 5cbe874 + fdab0ae commit f07bbf9

File tree

2 files changed

+66
-37
lines changed

2 files changed

+66
-37
lines changed

coderbuild/liver/02-omics-liver.py

Lines changed: 50 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -278,43 +278,60 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
278278

279279

280280
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
281-
282-
# read in data
283-
if isinstance(proteomics_data, pd.DataFrame) == False:
284-
proteomics_data = pd.read_csv(proteomics_data)
285-
286-
if isinstance(improve_id_data, pd.DataFrame) == False:
281+
if not isinstance(proteomics_data, pd.DataFrame):
282+
# header=1: take column names from the second line of the file; the first line is a comment and is skipped
283+
proteomics_data = pd.read_csv(proteomics_data, header=1, index_col=0)
284+
if not isinstance(improve_id_data, pd.DataFrame):
287285
improve_id_data = pd.read_csv(improve_id_data)
288-
289-
if isinstance(entrez_data, pd.DataFrame) == False:
286+
if not isinstance(entrez_data, pd.DataFrame):
290287
entrez_data = pd.read_csv(entrez_data)
291288

292-
# first, replace colnames with first row and delete first row
293-
proteomics_data.columns = proteomics_data.iloc[0,:]
294-
proteomics_data = proteomics_data.iloc[1:]
295-
296-
# melt the df so there is one sample and prot per row
297-
proteomics_data = proteomics_data.rename(columns = {proteomics_data.columns[0]:'gene_symbol'})
298-
long_prot_df = pd.melt(proteomics_data, id_vars=['gene_symbol'], value_vars=proteomics_data.columns[proteomics_data.columns != 'gene_symbol'])
299-
long_prot_df = long_prot_df.rename(columns = {0:'sample_name', 'value':'proteomics'})
289+
# Clean column names
290+
proteomics_data.columns = proteomics_data.columns.astype(str).str.strip()
291+
proteomics_data = proteomics_data.rename(columns={"Sample \nGene symbol": "gene_symbol"})
300292

301-
302-
# map gene names to entrez id's
303-
mapped_proteomics_df = pd.merge(long_prot_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "gene_symbol", right_on= "other_id")
304-
mapped_proteomics_df = mapped_proteomics_df.dropna(subset=['entrez_id'])
305-
306-
# mapping improve sample id'samples_df
307-
mapped_proteomics_df = pd.merge(mapped_proteomics_df, improve_id_data[['other_id','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "sample_name", right_on= "other_id")
308-
309-
# clean up column names and data types
310-
mapped_proteomics_df = mapped_proteomics_df.drop(columns=['gene_symbol','sample_name','other_id_x','other_id_y'])
311-
mapped_proteomics_df['source'] = "Synapse"
312-
mapped_proteomics_df['study'] = "liver"
313-
mapped_proteomics_df = mapped_proteomics_df.dropna()
314-
mapped_proteomics_df = mapped_proteomics_df.astype({'entrez_id':'int','improve_sample_id':'int'})
315-
mapped_proteomics_df = mapped_proteomics_df[['entrez_id','proteomics','improve_sample_id','source','study']]
316-
317-
return(mapped_proteomics_df)
293+
# Drop rows with a missing gene symbol, then make sure the remaining symbols are whitespace-stripped strings
294+
proteomics_data = proteomics_data.dropna(subset=["gene_symbol"])
295+
proteomics_data["gene_symbol"] = proteomics_data["gene_symbol"].astype(str).str.strip()
296+
297+
value_cols = [c for c in proteomics_data.columns if c != "gene_symbol"]
298+
long_prot_df = proteomics_data.melt(
299+
id_vars=["gene_symbol"],
300+
value_vars=value_cols,
301+
var_name="sample_name",
302+
value_name="proteomics"
303+
)
304+
305+
# Ensure both merge keys (gene_symbol and other_id) are whitespace-stripped strings before joining
306+
long_prot_df["gene_symbol"] = long_prot_df["gene_symbol"].astype(str).str.strip()
307+
entrez_data["other_id"] = entrez_data["other_id"].astype(str).str.strip()
308+
309+
# Two merges: gene_symbol -> entrez_id first, then sample_name -> improve_sample_id
310+
mapped_proteomics_df = pd.merge(
311+
long_prot_df,
312+
entrez_data[["other_id", "entrez_id"]].drop_duplicates(),
313+
how="inner",
314+
left_on="gene_symbol",
315+
right_on="other_id"
316+
)
317+
318+
improve_id_data["other_id"] = improve_id_data["other_id"].astype(str).str.strip()
319+
mapped_proteomics_df = pd.merge(
320+
mapped_proteomics_df,
321+
improve_id_data[["other_id", "improve_sample_id"]].drop_duplicates(),
322+
how="inner",
323+
left_on="sample_name",
324+
right_on="other_id"
325+
)
326+
327+
mapped_proteomics_df = mapped_proteomics_df.drop(columns=["other_id_x", "other_id_y", "gene_symbol"])
328+
mapped_proteomics_df["source"] = "Synapse"
329+
mapped_proteomics_df["study"] = "liver"
330+
mapped_proteomics_df = mapped_proteomics_df.dropna(subset=["entrez_id", "improve_sample_id"])
331+
mapped_proteomics_df = mapped_proteomics_df.astype({"entrez_id": "int", "improve_sample_id": "int"})
332+
mapped_proteomics_df = mapped_proteomics_df[["entrez_id", "proteomics", "improve_sample_id", "source", "study"]]
333+
334+
return mapped_proteomics_df
318335

319336

320337
if __name__ == "__main__":

coderbuild/utils/calc_pdx_metrics.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,6 @@ def ABC(contr_time=None, contr_volume=None, treat_time=None, treat_volume=None):
249249
return {"metric": "abc", "value": abc,'time':np.max(treat_time)}#, "control": con, "treatment": tre}
250250

251251

252-
###LMM CODE
253252
def lmm(time, volume, treatment, drug_name):
254253
"""
255254
Compute the linear mixed model (lmm) statistics for a PDX batch.
@@ -266,7 +265,16 @@ def lmm(time, volume, treatment, drug_name):
266265
'time':time,\
267266
'exp_type':treatment})
268267

269-
data = data.dropna()
268+
269+
data['log_volume'] = np.log(data['volume'])
270+
n_nonfinite_log = (~np.isfinite(data['log_volume'])).sum()
271+
272+
# categories
273+
data['exp_type'] = data['exp_type'].astype('category')
274+
data['exp_type'] = pd.Categorical(data['exp_type'],
275+
categories=['control', drug_name],
276+
ordered=True)
277+
270278

271279
##create data frame from these 4 vectors
272280
required_columns = ["model_id", "volume", "time", "exp_type"]
@@ -286,14 +294,18 @@ def lmm(time, volume, treatment, drug_name):
286294
#print(data['exp_type'].cat.categories)
287295
# Fit the model
288296
model = mixedlm(formula, data, groups=data['model_id'])
297+
298+
289299
fit = model.fit()
290300

301+
291302
# Get the coefficient for the interaction term 'time:exp_type'
292303
#interaction_term = 'time:exp_type'
293304
# if interaction_term in fit.params:
294305
# time_coef_value = fit.params['time']
295306
#print(fit.params)
296307
i_coef_value = fit.params['time:exp_type[T.'+drug_name+']']
308+
297309
#i_coef_value = fit.params['time:exp_type['+drug_name+']']
298310
# else:
299311
# coef_value = None # Handle the case when the interaction term is not present
@@ -384,9 +396,9 @@ def get_drug_stats(df, control='control'):
384396
else:
385397
singleres.append(treat_abc)
386398

387-
#llm
388399
comb = pd.concat([ctl_data, d_data])
389-
#print(comb)
400+
401+
390402
lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
391403
lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
392404
if '+' in d:

0 commit comments

Comments
 (0)