Merge pull request #460 from PNNL-CompBio/liver_fix

sgosline · web-flow · commit f07bbf93deec · 2025-11-24T10:36:02.000-08:00
Liver proteomics fix
diff --git a/coderbuild/liver/02-omics-liver.py b/coderbuild/liver/02-omics-liver.py
@@ -278,43 +278,60 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
 
 
 def map_proteomics(proteomics_data, improve_id_data, entrez_data):
-
-    # read in data
-    if isinstance(proteomics_data, pd.DataFrame) == False:
-        proteomics_data = pd.read_csv(proteomics_data)
-
-    if isinstance(improve_id_data, pd.DataFrame) == False:
+    if not isinstance(proteomics_data, pd.DataFrame):
+        # header=1: the file's second line holds the column names and the first line (a comment) is skipped; index_col=0 uses the first column as the index
+        proteomics_data = pd.read_csv(proteomics_data, header=1, index_col=0)
+    if not isinstance(improve_id_data, pd.DataFrame):
         improve_id_data = pd.read_csv(improve_id_data)
-
-    if isinstance(entrez_data, pd.DataFrame) == False:
+    if not isinstance(entrez_data, pd.DataFrame):
         entrez_data = pd.read_csv(entrez_data)
 
-    # first, replace colnames with first row and delete first row
-    proteomics_data.columns = proteomics_data.iloc[0,:]
-    proteomics_data = proteomics_data.iloc[1:]
-
-    # melt the df so there is one sample and prot per row
-    proteomics_data = proteomics_data.rename(columns = {proteomics_data.columns[0]:'gene_symbol'})
-    long_prot_df = pd.melt(proteomics_data, id_vars=['gene_symbol'], value_vars=proteomics_data.columns[proteomics_data.columns != 'gene_symbol'])
-    long_prot_df = long_prot_df.rename(columns = {0:'sample_name', 'value':'proteomics'})
+    # Clean column names
+    proteomics_data.columns = proteomics_data.columns.astype(str).str.strip()
+    proteomics_data = proteomics_data.rename(columns={"Sample \nGene symbol": "gene_symbol"})
 
-
-    # map gene names to entrez id's
-    mapped_proteomics_df = pd.merge(long_prot_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "gene_symbol", right_on= "other_id")
-    mapped_proteomics_df = mapped_proteomics_df.dropna(subset=['entrez_id'])
-
-    # mapping improve sample id'samples_df
-    mapped_proteomics_df = pd.merge(mapped_proteomics_df, improve_id_data[['other_id','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "sample_name", right_on= "other_id")
-
-    # clean up column names and data types
-    mapped_proteomics_df = mapped_proteomics_df.drop(columns=['gene_symbol','sample_name','other_id_x','other_id_y'])
-    mapped_proteomics_df['source'] = "Synapse"
-    mapped_proteomics_df['study'] = "liver"
-    mapped_proteomics_df = mapped_proteomics_df.dropna()
-    mapped_proteomics_df = mapped_proteomics_df.astype({'entrez_id':'int','improve_sample_id':'int'})
-    mapped_proteomics_df = mapped_proteomics_df[['entrez_id','proteomics','improve_sample_id','source','study']]
-
-    return(mapped_proteomics_df)
+    # Drop rows with a missing gene symbol, then make sure the remaining symbols are whitespace-stripped strings
+    proteomics_data = proteomics_data.dropna(subset=["gene_symbol"])
+    proteomics_data["gene_symbol"] = proteomics_data["gene_symbol"].astype(str).str.strip()
+    
+    value_cols = [c for c in proteomics_data.columns if c != "gene_symbol"]
+    long_prot_df = proteomics_data.melt(
+        id_vars=["gene_symbol"],
+        value_vars=value_cols,
+        var_name="sample_name",
+        value_name="proteomics"
+    )
+
+    # Ensure both gene-symbol merge keys are whitespace-stripped strings before merging
+    long_prot_df["gene_symbol"] = long_prot_df["gene_symbol"].astype(str).str.strip()
+    entrez_data["other_id"] = entrez_data["other_id"].astype(str).str.strip()
+
+    # Two inner merges: gene symbols -> entrez ids, then sample names -> improve sample ids
+    mapped_proteomics_df = pd.merge(
+        long_prot_df,
+        entrez_data[["other_id", "entrez_id"]].drop_duplicates(),
+        how="inner",
+        left_on="gene_symbol",
+        right_on="other_id"
+    )
+
+    improve_id_data["other_id"] = improve_id_data["other_id"].astype(str).str.strip()
+    mapped_proteomics_df = pd.merge(
+        mapped_proteomics_df,
+        improve_id_data[["other_id", "improve_sample_id"]].drop_duplicates(),
+        how="inner",
+        left_on="sample_name",
+        right_on="other_id"
+    )
+
+    mapped_proteomics_df = mapped_proteomics_df.drop(columns=["other_id_x", "other_id_y", "gene_symbol"])
+    mapped_proteomics_df["source"] = "Synapse"
+    mapped_proteomics_df["study"] = "liver"
+    mapped_proteomics_df = mapped_proteomics_df.dropna(subset=["entrez_id", "improve_sample_id"])
+    mapped_proteomics_df = mapped_proteomics_df.astype({"entrez_id": "int", "improve_sample_id": "int"})
+    mapped_proteomics_df = mapped_proteomics_df[["entrez_id", "proteomics", "improve_sample_id", "source", "study"]]
+
+    return mapped_proteomics_df
 
 
 if __name__ == "__main__":
diff --git a/coderbuild/utils/calc_pdx_metrics.py b/coderbuild/utils/calc_pdx_metrics.py
@@ -249,7 +249,6 @@ def ABC(contr_time=None, contr_volume=None, treat_time=None, treat_volume=None):
     return {"metric": "abc", "value": abc,'time':np.max(treat_time)}#, "control": con, "treatment": tre}
 
 
-###LMM CODE
 def lmm(time, volume, treatment, drug_name):
     """
     Compute the linear mixed model (lmm) statistics for a PDX batch.
@@ -266,7 +265,16 @@ def lmm(time, volume, treatment, drug_name):
                          'time':time,\
                           'exp_type':treatment})
 
-    data = data.dropna()
+    
+    data['log_volume'] = np.log(data['volume'])
+    n_nonfinite_log = (~np.isfinite(data['log_volume'])).sum()
+
+    # Encode exp_type as an ordered categorical with levels ['control', drug_name]
+    data['exp_type'] = data['exp_type'].astype('category')
+    data['exp_type'] = pd.Categorical(data['exp_type'],
+                                    categories=['control', drug_name],
+                                    ordered=True)
+
                 
     ##create data frame from these 4 vectors
     required_columns = ["model_id", "volume", "time", "exp_type"]
@@ -286,14 +294,18 @@ def lmm(time, volume, treatment, drug_name):
     #print(data['exp_type'].cat.categories)
     # Fit the model
     model = mixedlm(formula, data, groups=data['model_id'])
+
+
     fit = model.fit()
     
+    
     # Get the coefficient for the interaction term 'time:exp_type'
     #interaction_term = 'time:exp_type'
 #    if interaction_term in fit.params:
 #    time_coef_value = fit.params['time']
     #print(fit.params)
     i_coef_value = fit.params['time:exp_type[T.'+drug_name+']']
+    
     #i_coef_value = fit.params['time:exp_type['+drug_name+']']
    # else:
    #     coef_value = None  # Handle the case when the interaction term is not present
@@ -384,9 +396,9 @@ def get_drug_stats(df, control='control'):
             else:
                 singleres.append(treat_abc)
 
-            #llm
             comb = pd.concat([ctl_data, d_data])
-            #print(comb)
+
+
             lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
             lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d: