Merge pull request #49 from ncats/development

Development
ncats · Nov 26, 2021 · e1604b1
2 parents 9440263 + 65272cb
commit e1604b1
Showing 8 changed files with 37 additions and 15 deletions.
diff --git a/server/app.py b/server/app.py
@@ -193,7 +193,7 @@ def predict_df(df, smi_column_name, models):
 
         pred_df = predictor.get_predictions()
         if data_path != '':
-            predictor.record_predictions(f'{data_path}predictions.csv')
+            predictor.record_predictions(f'{data_path}/predictions.csv')
         pred_df = working_df.join(pred_df)
         pred_df.drop(['mols', 'kekule_smiles'], axis=1, inplace=True)
 

diff --git a/server/default.profraw b/server/default.profraw
diff --git a/server/predictors/base/gcnn.py b/server/predictors/base/gcnn.py
@@ -81,6 +81,7 @@ def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
             dt = datetime.datetime.now(timezone.utc)
             utc_time = dt.replace(tzinfo=timezone.utc)
             utc_timestamp = utc_time.timestamp()
+
             self.raw_predictions_df = self.raw_predictions_df.append(
                 pd.DataFrame(
                     { 'SMILES': self.smiles, 'model': self.model_name, 'prediction': predictions, 'timestamp': utc_timestamp }
@@ -92,5 +93,5 @@ def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
         if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
             self.model_errors.append('graph convolutional neural network')
             self.has_errors = True
-        
-        return predictions, labels
+
+        return predictions, labels
diff --git a/server/predictors/cyp450/cyp450_predictor.py b/server/predictors/cyp450/cyp450_predictor.py
@@ -184,7 +184,7 @@ def get_predictions(self):
             pool.join()
 
         end = time.time()
-        print(f'{end - start} seconds to CYP450 predict {len(self.predictions_df.index)} molecules')
+        print(f'CYP450: {end - start} seconds to predict {len(self.predictions_df.index)} molecules')
 
         return self.predictions_df
 

diff --git a/server/predictors/liver_cytosol/lc_predictor.py b/server/predictors/liver_cytosol/lc_predictor.py
@@ -65,7 +65,6 @@ def __init__(self, kekule_smiles: array = None, morgan_fp: array = None, smiles:
         self.predictions_df = pd.DataFrame(columns=columns)
         self.raw_predictions_df = pd.DataFrame()
 
-        print('Generating Morgan FPs...')
         desc_gen = DescriptorGen()
         kekule_smiles_df['desc'] = kekule_smiles_df['kekule_smiles'].apply(desc_gen.from_smiles)
         self.morgan_fp = np.stack(kekule_smiles_df.desc)
@@ -84,15 +83,30 @@ def get_predictions(self):
             pred_probs = model.predict_proba(features).T[1]
             self.raw_predictions_df[model_name] =  pred_probs
 
-        end = time.time()
-        print(f'{end - start} seconds to HLC predict {len(self.raw_predictions_df.index)} molecules')
-
         self.raw_predictions_df['average'] = self.raw_predictions_df.mean(axis=1)
         avg_pred_probs = self.raw_predictions_df['average'].tolist()
         #self.predictions_df['Predicted Class (Probability)'] = pd.Series(pd.Series(avg_pred_probs).round().astype(int).astype(str) + ' (' + pd.Series(avg_pred_probs).round(2).astype(str) + ')')
         self.predictions_df['Predicted Class (Probability)'] = pd.Series(pd.Series(avg_pred_probs).round().astype(int).astype(str) + ' (' + pd.Series(np.where(np.asarray(avg_pred_probs)>=0.5, np.asarray(avg_pred_probs), (1-np.asarray(avg_pred_probs)))).round(2).astype(str) + ')')
         self.predictions_df['Prediction'] = pd.Series(pd.Series(np.where(np.asarray(avg_pred_probs)>=0.5, 'unstable', 'stable')))
 
+        # emptying the raw df
+        self.raw_predictions_df = pd.DataFrame(None)
+
+        # populate raw df for recording preds
+        if self.smiles is not None:
+            dt = datetime.datetime.now(timezone.utc)
+            utc_time = dt.replace(tzinfo=timezone.utc)
+            utc_timestamp = utc_time.timestamp()
+            self.raw_predictions_df = self.raw_predictions_df.append(
+                pd.DataFrame(
+                    { 'SMILES': self.smiles, 'model': 'hlc', 'prediction': avg_pred_probs, 'timestamp': utc_timestamp }
+                ),
+                ignore_index = True
+            )
+
+        end = time.time()
+        print(f'HLC: {end - start} seconds to predict {len(self.raw_predictions_df.index)} molecules')
+
         return self.predictions_df
 
     def _error_callback(self, error):
@@ -105,3 +119,10 @@ def get_errors(self):
 
     def columns_dict(self):
         return self._columns_dict.copy()
+
+    def record_predictions(self, file_path):
+        if len(self.raw_predictions_df.index) > 0:
+            with open(file_path, 'a') as fw:
+                rows = self.raw_predictions_df.values.tolist()
+                cw = csv.writer(fw)
+                cw.writerows(rows)
diff --git a/server/predictors/pampa50/pampa_predictor.py b/server/predictors/pampa50/pampa_predictor.py
@@ -53,7 +53,7 @@ def __init__(self, kekule_smiles: array = None, smiles: array = None):
         }
 
         self.model_name = 'pampa50'
-        
+
     def get_predictions(self) -> DataFrame:
         """
         Function that calculates consensus predictions
@@ -67,10 +67,10 @@ def get_predictions(self) -> DataFrame:
             start = time.time()
             gcnn_predictions, gcnn_labels = self.gcnn_predict(pampa_gcnn_model, pampa_gcnn_scaler)
             end = time.time()
-            print(f'{end - start} seconds to PAMPA predict {len(self.predictions_df.index)} molecules')
+            print(f'PAMPA 5.0: {end - start} seconds to predict {len(self.predictions_df.index)} molecules')
 
             self.predictions_df['Prediction'] = pd.Series(
                 pd.Series(np.where(gcnn_predictions>=0.5, 'low permeability', 'moderate or high permeability'))
             )
-            
+
         return self.predictions_df
diff --git a/server/predictors/rlm/rlm_predictor.py b/server/predictors/rlm/rlm_predictor.py
@@ -67,7 +67,7 @@ def get_predictions(self) -> DataFrame:
             start = time.time()
             gcnn_predictions, gcnn_labels = self.gcnn_predict(rlm_gcnn_model, rlm_gcnn_scaler)
             end = time.time()
-            print(f'{end - start} seconds to RLM predict {len(self.predictions_df.index)} molecules')
+            print(f'RLM: {end - start} seconds to predict {len(self.predictions_df.index)} molecules')
 
             self.predictions_df['Prediction'] = pd.Series(
                 pd.Series(np.where(gcnn_predictions>=0.5, 'unstable', 'stable'))

diff --git a/server/predictors/solubility/solubility_predictor.py b/server/predictors/solubility/solubility_predictor.py
@@ -53,7 +53,7 @@ def __init__(self, kekule_smiles: array = None, smiles: array = None):
         }
 
         self.model_name = 'solubility'
-        
+
     def get_predictions(self) -> DataFrame:
         """
         Function that calculates consensus predictions
@@ -67,10 +67,10 @@ def get_predictions(self) -> DataFrame:
             start = time.time()
             gcnn_predictions, gcnn_labels = self.gcnn_predict(solubility_gcnn_model, solubility_gcnn_scaler)
             end = time.time()
-            print(f'{end - start} seconds to Solubility predict {len(self.predictions_df.index)} molecules')
+            print(f'Solubility: {end - start} seconds to predict {len(self.predictions_df.index)} molecules')
 
             self.predictions_df['Prediction'] = pd.Series(
                 pd.Series(np.where(gcnn_predictions>=0.5, 'low solubility', 'high solubility'))
             )
-            
+
         return self.predictions_df