Skip to content

Commit 0140de5

Browse files
authored
Merge pull request #131 from molgenis/feat/update_libraries
feat: Updated libraries
2 parents 6e180a6 + 6594a1a commit 0140de5

File tree

6 files changed

+48
-21
lines changed

6 files changed

+48
-21
lines changed

scripts/balance_dataset.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -209,21 +209,22 @@ def split(dataset: pd.DataFrame):
209209
all_benign._is_copy = None
210210
v_benign_samples = all_benign.sample(frac=0.1, random_state=__random_state__)
211211
# A bit cryptic to remove the random samples from the benign dataset, but it works
212-
all_benign = all_benign.append(v_benign_samples)
212+
all_benign = pd.concat([all_benign, v_benign_samples], axis=0, ignore_index=True)
213213
all_benign.drop_duplicates(keep=False, inplace=True)
214-
return_dataset = return_dataset.append(all_benign, ignore_index=True)
215-
validation_dataset = validation_dataset.append(v_benign_samples, ignore_index=True)
214+
return_dataset = pd.concat([return_dataset, all_benign], axis=0, ignore_index=True)
215+
validation_dataset = pd.concat([validation_dataset, v_benign_samples], axis=0,
216+
ignore_index=True)
216217

217218
# Pathogenic
218219
all_pathogenic = dataset[dataset['binarized_label'] == 1]
219220
all_pathogenic._is_copy = None
220221
v_patho_samples = all_pathogenic.sample(frac=0.1, random_state=__random_state__)
221222
# Again a cryptic way to remove the randomly sampled pathogenic samples
222-
all_pathogenic = all_pathogenic.append(v_patho_samples)
223+
all_pathogenic = pd.concat([all_pathogenic, v_patho_samples], axis=0, ignore_index=True)
223224
all_pathogenic.drop_duplicates(keep=False, inplace=True)
224-
return_dataset = return_dataset.append(all_pathogenic, ignore_index=True)
225-
validation_dataset = validation_dataset.append(v_patho_samples, ignore_index=True)
226-
225+
return_dataset = pd.concat([return_dataset, all_pathogenic], axis=0, ignore_index=True)
226+
validation_dataset = pd.concat([validation_dataset, v_patho_samples], axis=0,
227+
ignore_index=True)
227228
return validation_dataset, return_dataset
228229

229230

@@ -247,7 +248,12 @@ def balance(self, dataset: pd.DataFrame):
247248
processed_consequence = self._process_consequence(
248249
pathogenic_dataset=selected_pathogenic, benign_dataset=selected_benign
249250
)
250-
return_dataset = return_dataset.append(processed_consequence)
251+
return_dataset = pd.concat(
252+
[
253+
return_dataset,
254+
processed_consequence
255+
], axis=0, ignore_index=True
256+
)
251257
return return_dataset
252258

253259
def _process_consequence(self, pathogenic_dataset, benign_dataset):
@@ -267,10 +273,13 @@ def _process_consequence(self, pathogenic_dataset, benign_dataset):
267273
lower_bound = bins[ind]
268274
upper_bound = bins[ind + 1]
269275
sample_number = pathogenic_histogram[ind]
270-
processed_bins = processed_bins.append(
271-
self._process_bins(
272-
pathogenic_dataset, benign_dataset, upper_bound, lower_bound, sample_number
273-
)
276+
processed_bins = pd.concat(
277+
[
278+
processed_bins,
279+
self._process_bins(
280+
pathogenic_dataset, benign_dataset, upper_bound, lower_bound, sample_number
281+
)
282+
], axis=0, ignore_index=True
274283
)
275284
return processed_bins
276285

@@ -295,7 +304,9 @@ def _process_bins(
295304
selected_benign.shape[0],
296305
random_state=__random_state__
297306
)
298-
return return_benign.append(return_pathogenic, ignore_index=True)
307+
return pd.concat(
308+
[return_benign, return_pathogenic], axis=0, ignore_index=True
309+
)
299310

300311
@staticmethod
301312
def _get_variants_within_range(dataset, upper_bound, lower_bound):

setup.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@
2929
],
3030
python_requires='>=3.8',
3131
install_requires=[
32-
'numpy==1.22.0',
33-
'pandas==1.3.5',
34-
'scipy==1.7.3',
35-
'scikit-learn==1.0.2',
36-
'xgboost==1.4.2'
32+
'numpy==1.23.2',
33+
'pandas==1.4.4',
34+
'scipy==1.9.1',
35+
'scikit-learn==1.1.2',
36+
'xgboost==1.6.2'
3737
],
3838
extras_require={
3939
'testing': [

src/molgenis/capice/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '4.0.0-rc1'
1+
__version__ = '4.0.0-rc2'

src/molgenis/capice/main_train.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,12 @@ def train(self, test_set: pd.DataFrame, train_set: pd.DataFrame):
179179
random_state=self.model_random_state,
180180
use_label_encoder=False
181181
)
182+
model_estimator.set_params(
183+
**{
184+
'eval_metric': ["auc"],
185+
'early_stopping_rounds': self.esr
186+
}
187+
)
182188
randomised_search_cv = RandomizedSearchCV(estimator=model_estimator,
183189
param_distributions=param_dist,
184190
scoring='roc_auc', n_jobs=8,
@@ -191,8 +197,6 @@ def train(self, test_set: pd.DataFrame, train_set: pd.DataFrame):
191197
self.log.info('Random search starting, please hold.')
192198
randomised_search_cv.fit(train_set[self.processed_features],
193199
train_set[TrainEnums.binarized_label.value],
194-
early_stopping_rounds=self.esr,
195-
eval_metric=["auc"],
196200
eval_set=eval_set,
197201
verbose=xgb_verbosity,
198202
sample_weight=train_set[TrainEnums.sample_weight.value])

tests/capice/test_main_train.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,18 @@ def test_integration_training(self):
5555
best_model = str(model.__class__).split("'")[1]
5656
self.assertEqual('xgboost.sklearn.XGBClassifier', best_model)
5757

58+
def test_params(self):
59+
"""
60+
Test to see if the >=1.6.2 XGBoost parameter settings are applied correctly to the model
61+
"""
62+
print('Test params')
63+
self.main.run()
64+
output_path = os.path.join(self.output_dir, self.output_filename)
65+
with open(output_path, 'rb') as model_dat:
66+
model = pickle.load(model_dat)
67+
self.assertEqual(model.get_params()['early_stopping_rounds'], 1)
68+
self.assertEqual(model.get_params()['eval_metric'], ['auc'])
69+
5870
def test_unit_split(self):
5971
"""
6072
Unit test to see if split works.
-334 KB
Binary file not shown.

0 commit comments

Comments
 (0)