Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add embedding evaluation model #201

Merged
merged 38 commits into from
Feb 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
bc44eb9
move functioN
florian-huber Feb 13, 2024
fee53ae
move functioN
florian-huber Feb 13, 2024
ab85e2f
add first embedding evaluator
florian-huber Feb 13, 2024
a8ae725
fix
florian-huber Feb 13, 2024
5ba4407
move function
florian-huber Feb 14, 2024
7ead788
add new parameters to settings
florian-huber Feb 14, 2024
391662e
switch to InceptionTime model
florian-huber Feb 14, 2024
e23b2a0
add data generator for evaluator training
florian-huber Feb 14, 2024
3146334
fix
florian-huber Feb 14, 2024
55b51ae
add missing imports
florian-huber Feb 14, 2024
29e8419
add missing imports
florian-huber Feb 14, 2024
050df06
linting & fixes
florian-huber Feb 14, 2024
65fa41e
add tests
florian-huber Feb 14, 2024
c888173
add tests
florian-huber Feb 14, 2024
68aeb9b
minor edits
florian-huber Feb 14, 2024
d514d3e
cosmetic changes
florian-huber Feb 15, 2024
126bad0
add linear model
florian-huber Feb 15, 2024
e975e9d
add tests
florian-huber Feb 15, 2024
97812f0
small updates
florian-huber Feb 15, 2024
f90f27e
add MS2DeepScore variant
florian-huber Feb 15, 2024
312b315
add scikit learn
florian-huber Feb 15, 2024
a4eb440
fix test
florian-huber Feb 15, 2024
438ce1e
add tests and linting
florian-huber Feb 15, 2024
825c734
linting
florian-huber Feb 15, 2024
607e364
update test
florian-huber Feb 15, 2024
27b62b3
fixes
florian-huber Feb 16, 2024
7caa61e
edits and fixes
florian-huber Feb 16, 2024
ffc9540
add tests
florian-huber Feb 16, 2024
f3825d3
linting
florian-huber Feb 16, 2024
408e8f0
fix
florian-huber Feb 16, 2024
17db430
linting and more tests
florian-huber Feb 16, 2024
6af0193
add documentation
florian-huber Feb 19, 2024
d97b7c9
add type hints
florian-huber Feb 19, 2024
a7407ec
expand documentation and use InceptionTime class as base class
florian-huber Feb 19, 2024
74efa9a
linting
florian-huber Feb 20, 2024
9475d5b
linting
florian-huber Feb 20, 2024
dfd1ce3
linting
florian-huber Feb 20, 2024
44a3a21
speed up model training tets
florian-huber Feb 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions ms2deepscore/MS2DeepScore.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class MS2DeepScore(BaseSimilarity):
queries = load_from_json("xyz.json")

# Load pretrained model
model = load_model("model_file_123.hdf5")
model = load_model("model_file_123.pt")

similarity_measure = MS2DeepScore(model)
# Calculate scores and get matchms.Scores object
Expand All @@ -45,8 +45,7 @@ def __init__(self, model: SiameseSpectralModel, progress_bar: bool = True):
----------
model:
Expected input is a SiameseModel that has been trained on
the desired set of spectra. The model contains the keras deep neural
network (model.model) as well as the used spectrum binner (model.spectrum_binner).
the desired set of spectra.
progress_bar:
Set to True to monitor the embedding creating with a progress bar.
Default is False.
Expand Down Expand Up @@ -102,13 +101,13 @@ def matrix(self, references: List[Spectrum], queries: List[Spectrum],
ms2ds_similarity
Array of MS2DeepScore similarity scores.
"""
embedding_reference = self.get_embedding_array(references)
embeddings_reference = self.get_embedding_array(references)
if is_symmetric:
assert np.all(references == queries), \
"Expected references to be equal to queries for is_symmetric=True"
query_embeddings = embedding_reference
embeddings_query = embeddings_reference
else:
query_embeddings = self.get_embedding_array(queries)
embeddings_query = self.get_embedding_array(queries)

ms2ds_similarity = cosine_similarity_matrix(embedding_reference, query_embeddings)
ms2ds_similarity = cosine_similarity_matrix(embeddings_reference, embeddings_query)
return ms2ds_similarity
149 changes: 149 additions & 0 deletions ms2deepscore/MS2DeepScoreEvaluated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
from typing import List
import numpy as np
from matchms import Spectrum
from matchms.similarity.BaseSimilarity import BaseSimilarity
from ms2deepscore.models.SiameseSpectralModel import (SiameseSpectralModel,
compute_embedding_array)
from ms2deepscore.models.EmbeddingEvaluatorModel import compute_error_predictions
from .vector_operations import cosine_similarity, cosine_similarity_matrix


class MS2DeepScoreEvaluated(BaseSimilarity):
    """Calculate MS2DeepScore similarities plus a predicted error per score.

    Using a trained model, binned spectrums will be converted into spectrum
    vectors using a deep neural network. The MS2DeepScore similarity is then
    the cosine similarity score between two spectrum vectors. In addition,
    an embedding evaluator estimates an error for each embedding, and a score
    evaluator turns the two per-embedding estimates of a pair into the
    ``predicted_absolute_error`` that is returned next to the score.

    Example code to calculate MS2DeepScore similarities between query and reference
    spectrums:

    .. code-block:: python

        from matchms import calculate_scores
        from matchms.importing import load_from_json
        from ms2deepscore import MS2DeepScoreEvaluated
        from ms2deepscore.models import load_model, load_linear_model

        # Import data
        references = load_from_json("abc.json")
        queries = load_from_json("xyz.json")

        # Load pretrained model
        model = load_model("model_file_123.pt")
        embedding_evaluator = load_model("embedding_evaluator_123.pt")
        score_evaluator = load_linear_model("score_evaluator_123.json")

        similarity_measure = MS2DeepScoreEvaluated(model, embedding_evaluator, score_evaluator)
        # Calculate scores and get matchms.Scores object
        scores = calculate_scores(references, queries, similarity_measure)

    """
    # Set output data type, e.g. ("score", "float") or [("score", "float"), ("matches", "int")]
    score_datatype = [("score", np.float32), ("predicted_absolute_error", np.float32)]

    def __init__(self, model: SiameseSpectralModel,
                 embedding_evaluator,
                 score_evaluator,
                 progress_bar: bool = True):
        """

        Parameters
        ----------
        model:
            Expected input is a SiameseModel that has been trained on
            the desired set of spectra.
        embedding_evaluator:
            Model trained on predicting the score quality (in form of MSE) based on an embedding.
        score_evaluator:
            Model that combines the error estimates of two embeddings into a
            predicted absolute error of their score. It must offer a
            ``predict`` method (used by :meth:`pair`) and is handed to
            ``compute_error_predictions`` (used by :meth:`matrix`).
        progress_bar:
            Set to True to monitor the embedding creating with a progress bar.
            Default is True.
            NOTE(review): the flag is only stored on the instance here; it is
            not forwarded to ``compute_embedding_array`` by this class.
        """
        self.model = model
        # Both networks are only used for inference, so switch them to eval mode.
        self.model.eval()
        self.embedding_evaluator = embedding_evaluator
        self.embedding_evaluator.eval()
        self.score_evaluator = score_evaluator
        self.output_vector_dim = self.model.model_settings.embedding_dim
        self.progress_bar = progress_bar

    def get_embedding_array(self, spectrums, datatype="numpy"):
        """Compute the embeddings of the given spectrums with ``self.model``.

        ``datatype`` is passed on unchanged to ``compute_embedding_array``
        ("numpy" and "pytorch" are the values used within this class).
        """
        return compute_embedding_array(self.model, spectrums, datatype)

    def get_embedding_evaluations(self, embeddings):
        """Compute the RMSE predicted by the embedding evaluator.

        The evaluator output is interpreted as a predicted MSE. Negative
        predictions are set to zero (in place) before taking the square root,
        so the result never contains NaN caused by negative inputs.
        """
        predicted_mse = self.embedding_evaluator(embeddings)
        predicted_mse[predicted_mse < 0] = 0
        return predicted_mse ** 0.5

    def get_score_evaluations(self, predicted_mse1, predicted_mse2):
        """Combine per-embedding error estimates into per-score error predictions.

        Thin wrapper that calls ``compute_error_predictions`` with
        ``self.score_evaluator``.
        """
        return compute_error_predictions(predicted_mse1, predicted_mse2, self.score_evaluator)

    def _embedding_rmse_as_numpy(self, embeddings):
        """Return the predicted RMSE of ``embeddings`` as a numpy array."""
        # The evaluator is fed one channel per embedding: (n, 1, embedding_dim).
        evaluator_input = embeddings.reshape(-1, 1, self.output_vector_dim)
        return self.get_embedding_evaluations(evaluator_input).detach().numpy()

    def pair(self, reference: Spectrum, query: Spectrum) -> np.ndarray:
        """Calculate the MS2DeepScore similarity between a reference and a query spectrum.

        Parameters
        ----------
        reference:
            Reference spectrum.
        query:
            Query spectrum.

        Returns
        -------
        ms2ds_similarity
            Structured numpy value of dtype ``score_datatype`` holding the
            MS2DeepScore similarity ("score") and its
            "predicted_absolute_error".
        """
        embedding_reference = self.get_embedding_array([reference], datatype="pytorch")
        embedding_query = self.get_embedding_array([query], datatype="pytorch")

        reference_rmse = self._embedding_rmse_as_numpy(embedding_reference)
        query_rmse = self._embedding_rmse_as_numpy(embedding_query)

        score = cosine_similarity(embedding_reference[0, :].detach().numpy(),
                                  embedding_query[0, :].detach().numpy())
        prediction = self.score_evaluator.predict([[reference_rmse[0][0], query_rmse[0][0]]])
        # `predict` returns an array for the single input row. Take the scalar
        # explicitly instead of relying on implicit array-to-scalar conversion
        # when filling the structured value below.
        score_predicted_ae = np.ravel(prediction)[0]
        return np.asarray((score, score_predicted_ae),
                          dtype=self.score_datatype)

    def matrix(self, references: List[Spectrum], queries: List[Spectrum],
               array_type: str = "numpy",
               is_symmetric: bool = False) -> np.ndarray:
        """Calculate the MS2DeepScore similarities between all references and queries.

        Parameters
        ----------
        references:
            Reference spectrums.
        queries:
            Query spectrums.
        array_type
            Specify the output array type. Can be "numpy" or "sparse".
            Currently, only "numpy" is supported and will return a numpy array
            (the argument is not inspected by this method).
            Future versions will include "sparse" as option to return a COO-sparse array.
        is_symmetric:
            Set to True if references == queries. In that case the reference
            embeddings are reused for the queries instead of being computed a
            second time. Default is False.

        Returns
        -------
        ms2ds_similarity
            Structured array of dtype ``score_datatype`` with one entry per
            (reference, query) pair, holding the similarity ("score") and the
            "predicted_absolute_error".
        """
        embeddings_reference = self.get_embedding_array(references, datatype="pytorch")
        if is_symmetric:
            assert np.all(references == queries), \
                "Expected references to be equal to queries for is_symmetric=True"
            embeddings_query = embeddings_reference
        else:
            embeddings_query = self.get_embedding_array(queries, datatype="pytorch")

        reference_rmse = self._embedding_rmse_as_numpy(embeddings_reference)
        query_rmse = self._embedding_rmse_as_numpy(embeddings_query)

        ms2ds_similarity = cosine_similarity_matrix(embeddings_reference.detach().numpy(),
                                                    embeddings_query.detach().numpy())
        ms2ds_uncertainty = self.get_score_evaluations(reference_rmse, query_rmse)

        similarities = np.empty((ms2ds_similarity.shape[0],
                                 ms2ds_similarity.shape[1]), dtype=self.score_datatype)
        similarities["score"] = ms2ds_similarity
        similarities["predicted_absolute_error"] = ms2ds_uncertainty
        return similarities
2 changes: 1 addition & 1 deletion ms2deepscore/MS2DeepScoreMonteCarlo.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class MS2DeepScoreMonteCarlo(BaseSimilarity):
# Set key characteristics as class attributes
is_commutative = True
# Set output data type, e.g. ("score", "float") or [("score", "float"), ("matches", "int")]
score_datatype = [("score", np.float64), ("lower_bound", np.float64), ("upper_bound", np.float64)]
score_datatype = [("score", np.float32), ("lower_bound", np.float32), ("upper_bound", np.float32)]

def __init__(self, model,
n_ensembles: int = 10,
Expand Down
6 changes: 6 additions & 0 deletions ms2deepscore/SettingsMS2Deepscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,12 @@ def __init__(self, **settings):
self.augment_noise_max = 10
self.augment_noise_intensity = 0.02

# Settings for embedding evaluator model
self.evaluator_distribution_size = 1000
self.evaluator_num_filters = 48
self.evaluator_depth = 3
self.evaluator_kernel_size = 20

if settings:
for key, value in settings.items():
if hasattr(self, key):
Expand Down
2 changes: 2 additions & 0 deletions ms2deepscore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from . import models
from .__version__ import __version__
from .MS2DeepScore import MS2DeepScore
from .MS2DeepScoreEvaluated import MS2DeepScoreEvaluated
from .MS2DeepScoreMonteCarlo import MS2DeepScoreMonteCarlo
from .SettingsMS2Deepscore import SettingsMS2Deepscore

Expand All @@ -13,6 +14,7 @@
"models",
"__version__",
"MS2DeepScore",
"MS2DeepScoreEvaluated",
"MS2DeepScoreMonteCarlo",
"SettingsMS2Deepscore",
]
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,7 @@
import numpy as np
from matchms.Spectrum import Spectrum
from tqdm import tqdm


def remove_diagonal(matrix):
    """Remove the main diagonal from a square matrix.

    Meant for removing matches of spectra against themselves.

    Parameters
    ----------
    matrix
        Square 2D array (or array-like, e.g. nested lists) of shape (n, n).

    Returns
    -------
    numpy.ndarray
        Array of shape (n, n - 1) in which row ``i`` holds the entries of the
        input row ``i`` without element ``(i, i)``, in their original order.
        An empty (0, 0) input yields an empty (0, 0) output.

    Raises
    ------
    ValueError
        If the input is not two-dimensional or not square.
    """
    matrix = np.asanyarray(matrix)
    # Check the rank first so that non-2D input gets the same clear error as
    # non-square input, rather than a tuple-unpacking failure.
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError("Expected predictions against itself")
    nr_of_rows = matrix.shape[0]

    # Create a mask for the diagonal elements
    diagonal_mask = np.eye(nr_of_rows, dtype=bool)
    # Boolean indexing flattens row by row, so each row contributes exactly
    # n - 1 values; max() keeps the column count valid for the 0x0 case.
    return matrix[~diagonal_mask].reshape(nr_of_rows, max(nr_of_rows - 1, 0))
from ms2deepscore.utils import remove_diagonal


def select_one_spectrum_per_inchikey(spectra):
Expand Down
Loading
Loading