From 642a214c06e8f254d4f9edb1012e2d8344b00231 Mon Sep 17 00:00:00 2001 From: leogail Date: Wed, 31 Jul 2024 10:46:29 +0200 Subject: [PATCH] feat(text-clustering): add a route for lodex --- services/text-clustering/v1/noise-lodex.ini | 61 +++++++++++++++++++++ services/text-clustering/v1/noise.py | 16 +----- 2 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 services/text-clustering/v1/noise-lodex.ini diff --git a/services/text-clustering/v1/noise-lodex.ini b/services/text-clustering/v1/noise-lodex.ini new file mode 100644 index 00000000..e3f2d275 --- /dev/null +++ b/services/text-clustering/v1/noise-lodex.ini @@ -0,0 +1,61 @@ +# Entrypoint output format +mimeType = application/json + +# OpenAPI Documentation - JSON format (dot notation) +post.operationId = post-v1-noise-lodex +post.summary = Dans un corpus, retourne la liste des identifiants des documents considérés comme du bruit. +post.description = Utilise l'algorithme de *clustering* et retourne uniquement la liste des identifiants considérés comme du bruit par [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). Route à utiliser dans Lodex +post.tags.0 = clustering +post.requestBody.content.application/x-gzip.schema.type = string +post.requestBody.content.application/x-gzip.schema.format = binary +post.requestBody.required = true +post.responses.default.description = Informations permettant de récupérer les données le moment venu +post.parameters.0.description = Indenter le JSON résultant +post.parameters.0.in = query +post.parameters.0.name = indent +post.parameters.0.schema.type = boolean +post.parameters.1.description = URL pour signaler que le traitement est terminé +post.parameters.1.in = header +post.parameters.1.name = X-Webhook-Success +post.parameters.1.schema.type = string +post.parameters.1.schema.format = uri +post.parameters.1.required = false +post.parameters.2.description = URL pour signaler que le traitement a échoué +post.parameters.2.in = header +post.parameters.2.name = X-Webhook-Failure +post.parameters.2.schema.type = string +post.parameters.2.schema.format = uri +post.parameters.2.required = false +#' + +[env] +path = generator +value = noise-lodex + +[use] +plugin = basics +plugin = analytics +plugin = spawn + +# Step 1 (générique): Charger le fichier corpus +[delegate] +file = charger.cfg + +# Step 2 (générique): Traiter de manière asynchrone les items reçus +[fork] +standalone = true +logger = logger.cfg + +# Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus +[fork/exec] +# command should be executable ! +command = ./v1/noise.py + +# Step 2.2 (générique): Enregistrer le résultat et signaler que le traitement est fini +[fork/delegate] +file = recorder.cfg + +# Step 3 : Renvoyer immédiatement un seul élément indiquant comment récupérer le résultat quand il sera prêt +[delegate] +file = recipient.cfg + diff --git a/services/text-clustering/v1/noise.py b/services/text-clustering/v1/noise.py index 8593f68e..f723a614 100755 --- a/services/text-clustering/v1/noise.py +++ b/services/text-clustering/v1/noise.py @@ -6,8 +6,6 @@ from sentence_transformers import SentenceTransformer from sklearn.preprocessing import StandardScaler from sklearn.metrics.pairwise import cosine_distances -# Two hdbscan aglos : normal and from sklearn -# import hdbscan from sklearn.cluster import HDBSCAN # from prometheus_client import CollectorRegistry, Counter, push_to_gateway @@ -78,18 +76,6 @@ def center_reduce(matrix): cosine_dist_matrix = cosine_distances(embeddings, embeddings) -## HDBSCAN with hdbscan library -# clusterer = hdbscan.HDBSCAN(algorithm='best', -# prediction_data=True, -# approx_min_span_tree=True, -# gen_min_span_tree=True, -# min_cluster_size=int(max(10,len_data/50)), -# cluster_selection_epsilon = 0.02, -# min_samples=1, -# p=None, -# metric='precomputed', -# cluster_selection_method='eom') - # HDBSCAN with scikit-learn clusterer = HDBSCAN( algorithm='auto', @@ -118,7 +104,7 @@ def center_reduce(matrix): # Write all corpus in once if len(output)==0: - sys.stdout.write(json.dumps({"value":"No noise in your datas"})) + sys.stdout.write(json.dumps({"id":"n/a","value":""})) sys.stdout.write("\n") else : for line in output: