Skip to content

Commit

Permalink
feat(text-clustering): add a route for lodex
Browse files Browse the repository at this point in the history
  • Loading branch information
leogail committed Jul 31, 2024
1 parent 2378807 commit 642a214
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 15 deletions.
61 changes: 61 additions & 0 deletions services/text-clustering/v1/noise-lodex.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# ezs pipeline configuration for the POST /v1/noise-lodex route:
# runs the clustering script and returns only the ids HDBSCAN flags as noise,
# in a shape suitable for Lodex.

# Entrypoint output format
mimeType = application/json

# OpenAPI Documentation - JSON format (dot notation)
# NOTE: the summary/description/parameter values below are user-facing strings
# served verbatim in the generated OpenAPI document — do not translate them.
post.operationId = post-v1-noise-lodex
post.summary = Dans un corpus, retourne la liste des identifiants des documents considérés comme du bruit.
post.description = Utilise l'algorithme de *clustering* et retourne uniquement la liste des identifiants considérés comme du bruit par [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). Route à utiliser dans Lodex
post.tags.0 = clustering
post.requestBody.content.application/x-gzip.schema.type = string
post.requestBody.content.application/x-gzip.schema.format = binary
post.requestBody.required = true
post.responses.default.description = Informations permettant de récupérer les données le moment venu
post.parameters.0.description = Indenter le JSON résultant
post.parameters.0.in = query
post.parameters.0.name = indent
post.parameters.0.schema.type = boolean
post.parameters.1.description = URL pour signaler que le traitement est terminé
post.parameters.1.in = header
post.parameters.1.name = X-Webhook-Success
post.parameters.1.schema.type = string
post.parameters.1.schema.format = uri
post.parameters.1.required = false
post.parameters.2.description = URL pour signaler que le traitement a échoué
post.parameters.2.in = header
post.parameters.2.name = X-Webhook-Failure
post.parameters.2.schema.type = string
post.parameters.2.schema.format = uri
post.parameters.2.required = false
#'

# Environment: name of the generator this instance exposes.
[env]
path = generator
value = noise-lodex

[use]
plugin = basics
plugin = analytics
plugin = spawn

# Step 1 (generic): load the corpus file
[delegate]
file = charger.cfg

# Step 2 (generic): process the received items asynchronously
[fork]
standalone = true
logger = logger.cfg

# Step 2.1 (specific): run a computation on all received items
[fork/exec]
# command should be executable !
command = ./v1/noise.py

# Step 2.2 (generic): record the result and signal that processing is finished
[fork/delegate]
file = recorder.cfg

# Step 3: immediately return a single element describing how to retrieve the result once it is ready
[delegate]
file = recipient.cfg

16 changes: 1 addition & 15 deletions services/text-clustering/v1/noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances
# Two hdbscan aglos : normal and from sklearn
# import hdbscan
from sklearn.cluster import HDBSCAN

# from prometheus_client import CollectorRegistry, Counter, push_to_gateway
Expand Down Expand Up @@ -78,18 +76,6 @@ def center_reduce(matrix):
cosine_dist_matrix = cosine_distances(embeddings, embeddings)


## HDBSCAN with hdbscan library
# clusterer = hdbscan.HDBSCAN(algorithm='best',
# prediction_data=True,
# approx_min_span_tree=True,
# gen_min_span_tree=True,
# min_cluster_size=int(max(10,len_data/50)),
# cluster_selection_epsilon = 0.02,
# min_samples=1,
# p=None,
# metric='precomputed',
# cluster_selection_method='eom')

# HDBSCAN with scikit-learn
clusterer = HDBSCAN(
algorithm='auto',
Expand Down Expand Up @@ -118,7 +104,7 @@ def center_reduce(matrix):

# Write all corpus in once
if len(output)==0:
sys.stdout.write(json.dumps({"value":"No noise in your datas"}))
sys.stdout.write(json.dumps({"id":"n/a","value":""}))
sys.stdout.write("\n")
else :
for line in output:
Expand Down

0 comments on commit 642a214

Please sign in to comment.