Skip to content

Commit

Permalink
feat(text-clustering): add a route for lodex
Browse files Browse the repository at this point in the history
  • Loading branch information
leogail committed Jul 31, 2024
1 parent 2378807 commit 642a214
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 15 deletions.
61 changes: 61 additions & 0 deletions services/text-clustering/v1/noise-lodex.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# ezs pipeline configuration for the POST /v1/noise-lodex route:
# runs the clustering script and returns only the ids HDBSCAN flags as noise,
# in a shape suitable for Lodex.

# Entrypoint output format
mimeType = application/json

# OpenAPI Documentation - JSON format (dot notation)
# NOTE: the summary/description/parameter values below are user-facing strings
# served verbatim in the generated OpenAPI document — do not translate them.
post.operationId = post-v1-noise-lodex
post.summary = Dans un corpus, retourne la liste des identifiants des documents considérés comme du bruit.
post.description = Utilise l'algorithme de *clustering* et retourne uniquement la liste des identifiants considérés comme du bruit par [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html). Route à utiliser dans Lodex
post.tags.0 = clustering
post.requestBody.content.application/x-gzip.schema.type = string
post.requestBody.content.application/x-gzip.schema.format = binary
post.requestBody.required = true
post.responses.default.description = Informations permettant de récupérer les données le moment venu
post.parameters.0.description = Indenter le JSON résultant
post.parameters.0.in = query
post.parameters.0.name = indent
post.parameters.0.schema.type = boolean
post.parameters.1.description = URL pour signaler que le traitement est terminé
post.parameters.1.in = header
post.parameters.1.name = X-Webhook-Success
post.parameters.1.schema.type = string
post.parameters.1.schema.format = uri
post.parameters.1.required = false
post.parameters.2.description = URL pour signaler que le traitement a échoué
post.parameters.2.in = header
post.parameters.2.name = X-Webhook-Failure
post.parameters.2.schema.type = string
post.parameters.2.schema.format = uri
post.parameters.2.required = false
#'

# Environment: name of the generator this instance exposes.
[env]
path = generator
value = noise-lodex

[use]
plugin = basics
plugin = analytics
plugin = spawn

# Step 1 (generic): load the corpus file
[delegate]
file = charger.cfg

# Step 2 (generic): process the received items asynchronously
[fork]
standalone = true
logger = logger.cfg

# Step 2.1 (specific): run a computation on all received items
[fork/exec]
# command should be executable !
command = ./v1/noise.py

# Step 2.2 (generic): record the result and signal that processing is finished
[fork/delegate]
file = recorder.cfg

# Step 3: immediately return a single element describing how to retrieve the result once it is ready
[delegate]
file = recipient.cfg

16 changes: 1 addition & 15 deletions services/text-clustering/v1/noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_distances
# Two hdbscan aglos : normal and from sklearn
# import hdbscan
from sklearn.cluster import HDBSCAN

# from prometheus_client import CollectorRegistry, Counter, push_to_gateway
Expand Down Expand Up @@ -78,18 +76,6 @@ def center_reduce(matrix):
cosine_dist_matrix = cosine_distances(embeddings, embeddings)


## HDBSCAN with hdbscan library
# clusterer = hdbscan.HDBSCAN(algorithm='best',
# prediction_data=True,
# approx_min_span_tree=True,
# gen_min_span_tree=True,
# min_cluster_size=int(max(10,len_data/50)),
# cluster_selection_epsilon = 0.02,
# min_samples=1,
# p=None,
# metric='precomputed',
# cluster_selection_method='eom')

# HDBSCAN with scikit-learn
clusterer = HDBSCAN(
algorithm='auto',
Expand Down Expand Up @@ -118,7 +104,7 @@ def center_reduce(matrix):

# Write all corpus in once
if len(output)==0:
sys.stdout.write(json.dumps({"value":"No noise in your datas"}))
sys.stdout.write(json.dumps({"id":"n/a","value":""}))
sys.stdout.write("\n")
else :
for line in output:
Expand Down

0 comments on commit 642a214

Please sign in to comment.