diff --git a/services/data-computer/README.md b/services/data-computer/README.md
index b101ca0b..478e598c 100644
--- a/services/data-computer/README.md
+++ b/services/data-computer/README.md
@@ -1,10 +1,9 @@
-# ws-data-computer@2.12.6
+# ws-data-computer@2.13.1
 
 Le service `data-computer` offre plusieurs services **asynchrones** pour des calculs et de transformations de données simples.
 
 *Tous les services proposés acceptent uniquement en entrée des fichiers corpus standards au format tar.gz.*
 
-
 ## Utilisation
 
 - [v1/tree-segment](#v1%2ftree-segment)
@@ -256,3 +255,161 @@ cat input.tar.gz |curl --data-binary @- -H "X-Hook: https://webhook.site/dce2fe
 # When the corpus is processed, get the result
 cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz
+```
+
+
+### v1/corpus-similarity
+
+Compare des petits documents (Titre, phrases, petits *abstracts*) entre eux, et renvoie pour chaque document les documents qui lui sont similaires.
+Il est conseillé d'utiliser cette route avec au moins 6-7 documents dans le corpus.
+
+Il existe un paramètre optionnel `output` pour choisir le type de sortie en fonction de sa valeur :
+- 0 (par défaut) : l'algorithme choisit automatiquement les documents les plus similaires à chaque document
+- 1 : l'algorithme renvoie pour chaque document tous les documents, classés par ordre de proximité (les plus similaires en premier)
+- *n* (avec *n* un entier plus grand que 1) : l'algorithme renvoie pour chaque document les *n* documents les plus proches, classés par ordre de proximité (les plus similaires en premier), ainsi que le score de similarité associé à chaque document.
+Par exemple, en utilisant `example-similarity-json.tar.gz` avec le paramètre output par défaut (0), on obtiendra :
+
+> **Attention** : Le champ ID est utilisé comme référence de chaque document.
+
+Par exemple, en utilisant `example-similarity-json.tar.gz` avec le paramètre output par défaut (0), on obtiendra :
+
+```json
+[
+    {
+        "id": "Titre 1",
+        "value": {
+            "similarity": [
+                "Titre 4",
+                "Titre 2"
+            ],
+            "score": [
+                0.9411764705882353,
+                0.9349112426035503
+            ]
+        }
+    },
+    {
+        "id": "Titre 2",
+        "value": {
+            "similarity": [
+                "Titre 1"
+            ],
+            "score": [
+                0.9349112426035503
+            ]
+        }
+    },
+    {
+        "id": "Titre 3",
+        "value": {
+            "similarity": [
+                "Titre 4"
+            ],
+            "score": [
+                0.8888888888888888
+            ]
+        }
+    },
+    {
+        "id": "Titre 4",
+        "value": {
+            "similarity": [
+                "Titre 1"
+            ],
+            "score": [
+                0.9411764705882353
+            ]
+        }
+    }
+]
+```
+
+Avec le paramètre output=3, on obtiendra :
+
+```json
+[
+    {
+        "id": "Titre 1",
+        "value": {
+            "similarity": [
+                "Titre 4",
+                "Titre 2",
+                "Titre 3"
+            ],
+            "score": [
+                0.9411764705882353,
+                0.9349112426035503,
+                0.8757396449704142
+            ]
+        }
+    },
+    {
+        "id": "Titre 2",
+        "value": {
+            "similarity": [
+                "Titre 1",
+                "Titre 4",
+                "Titre 3"
+            ],
+            "score": [
+                0.9349112426035503,
+                0.8888888888888888,
+                0.8651685393258427
+            ]
+        }
+    },
+    {
+        "id": "Titre 3",
+        "value": {
+            "similarity": [
+                "Titre 4",
+                "Titre 1",
+                "Titre 2"
+            ],
+            "score": [
+                0.8888888888888888,
+                0.8757396449704142,
+                0.8651685393258427
+            ]
+        }
+    },
+    {
+        "id": "Titre 4",
+        "value": {
+            "similarity": [
+                "Titre 1",
+                "Titre 3",
+                "Titre 2"
+            ],
+            "score": [
+                0.9411764705882353,
+                0.8888888888888888,
+                0.8888888888888888
+            ]
+        }
+    }
+]
+```
+
+#### Paramètre(s) URL
+
+| nom                 | description                                 |
+| ------------------- | ------------------------------------------- |
+| indent (true/false) | Indenter le résultat renvoyé immédiatement  |
+| output (0,1,n)      | Choix de la sortie                          |
+
+#### Entête(s) HTTP
+
+| nom    | description                                                  |
+| ------ | ------------------------------------------------------------ |
+| X-Hook | URL à appeler quand le résultat sera disponible (facultatif) |
+
+#### Exemple en ligne de commande
+
+
+```bash
+# Send data for batch processing
+cat input.tar.gz |curl
--data-binary @- -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/corpus-similarity" > output.json
+
+# When the corpus is processed, get the result
+cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz
diff --git a/services/data-computer/example-similarity-json.tar.gz b/services/data-computer/example-similarity-json.tar.gz
new file mode 100644
index 00000000..5316bb15
Binary files /dev/null and b/services/data-computer/example-similarity-json.tar.gz differ
diff --git a/services/data-computer/examples.http b/services/data-computer/examples.http
index be0dbf2a..1860709a 100644
--- a/services/data-computer/examples.http
+++ b/services/data-computer/examples.http
@@ -113,3 +113,14 @@ X-Webhook-Success: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
 X-Webhook-Failure: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
 
 < ./example-json.tar.gz
+
+
+###
+# @name v1CorpusSimilarity
+POST {{host}}/v1/corpus-similarity HTTP/1.1
+Content-Type: application/x-tar
+X-Webhook-Success: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
+X-Webhook-Failure: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
+
+< ./example-similarity-json.tar.gz
+
diff --git a/services/data-computer/package.json b/services/data-computer/package.json
index 41b1e1f6..40b3fddd 100644
--- a/services/data-computer/package.json
+++ b/services/data-computer/package.json
@@ -1,7 +1,7 @@
 {
     "private": true,
     "name": "ws-data-computer",
-    "version": "2.12.6",
+    "version": "2.13.1",
     "description": "Calculs sur fichier corpus compressé",
     "repository": {
         "type": "git",
diff --git a/services/data-computer/swagger.json b/services/data-computer/swagger.json
index c7db719a..495bc8f0 100644
--- a/services/data-computer/swagger.json
+++ b/services/data-computer/swagger.json
@@ -3,7 +3,7 @@
     "info": {
         "title": "data-computer - Calculs sur fichier corpus compressé",
         "summary": "Calculs sur un corpus compressé",
-
"version": "2.12.6", + "version": "2.13.1", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", diff --git a/services/data-computer/tests.hurl b/services/data-computer/tests.hurl index d06c5bda..d1a9df6c 100644 --- a/services/data-computer/tests.hurl +++ b/services/data-computer/tests.hurl @@ -72,4 +72,87 @@ delay: 1000 HTTP 200 [{"id":"#1","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"a"}},{"id":"#2","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"b"}},{"id":"#3","value":{"sample":1,"frequency":0.3333333333333333,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"c"}},{"id":"#4","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"a"}},{"id":"#5","value":{"sample":2,"frequency":0.6666666666666666,"percentage":null,"sum":0,"count":5,"min":0,"max":0,"mean":0,"range":0,"midrange":0,"variance":0,"deviation":0,"population":3,"input":"b"}}] +################################ Test for Similarity ################################ + +POST {{host}}/v1/corpus-similarity +content-type: application/x-tar +x-hook: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9 +file,example-similarity-json.tar.gz; + +HTTP 200 +# Capture the computing token +[Captures] +computing_token: jsonpath "$[0].value" +[Asserts] +variable "computing_token" exists + +# There should be a waiting time, representing the time taken to process data. +# Fortunately, as the data is sparse, and the computing time is small, +# the need is small. 
+ +# Version 4.1.0 of hurl added a delay option, which value is milliseconds. +# https://hurl.dev/blog/2023/09/24/announcing-hurl-4.1.0.html#add-delay-between-requests + +POST {{host}}/v1/retrieve-json?indent=true +content-type: application/json +[Options] +delay: 1000 +``` +[ + { + "value":"{{computing_token}}" + } +] +``` + +HTTP 200 +[{ + "id": "Titre 1", + "value": { + "similarity": [ + "Titre 4", + "Titre 2" + ], + "score": [ + 0.9411764705882353, + 0.9349112426035503 + ] + } +}, +{ + "id": "Titre 2", + "value": { + "similarity": [ + "Titre 1" + ], + "score": [ + 0.9349112426035503 + ] + } +}, +{ + "id": "Titre 3", + "value": { + "similarity": [ + "Titre 4" + ], + "score": [ + 0.8888888888888888 + ] + } +}, +{ + "id": "Titre 4", + "value": { + "similarity": [ + "Titre 1" + ], + "score": [ + 0.9411764705882353 + ] + } +}] + + # TODO: ajouter les deux autres routes (v1GraphSegment, v1Lda) +# TODO: ajouter la route rapido \ No newline at end of file diff --git a/services/data-computer/v1/corpus-similarity.ini b/services/data-computer/v1/corpus-similarity.ini new file mode 100644 index 00000000..c50fceeb --- /dev/null +++ b/services/data-computer/v1/corpus-similarity.ini @@ -0,0 +1,65 @@ +# OpenAPI Documentation - JSON format (dot notation) +mimeType = application/json + +post.operationId = post-v1-corpus-similarity +post.description = Web service de calcul de similarité entre documents d un corpus +post.summary = 3 sorties sont disponibles +post.tags.0 = data-computer +post.requestBody.content.application/x-tar.schema.type = string +post.requestBody.content.application/x-tar.schema.format = binary +post.requestBody.required = true +post.responses.default.description = Informations permettant de récupérer les données le moment venu +post.parameters.0.description = Indenter le JSON résultant +post.parameters.0.in = query +post.parameters.0.name = indent +post.parameters.0.schema.type = boolean +post.parameters.1.description = URL pour signaler que le traitement est 
terminé
+post.parameters.1.in = header
+post.parameters.1.name = X-Webhook-Success
+post.parameters.1.schema.type = string
+post.parameters.1.schema.format = uri
+post.parameters.1.required = false
+post.parameters.2.description = URL pour signaler que le traitement a échoué
+post.parameters.2.in = header
+post.parameters.2.name = X-Webhook-Failure
+post.parameters.2.schema.type = string
+post.parameters.2.schema.format = uri
+post.parameters.2.required = false
+
+post.parameters.3.in = query
+post.parameters.3.name = output
+post.parameters.3.schema.type = integer
+post.parameters.3.description = Choix du nombre de documents similaires à afficher dans la sortie : 0 pour automatique, 1 pour tout afficher, n'importe quel autre nombre pour afficher au maximum ce nombre d'éléments.
+
+
+[env]
+path = generator
+value = corpus-similarity
+
+[use]
+plugin = basics
+plugin = spawn
+
+# Step 1 (générique): Charger le fichier corpus
+[delegate]
+file = charger.cfg
+
+# Step 2 (générique): Traiter de manière asynchrone les items reçus
+[fork]
+standalone = true
+logger = logger.cfg
+
+# Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus
+[fork/exec]
+# command should be executable !
+command = ./v1/corpus-similarity.py
+args = fix('-p')
+args = env('output', "0")
+
+# Step 2.2 (générique): Enregistrer le résultat et signaler que le traitement est fini
+[fork/delegate]
+file = recorder.cfg
+
+# Step 3 : Renvoyer immédiatement un seul élément indiquant comment récupérer le résultat quand il sera prêt
+[delegate]
+file = recipient.cfg
diff --git a/services/data-computer/v1/corpus-similarity.py b/services/data-computer/v1/corpus-similarity.py
new file mode 100755
index 00000000..c5dc00cd
--- /dev/null
+++ b/services/data-computer/v1/corpus-similarity.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Compare every document of the corpus to all the others with difflib's
+# SequenceMatcher and emit, for each document, the ids and similarity scores
+# of its closest neighbours (output mode "-p" is documented in corpus-similarity.ini).
+import json
+import sys
+from difflib import SequenceMatcher
+import numpy as np
+
+def get_ratio(data):
+    # Return (id, ratios, ids): the similarity ratio of every OTHER document
+    # of the corpus, both lists sorted by decreasing ratio.
+    currentTitle = data['value']
+    currentId = data['id']
+    idList = []
+    ratioList = []
+
+    for line_cmp in all_data:
+        data_cmp = line_cmp[0]
+        id, title = data_cmp["id"], data_cmp["value"]
+        if currentId == id:
+            continue
+        ratio = SequenceMatcher(None, currentTitle, title).ratio()
+        idList.append(id)
+        ratioList.append(ratio)
+
+    # Sort both lists according to ratioList.
+    # Guard: with a single-document corpus both lists are empty and
+    # zip(*sorted(...)) would raise ValueError on an empty sequence.
+    if ratioList:
+        ratioList, idList = (list(t) for t in zip(*sorted(zip(ratioList, idList), reverse=True)))
+
+    return currentId, ratioList, idList
+
+# load all documents first: every document is compared to every other one
+all_data = []
+for line in sys.stdin:
+    data = json.loads(line)
+    all_data.append(data)
+
+# output mode, passed as "-p <output>" on the command line (defaults to 0)
+output = int(sys.argv[sys.argv.index('-p') + 1] if '-p' in sys.argv else 0)
+
+for line in all_data:
+    id, ratioList, idList = get_ratio(line[0])
+    if output == 0:
+        # automatic mode: keep the neighbours up to the biggest drop in
+        # similarity; a document with no close neighbour yields empty lists
+        if not ratioList or ratioList[0] < 0.6:
+            sim = []
+            score = []
+        elif len(ratioList) == 1:
+            # np.diff of a single value is empty and np.argmax would raise
+            sim = idList
+            score = ratioList
+        else:
+            diff = -np.diff(ratioList)
+            mean = np.mean(diff)
+            argmx = np.argmax(diff - mean)
+            sim = idList[:argmx + 1]
+            score = ratioList[:argmx + 1]
+    elif output == 1:
+        sim = idList
+        score = ratioList
+    else:
+        sim = idList[:min(len(idList), output)]
+        score = ratioList[:min(len(idList), output)]
+
+    sys.stdout.write(json.dumps({"id": id, "value": {"similarity": sim, "score": score}}))
+    sys.stdout.write('\n')
diff --git a/services/data-computer/v1/lda.py b/services/data-computer/v1/lda.py index 847fe604..e2ebda84 100755 --- a/services/data-computer/v1/lda.py +++ b/services/data-computer/v1/lda.py @@ -85,9 +85,9 @@ def max_topic(dico): # following parameters depends of the size of the corpus : num_topics and num_iterations len_data = len(all_data) -num_iterations= max(200,len_data/100) -if len_data < 200: - num_iterations = 100 +num_iterations= min(max(400,len_data/200),1000) +if len_data < 500: + num_iterations = 200 # training LDA texts = []