Skip to content

Commit 4f76e37

Browse files
authored
feat(search): remove usage of NLP to search document (#1426)
* fix: byebye * fix: merge * fix: limit * fix: limit * fix: limit * fix: limit * fix: merge * fix: doc
1 parent 67b3d5a commit 4f76e37

File tree

15 files changed

+9
-213
lines changed

15 files changed

+9
-213
lines changed

.kontinuous/env/dev/templates/export.configmap.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@ metadata:
44
name: export-elasticsearch
55
data:
66
HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql"
7-
NLP_URL: "https://serving-ml-preprod.ovh.fabrique.social.gouv.fr"
87
NODE_ENV: "production"
98
LOG_LEVEL: "info"
10-
NLP_PREPROD_DISABLE: "true"
119
BUCKET_DEFAULT_FOLDER: "default"
1210
BUCKET_DRAFT_FOLDER: "draft"
1311
BUCKET_PREVIEW_FOLDER: "preview"
@@ -20,3 +18,4 @@ data:
2018
ELASTICSEARCH_INDEX_PREPROD: "cdtn-dev-v2"
2119
ELASTICSEARCH_INDEX_PROD: "cdtn-dev-v2"
2220
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille-dev"
21+
DISABLE_LIMIT_EXPORT: "true"

.kontinuous/env/preprod/templates/export.configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ metadata:
44
name: export-elasticsearch
55
data:
66
HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql"
7-
NLP_URL: "https://serving-ml.fabrique.social.gouv.fr"
87
NODE_ENV: "production"
98
LOG_LEVEL: "info"
109
BUCKET_DEFAULT_FOLDER: "default"
@@ -19,3 +18,4 @@ data:
1918
ELASTICSEARCH_INDEX_PREPROD: "cdtn-main-v2"
2019
ELASTICSEARCH_INDEX_PROD: "cdtn-main-v2"
2120
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille-preprod"
21+
DISABLE_LIMIT_EXPORT: "true"

.kontinuous/env/prod/templates/export.configmap.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ metadata:
44
name: export-elasticsearch
55
data:
66
HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql"
7-
NLP_URL: "https://serving-ml.fabrique.social.gouv.fr"
87
NODE_ENV: "production"
98
LOG_LEVEL: "info"
109
BUCKET_DEFAULT_FOLDER: "default"
@@ -18,4 +17,4 @@ data:
1817
AGREEMENTS_DESTINATION_NAME: "index.json"
1918
ELASTICSEARCH_INDEX_PREPROD: "cdtn-preprod-v2"
2019
ELASTICSEARCH_INDEX_PROD: "cdtn-prod-v2"
21-
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille"
20+
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille"

README.md

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,6 @@ DISABLE_LIMIT_EXPORT=true DISABLE_AGREEMENTS=true DISABLE_SITEMAP=true HASURA_GR
157157
- `DISABLE_COPY` is used to disable copy between two containers
158158
- `DISABLE_SITEMAP` is used to disable copy of the sitemap
159159
- `DISABLE_AGREEMENTS` is used to disable copy of the agreements
160-
- `NLP_URL` could be set to `https://serving-ml-preprod.ovh.fabrique.social.gouv.fr`; by default it is `undefined`
161-
162-
> **Note**: You can remove `NLP_URL` from your environment variables if you don't want to use the NLP service and save time during the elasticsearch ingestion process.
163160

164161
#### 6. Run the export elasticsearch
165162

@@ -178,7 +175,7 @@ yarn workspace frontend dev
178175
#### On client
179176

180177
```sh
181-
NLP_URL=https://serving-ml-preprod.ovh.fabrique.social.gouv.fr yarn workspace @cdt/frontend dev
178+
yarn workspace @cdt/frontend dev
182179
```
183180

184181
1. Go to `http://localhost:3001/`
@@ -292,22 +289,6 @@ Cela permet de lier l'index elasticsearch automatiquement entre les deux branche
292289

293290
L'export des données se fait depuis l'admin dans la section `Contenus > Mise à jour`. Il faut ensuite cliquer sur le bouton `Mettre à jour la pre-production`.
294291

295-
> Note: Le glossary (injection des tooltips) et le NLP (vectorisation des données) sont par défaut désactivés en dev.
296-
297-
#### Activer le glossary et le NLP
298-
299-
Il faut commencer par donner les ressources nécessaires au processus dans l'environnement de dev :
300-
301-
- Ouvrir le fichier `.kontinous/env/dev/values.yaml`
302-
- Appliquer ce que les commentaires indiquent pour les ressources sur hasura et export
303-
304-
L'export des données se fait depuis l'admin dans la section `Contenus > Mise à jour`. Il faut ensuite cliquer sur le bouton `Mettre à jour la production`.
305-
306-
<strong>/!\ /!\ /!\ ATTENTION /!\ /!\ /!\ : Bien penser à remettre les lignes en commentaire avant de merger dans master !</strong>
307-
308-
> Pourquoi changer les ressources ?
309-
> L'export avec glossary et NLP est un processus qui demande beaucoup de RAM/CPU. Afin de ne pas surcharger le cluster de dev, on ne va pas demander ces ressources car l'export est peu utilisé pour les tests. Il n'existe aucun mécanisme sur la CI à l'heure actuelle pour permettre de faire le switch autrement.
310-
311292
### Limitations connues
312293

313294
- Les fichiers du site sont stockés au même endroit pour l'ensemble des branches. Si on ajoute/modifie/supprime un fichier, cela sera également le cas sur l'ensemble des branches

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@ services:
129129
HASURA_GRAPHQL_ENDPOINT: "http://hasura:8080/v1/graphql"
130130
DISABLE_COPY: "true"
131131
DISABLE_SITEMAP: "true"
132+
DISABLE_LIMIT_EXPORT: "true"
132133
DISABLE_AGREEMENTS: "true"
133-
NLP_URL: "https://serving-ml-preprod.ovh.fabrique.social.gouv.fr"
134134
ELASTICSEARCH_INDEX_PREPROD: "cdtn-preprod-v1"
135135
ELASTICSEARCH_INDEX_PROD: "cdtn-v1"
136136
ELASTICSEARCH_URL_PREPROD: "http://elasticsearch:9200"

shared/elasticsearch/package.json

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77
"@babel/plugin-transform-modules-commonjs"
88
]
99
},
10-
"dependencies": {
11-
"got": "^11.8.2"
12-
},
1310
"license": "Apache-2.0",
1411
"main": "src/index.js",
1512
"publishConfig": {

shared/elasticsearch/src/index.d.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,5 @@ export const documentMapping: any;
44
export const DOCUMENTS: string;
55
export const indexDocumentsBatched: any;
66
export const SUGGESTIONS: string;
7-
export const vectorizeDocument: any;
87
export const version: any;
98
export const suggestionMapping: any;
10-
export const vectorizeQuery: any;

shared/elasticsearch/src/mapping/document.mapping.js

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,14 +236,11 @@ exports.documentMapping = {
236236
type: "text",
237237
},
238238

239-
title_vector: {
240-
dims: 512,
241-
type: "dense_vector",
242-
},
243239
// The source URL
244240
url: {
245241
type: "keyword",
246242
},
243+
247244
// used in prequalifieds
248245
variants: {
249246
type: "text",
Lines changed: 1 addition & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,12 @@
1-
// vectorizer is imported by code-du-travail-api which is using CommonJS, and throwing an exception
2-
// when requiring code-du-travail-data ES module, thus we keep using CommonJS import here
3-
const got = require("got");
41
const { stopwords: semantic_stopwords } = require("../dataset/stop_words");
52

6-
// URL of the TF serve deployment
7-
const NLP_URL =
8-
process.env.NLP_URL || "https://serving-ml.fabrique.social.gouv.fr";
9-
console.log("NLP URL:", NLP_URL);
10-
const tfServeURL = NLP_URL + "/v1/models/sentqam:predict";
11-
123
function stripAccents(text) {
134
// strip accents
145
return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
156
}
167

178
const stopWords = new Set(semantic_stopwords.map(stripAccents));
189

19-
const cache = new Map();
20-
2110
function preprocess(text) {
2211
const stripped = stripAccents(text);
2312

@@ -31,48 +20,4 @@ function preprocess(text) {
3120
return noStopWords.join(" ");
3221
}
3322

34-
async function callTFServe(json) {
35-
const response = await got.post(tfServeURL, {
36-
cache,
37-
json,
38-
responseType: "json",
39-
retry: {
40-
limit: 15,
41-
methods: ["POST"],
42-
},
43-
});
44-
return response.body["outputs"];
45-
}
46-
47-
async function vectorizeDocument(title, content) {
48-
if (title == undefined || title == "") {
49-
throw new Error("Cannot vectorize document with empty title.");
50-
}
51-
52-
const input = [preprocess(title)];
53-
const context = content ? [preprocess(content)] : "";
54-
55-
const body = {
56-
inputs: { context, input },
57-
signature_name: "response_encoder",
58-
};
59-
const vectors = await callTFServe(body);
60-
61-
return vectors[0];
62-
}
63-
64-
async function vectorizeQuery(query) {
65-
if (!query) {
66-
throw new Error("Cannot vectorize empty query.");
67-
}
68-
69-
const inputs = [preprocess(query)];
70-
const body = {
71-
inputs,
72-
signature_name: "question_encoder",
73-
};
74-
const vectors = await callTFServe(body);
75-
return vectors[0];
76-
}
77-
78-
module.exports = { preprocess, vectorizeDocument, vectorizeQuery };
23+
module.exports = { preprocess };

shared/elasticsearch/src/vectorizer/index.test.js

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,4 @@
1-
const { vectorizeDocument, vectorizeQuery, preprocess } = require("./index");
2-
3-
const timeout = 10000;
4-
5-
test(
6-
"Should vectorize document",
7-
async () => {
8-
const vector1 = await vectorizeDocument("titre", "contenu");
9-
expect(vector1).toBeDefined();
10-
// FIXME Should return the same result but don't. See with remi and fabien.
11-
// expect(vector1).toMatchSnapshot();
12-
13-
// preprocessing should make those embeddings equal
14-
// FIXME Should return the same result but don't. See with remi and fabien.
15-
// const vector2 = await vectorizeDocument("le titre", "et le contènu");
16-
// expect(vector2).toEqual(vector1);
17-
},
18-
timeout
19-
);
20-
21-
test(
22-
"Should vectorize query",
23-
async () => {
24-
// FIXME Résultat aléatoire, voir pourquoi on n'obtient pas toujours la même réponse
25-
// const vector1 = await vectorizeQuery("requete");
26-
// expect(vector1).toMatchSnapshot();
27-
// const vector2 = await vectorizeQuery("la requête");
28-
// expect(vector2).toEqual(vector1);
29-
},
30-
timeout
31-
);
32-
33-
test(
34-
"Should fail when no content passed",
35-
async () => {
36-
await expect(vectorizeQuery()).rejects.toThrow(
37-
new Error("Cannot vectorize empty query.")
38-
);
39-
},
40-
timeout
41-
);
1+
const { preprocess } = require("./index");
422

433
test("Should preprocess text", async () => {
444
expect(preprocess("à la nôtre")).toEqual("");

shared/types/src/elastic/tools.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ export type Tool = {
1717
source: string;
1818
text: string;
1919
title: string;
20-
title_vector: number[];
2120
_id: string;
2221
displayTool?: boolean;
2322
};

targets/export-elasticsearch/src/ingester/ingest.ts

Lines changed: 1 addition & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,72 +6,32 @@ import {
66
DOCUMENTS,
77
indexDocumentsBatched,
88
SUGGESTIONS,
9-
vectorizeDocument,
109
version,
1110
} from "@socialgouv/cdtn-elasticsearch";
1211
import { logger } from "@shared/utils";
13-
import { SOURCES } from "@socialgouv/cdtn-sources";
14-
import pMap from "p-map";
1512

1613
import { cdtnDocumentsGen } from "./cdtnDocuments";
1714
import { context } from "./context";
1815
import { populateSuggestions } from "./suggestion";
1916

20-
async function addVector(data: any) {
21-
const NLP_URL = context.get("nlpUrl");
22-
if (NLP_URL) {
23-
if (!data.title) {
24-
logger.error(`No title for document ${data.source} / ${data.slug}`);
25-
}
26-
const title = data.title || "sans titre";
27-
await vectorizeDocument(title, data.text)
28-
.then((title_vector: any) => {
29-
if (title_vector.message) {
30-
throw new Error(`error fetching message ${data.title}`);
31-
}
32-
data.title_vector = title_vector;
33-
})
34-
.catch((err: any) => {
35-
throw new Error(
36-
`Vectorization failed: ${data.id} (${data.title} - ${err.retryCount} retries)`
37-
);
38-
});
39-
}
40-
41-
return Promise.resolve(data);
42-
}
43-
44-
// these sources do not need NLP vectorization
45-
const excludeSources = [
46-
SOURCES.CDT,
47-
SOURCES.GLOSSARY,
48-
SOURCES.PREQUALIFIED,
49-
SOURCES.HIGHLIGHTS,
50-
SOURCES.SHEET_MT_PAGE,
51-
SOURCES.VERSIONS,
52-
];
53-
5417
export async function ingest(
5518
cdtnAdminEndpoint: string | undefined,
5619
cdtnAdminEndpointSecret: string | undefined,
5720
esUrl: string | undefined,
5821
esTokenIngest: string | undefined,
5922
esIndexPrefix: string | undefined,
60-
nlpUrl: string | undefined,
6123
suggestIndexName: string | undefined,
6224
bufferSize: number | undefined,
6325
suggestFile: string | undefined,
6426
isProd = false
6527
) {
6628
context.provide();
67-
process.env.NLP_URL = nlpUrl; //pour setter la variable d'environment du package elasticsearch...
6829
await runIngester(
6930
cdtnAdminEndpoint,
7031
cdtnAdminEndpointSecret,
7132
esUrl,
7233
esTokenIngest,
7334
esIndexPrefix,
74-
nlpUrl,
7535
suggestIndexName,
7636
bufferSize,
7737
suggestFile,
@@ -85,7 +45,6 @@ async function runIngester(
8545
esUrl: string | undefined,
8646
esTokenIngest: string | undefined,
8747
esIndexPrefix: string | undefined,
88-
nlpUrl: string | undefined,
8948
suggestIndexName: string | undefined,
9049
bufferSize: number | undefined,
9150
suggestFile: string | undefined,
@@ -119,16 +78,9 @@ async function runIngester(
11978
context.set("suggestIndexName", suggestIndexName);
12079
context.set("bufferSize", bufferSize);
12180
context.set("suggestFile", suggestFile);
122-
context.set("nlpUrl", nlpUrl);
12381
const ts = Date.now();
12482
logger.info(`Using cdtn elasticsearch ${ELASTICSEARCH_URL}`);
12583

126-
if (nlpUrl) {
127-
logger.info(`Using NLP service to retrieve tf vectors on ${nlpUrl}`);
128-
} else {
129-
logger.info(`NLP_URL not defined, semantic search will be disabled.`);
130-
}
131-
13284
await version({ client });
13385

13486
logger.info(`Creating index ${DOCUMENT_INDEX_NAME}-${ts}`);
@@ -142,18 +94,9 @@ async function runIngester(
14294
const updateDocs = async (source: string, documents: unknown[]) => {
14395
logger.info(`› ${source}... ${documents.length} items`);
14496

145-
let docs = documents;
146-
147-
// add NLP vectors
148-
if (!(excludeSources as string[]).includes(source)) {
149-
docs = await pMap(documents, addVector, {
150-
concurrency: 5,
151-
});
152-
}
153-
15497
await indexDocumentsBatched({
15598
client,
156-
documents: docs,
99+
documents,
157100
indexName: `${DOCUMENT_INDEX_NAME}-${ts}`,
158101
size: 800,
159102
});

targets/export-elasticsearch/src/workers/ingester-preprod.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ const ingester = async (): Promise<string> => {
1212
process.env.BRANCH_NAME_SLUG
1313
? `cdtn-${process.env.BRANCH_NAME_SLUG}`
1414
: process.env.ELASTICSEARCH_INDEX_PREPROD,
15-
process.env.NLP_PREPROD_DISABLE ? undefined : process.env.NLP_URL,
1615
undefined,
1716
undefined,
1817
undefined

targets/export-elasticsearch/src/workers/ingester-prod.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ const ingester = async (): Promise<string> => {
1212
process.env.BRANCH_NAME_SLUG
1313
? `cdtn-${process.env.BRANCH_NAME_SLUG}`
1414
: process.env.ELASTICSEARCH_INDEX_PROD,
15-
process.env.NLP_URL,
1615
undefined,
1716
undefined,
1817
undefined,

0 commit comments

Comments
 (0)