Skip to content

Commit 4f76e37

Browse files
authored
feat(search): remove usage of NLP to search document (#1426)
* fix: byebye * fix: merge * fix: limit * fix: limit * fix: limit * fix: limit * fix: merge * fix: doc
1 parent 67b3d5a commit 4f76e37

File tree

15 files changed

+9
-213
lines changed

15 files changed

+9
-213
lines changed

.kontinuous/env/dev/templates/export.configmap.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@ metadata:
44
name: export-elasticsearch
55
data:
66
HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql"
7-
NLP_URL: "https://serving-ml-preprod.ovh.fabrique.social.gouv.fr"
87
NODE_ENV: "production"
98
LOG_LEVEL: "info"
10-
NLP_PREPROD_DISABLE: "true"
119
BUCKET_DEFAULT_FOLDER: "default"
1210
BUCKET_DRAFT_FOLDER: "draft"
1311
BUCKET_PREVIEW_FOLDER: "preview"
@@ -20,3 +18,4 @@ data:
2018
ELASTICSEARCH_INDEX_PREPROD: "cdtn-dev-v2"
2119
ELASTICSEARCH_INDEX_PROD: "cdtn-dev-v2"
2220
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille-dev"
21+
DISABLE_LIMIT_EXPORT: "true"

.kontinuous/env/preprod/templates/export.configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ metadata:
44
name: export-elasticsearch
55
data:
66
HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql"
7-
NLP_URL: "https://serving-ml.fabrique.social.gouv.fr"
87
NODE_ENV: "production"
98
LOG_LEVEL: "info"
109
BUCKET_DEFAULT_FOLDER: "default"
@@ -19,3 +18,4 @@ data:
1918
ELASTICSEARCH_INDEX_PREPROD: "cdtn-main-v2"
2019
ELASTICSEARCH_INDEX_PROD: "cdtn-main-v2"
2120
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille-preprod"
21+
DISABLE_LIMIT_EXPORT: "true"

.kontinuous/env/prod/templates/export.configmap.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ metadata:
44
name: export-elasticsearch
55
data:
66
HASURA_GRAPHQL_ENDPOINT: "http://hasura/v1/graphql"
7-
NLP_URL: "https://serving-ml.fabrique.social.gouv.fr"
87
NODE_ENV: "production"
98
LOG_LEVEL: "info"
109
BUCKET_DEFAULT_FOLDER: "default"
@@ -18,4 +17,4 @@ data:
1817
AGREEMENTS_DESTINATION_NAME: "index.json"
1918
ELASTICSEARCH_INDEX_PREPROD: "cdtn-preprod-v2"
2019
ELASTICSEARCH_INDEX_PROD: "cdtn-prod-v2"
21-
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille"
20+
MATTERMOST_CHANNEL_EXPORT: "s-cdtn-administration-veille"

README.md

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,6 @@ DISABLE_LIMIT_EXPORT=true DISABLE_AGREEMENTS=true DISABLE_SITEMAP=true HASURA_GR
157157
- `DISABLE_COPY` is used to disable copy between two containers
158158
- `DISABLE_SITEMAP` is used to disable copy of the sitemap
159159
- `DISABLE_AGREEMENTS` is used to disable copy of the agreements
160-
- `NLP_URL` could be set to `https://serving-ml-preprod.ovh.fabrique.social.gouv.fr`; by default it is `undefined`
161-
162-
> **Note**: You can remove `NLP_URL` from your environment variables if you don't want to use the NLP service and save time during the elasticsearch ingestion process.
163160

164161
#### 6. Run the export elasticsearch
165162

@@ -178,7 +175,7 @@ yarn workspace frontend dev
178175
#### On client
179176

180177
```sh
181-
NLP_URL=https://serving-ml-preprod.ovh.fabrique.social.gouv.fr yarn workspace @cdt/frontend dev
178+
yarn workspace @cdt/frontend dev
182179
```
183180

184181
1. Go to `http://localhost:3001/`
@@ -292,22 +289,6 @@ Cela permet de lier l'index elasticsearch automatiquement entre les deux branche
292289

293290
L'export des données se fait depuis l'admin dans la section `Contenus > Mise à jour`. Il faut ensuite cliquer sur le bouton `Mettre à jour la pre-production`.
294291

295-
> Note: Le glossary (injection des tooltips) et le NLP (vectorisation des données) sont par défaut désactivés en dev.
296-
297-
#### Activer le glossary et le NLP
298-
299-
Il faut commencer par donner les ressources nécessaires au processus dans l'environnement de dev :
300-
301-
- Ouvrir le fichier `.kontinous/env/dev/values.yaml`
302-
- Appliquer ce que les commentaires indiquent pour les ressources sur hasura et export
303-
304-
L'export des données se fait depuis l'admin dans la section `Contenus > Mise à jour`. Il faut ensuite cliquer sur le bouton `Mettre à jour la production`.
305-
306-
<strong>/!\ /!\ /!\ ATTENTION /!\ /!\ /!\ : Bien penser à remettre les lignes en commentaire avant de merger dans master !</strong>
307-
308-
> Pourquoi changer les ressources ?
309-
> L'export avec glossary et NLP est un processus qui demande beaucoup de RAM/CPU. Afin de ne pas surcharger le cluster de dev, on ne va pas demander ces ressources car l'export est peu utilisé pour les tests. Il n'existe aucun mécanisme sur la CI à l'heure actuelle pour permettre de faire le switch autrement.
310-
311292
### Limitations connues
312293

313294
- Les fichiers du site sont stockés au même endroit pour l'ensemble des branches. Si on ajoute/modifie/supprime un fichier, cela sera également le cas sur l'ensemble des branches

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@ services:
129129
HASURA_GRAPHQL_ENDPOINT: "http://hasura:8080/v1/graphql"
130130
DISABLE_COPY: "true"
131131
DISABLE_SITEMAP: "true"
132+
DISABLE_LIMIT_EXPORT: "true"
132133
DISABLE_AGREEMENTS: "true"
133-
NLP_URL: "https://serving-ml-preprod.ovh.fabrique.social.gouv.fr"
134134
ELASTICSEARCH_INDEX_PREPROD: "cdtn-preprod-v1"
135135
ELASTICSEARCH_INDEX_PROD: "cdtn-v1"
136136
ELASTICSEARCH_URL_PREPROD: "http://elasticsearch:9200"

shared/elasticsearch/package.json

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77
"@babel/plugin-transform-modules-commonjs"
88
]
99
},
10-
"dependencies": {
11-
"got": "^11.8.2"
12-
},
1310
"license": "Apache-2.0",
1411
"main": "src/index.js",
1512
"publishConfig": {

shared/elasticsearch/src/index.d.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,5 @@ export const documentMapping: any;
44
export const DOCUMENTS: string;
55
export const indexDocumentsBatched: any;
66
export const SUGGESTIONS: string;
7-
export const vectorizeDocument: any;
87
export const version: any;
98
export const suggestionMapping: any;
10-
export const vectorizeQuery: any;

shared/elasticsearch/src/mapping/document.mapping.js

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,14 +236,11 @@ exports.documentMapping = {
236236
type: "text",
237237
},
238238

239-
title_vector: {
240-
dims: 512,
241-
type: "dense_vector",
242-
},
243239
// The source URL
244240
url: {
245241
type: "keyword",
246242
},
243+
247244
// used in prequalifieds
248245
variants: {
249246
type: "text",
Lines changed: 1 addition & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,12 @@
1-
// vectorizer is imported by code-du-travail-api which is using CommonJS, and throwing an exception
2-
// when requiring code-du-travail-data ES module, thus we keep using CommonJS import here
3-
const got = require("got");
41
const { stopwords: semantic_stopwords } = require("../dataset/stop_words");
52

6-
// URL of the TF serve deployment
7-
const NLP_URL =
8-
process.env.NLP_URL || "https://serving-ml.fabrique.social.gouv.fr";
9-
console.log("NLP URL:", NLP_URL);
10-
const tfServeURL = NLP_URL + "/v1/models/sentqam:predict";
11-
123
function stripAccents(text) {
134
// strip accents
145
return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
156
}
167

178
const stopWords = new Set(semantic_stopwords.map(stripAccents));
189

19-
const cache = new Map();
20-
2110
function preprocess(text) {
2211
const stripped = stripAccents(text);
2312

@@ -31,48 +20,4 @@ function preprocess(text) {
3120
return noStopWords.join(" ");
3221
}
3322

34-
async function callTFServe(json) {
35-
const response = await got.post(tfServeURL, {
36-
cache,
37-
json,
38-
responseType: "json",
39-
retry: {
40-
limit: 15,
41-
methods: ["POST"],
42-
},
43-
});
44-
return response.body["outputs"];
45-
}
46-
47-
async function vectorizeDocument(title, content) {
48-
if (title == undefined || title == "") {
49-
throw new Error("Cannot vectorize document with empty title.");
50-
}
51-
52-
const input = [preprocess(title)];
53-
const context = content ? [preprocess(content)] : "";
54-
55-
const body = {
56-
inputs: { context, input },
57-
signature_name: "response_encoder",
58-
};
59-
const vectors = await callTFServe(body);
60-
61-
return vectors[0];
62-
}
63-
64-
async function vectorizeQuery(query) {
65-
if (!query) {
66-
throw new Error("Cannot vectorize empty query.");
67-
}
68-
69-
const inputs = [preprocess(query)];
70-
const body = {
71-
inputs,
72-
signature_name: "question_encoder",
73-
};
74-
const vectors = await callTFServe(body);
75-
return vectors[0];
76-
}
77-
78-
module.exports = { preprocess, vectorizeDocument, vectorizeQuery };
23+
module.exports = { preprocess };

shared/elasticsearch/src/vectorizer/index.test.js

Lines changed: 1 addition & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,4 @@
1-
const { vectorizeDocument, vectorizeQuery, preprocess } = require("./index");
2-
3-
const timeout = 10000;
4-
5-
test(
6-
"Should vectorize document",
7-
async () => {
8-
const vector1 = await vectorizeDocument("titre", "contenu");
9-
expect(vector1).toBeDefined();
10-
// FIXME Should return the same result but don't. See with remi and fabien.
11-
// expect(vector1).toMatchSnapshot();
12-
13-
// preprocessing should make those embeddings equal
14-
// FIXME Should return the same result but don't. See with remi and fabien.
15-
// const vector2 = await vectorizeDocument("le titre", "et le contènu");
16-
// expect(vector2).toEqual(vector1);
17-
},
18-
timeout
19-
);
20-
21-
test(
22-
"Should vectorize query",
23-
async () => {
24-
// FIXME Résultat aléatoire, voir pourquoi on n'obtient pas toujours la même réponse
25-
// const vector1 = await vectorizeQuery("requete");
26-
// expect(vector1).toMatchSnapshot();
27-
// const vector2 = await vectorizeQuery("la requête");
28-
// expect(vector2).toEqual(vector1);
29-
},
30-
timeout
31-
);
32-
33-
test(
34-
"Should fail when no content passed",
35-
async () => {
36-
await expect(vectorizeQuery()).rejects.toThrow(
37-
new Error("Cannot vectorize empty query.")
38-
);
39-
},
40-
timeout
41-
);
1+
const { preprocess } = require("./index");
422

433
test("Should preprocess text", async () => {
444
expect(preprocess("à la nôtre")).toEqual("");

shared/types/src/elastic/tools.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ export type Tool = {
1717
source: string;
1818
text: string;
1919
title: string;
20-
title_vector: number[];
2120
_id: string;
2221
displayTool?: boolean;
2322
};

targets/export-elasticsearch/src/ingester/ingest.ts

Lines changed: 1 addition & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -6,72 +6,32 @@ import {
66
DOCUMENTS,
77
indexDocumentsBatched,
88
SUGGESTIONS,
9-
vectorizeDocument,
109
version,
1110
} from "@socialgouv/cdtn-elasticsearch";
1211
import { logger } from "@shared/utils";
13-
import { SOURCES } from "@socialgouv/cdtn-sources";
14-
import pMap from "p-map";
1512

1613
import { cdtnDocumentsGen } from "./cdtnDocuments";
1714
import { context } from "./context";
1815
import { populateSuggestions } from "./suggestion";
1916

20-
async function addVector(data: any) {
21-
const NLP_URL = context.get("nlpUrl");
22-
if (NLP_URL) {
23-
if (!data.title) {
24-
logger.error(`No title for document ${data.source} / ${data.slug}`);
25-
}
26-
const title = data.title || "sans titre";
27-
await vectorizeDocument(title, data.text)
28-
.then((title_vector: any) => {
29-
if (title_vector.message) {
30-
throw new Error(`error fetching message ${data.title}`);
31-
}
32-
data.title_vector = title_vector;
33-
})
34-
.catch((err: any) => {
35-
throw new Error(
36-
`Vectorization failed: ${data.id} (${data.title} - ${err.retryCount} retries)`
37-
);
38-
});
39-
}
40-
41-
return Promise.resolve(data);
42-
}
43-
44-
// these sources do not need NLP vectorization
45-
const excludeSources = [
46-
SOURCES.CDT,
47-
SOURCES.GLOSSARY,
48-
SOURCES.PREQUALIFIED,
49-
SOURCES.HIGHLIGHTS,
50-
SOURCES.SHEET_MT_PAGE,
51-
SOURCES.VERSIONS,
52-
];
53-
5417
export async function ingest(
5518
cdtnAdminEndpoint: string | undefined,
5619
cdtnAdminEndpointSecret: string | undefined,
5720
esUrl: string | undefined,
5821
esTokenIngest: string | undefined,
5922
esIndexPrefix: string | undefined,
60-
nlpUrl: string | undefined,
6123
suggestIndexName: string | undefined,
6224
bufferSize: number | undefined,
6325
suggestFile: string | undefined,
6426
isProd = false
6527
) {
6628
context.provide();
67-
process.env.NLP_URL = nlpUrl; //pour setter la variable d'environment du package elasticsearch...
6829
await runIngester(
6930
cdtnAdminEndpoint,
7031
cdtnAdminEndpointSecret,
7132
esUrl,
7233
esTokenIngest,
7334
esIndexPrefix,
74-
nlpUrl,
7535
suggestIndexName,
7636
bufferSize,
7737
suggestFile,
@@ -85,7 +45,6 @@ async function runIngester(
8545
esUrl: string | undefined,
8646
esTokenIngest: string | undefined,
8747
esIndexPrefix: string | undefined,
88-
nlpUrl: string | undefined,
8948
suggestIndexName: string | undefined,
9049
bufferSize: number | undefined,
9150
suggestFile: string | undefined,
@@ -119,16 +78,9 @@ async function runIngester(
11978
context.set("suggestIndexName", suggestIndexName);
12079
context.set("bufferSize", bufferSize);
12180
context.set("suggestFile", suggestFile);
122-
context.set("nlpUrl", nlpUrl);
12381
const ts = Date.now();
12482
logger.info(`Using cdtn elasticsearch ${ELASTICSEARCH_URL}`);
12583

126-
if (nlpUrl) {
127-
logger.info(`Using NLP service to retrieve tf vectors on ${nlpUrl}`);
128-
} else {
129-
logger.info(`NLP_URL not defined, semantic search will be disabled.`);
130-
}
131-
13284
await version({ client });
13385

13486
logger.info(`Creating index ${DOCUMENT_INDEX_NAME}-${ts}`);
@@ -142,18 +94,9 @@ async function runIngester(
14294
const updateDocs = async (source: string, documents: unknown[]) => {
14395
logger.info(`› ${source}... ${documents.length} items`);
14496

145-
let docs = documents;
146-
147-
// add NLP vectors
148-
if (!(excludeSources as string[]).includes(source)) {
149-
docs = await pMap(documents, addVector, {
150-
concurrency: 5,
151-
});
152-
}
153-
15497
await indexDocumentsBatched({
15598
client,
156-
documents: docs,
99+
documents,
157100
indexName: `${DOCUMENT_INDEX_NAME}-${ts}`,
158101
size: 800,
159102
});

targets/export-elasticsearch/src/workers/ingester-preprod.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ const ingester = async (): Promise<string> => {
1212
process.env.BRANCH_NAME_SLUG
1313
? `cdtn-${process.env.BRANCH_NAME_SLUG}`
1414
: process.env.ELASTICSEARCH_INDEX_PREPROD,
15-
process.env.NLP_PREPROD_DISABLE ? undefined : process.env.NLP_URL,
1615
undefined,
1716
undefined,
1817
undefined

targets/export-elasticsearch/src/workers/ingester-prod.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ const ingester = async (): Promise<string> => {
1212
process.env.BRANCH_NAME_SLUG
1313
? `cdtn-${process.env.BRANCH_NAME_SLUG}`
1414
: process.env.ELASTICSEARCH_INDEX_PROD,
15-
process.env.NLP_URL,
1615
undefined,
1716
undefined,
1817
undefined,

0 commit comments

Comments
 (0)