diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bddba0548c..63ef139def 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,13 +11,13 @@ env:
PYTHON_VERSION: "3.9"
MARIADB_VERSION: "10.4.10"
COVERALLS_VERSION: "3.3.1" # check if Coverage needs to be also updated in requirements-ci.txt
+  TYPESENSE_VERSION: "27.0" # also needs to be updated in scripts/define_variable.sh
# As GitHub Action does not allow environment variables
# to be used in services definitions, these are only for
# reference. If you update these versions, you HAVE TO
# update the versions in the services definitions of the
# test job.
- ELASTICSEARCH_VERSION: "5.5.2"
MEMCACHED_VERSION: "1.6"
jobs:
@@ -132,7 +132,7 @@ jobs:
# Test the zds-site project.
# Install the project, using assets created during the previous job,
- # and install elasticsearch & memcache as a service. Then, run the tests
+  # and install Typesense & Memcached as services. Then, run the tests
# in a matrix build to parallelize multiple components.
test:
name: Install and test zds-site
@@ -144,27 +144,11 @@ jobs:
module:
[
"zds.tutorialv2",
- "zds.member zds.gallery zds.searchv2 zds.middlewares zds.pages",
+ "zds.member zds.gallery zds.search zds.middlewares zds.pages",
"zds.forum zds.featured zds.mp zds.notification zds.utils",
]
services:
- elasticsearch:
- image: "elasticsearch:5.5.2"
- ports:
- - "9200:9200"
- env:
- "http.host": "0.0.0.0"
- "transport.host": "127.0.0.1"
- "xpack.security.enabled": false
- "ES_JAVA_OPTS": "-Xms512m -Xmx512m"
- options: >-
- -e="discovery.type=single-node"
- --health-cmd="curl http://localhost:9200/_cluster/health"
- --health-interval=10s
- --health-timeout=5s
- --health-retries=10
-
memcached:
image: "memcached:1.6"
ports:
@@ -183,6 +167,12 @@ jobs:
mysql database: "ci_db_name"
mysql root password: "ci_root_password"
+ - name: Start Typesense
+ uses: jirevwe/typesense-github-action@v1.0.1
+ with:
+ typesense-version: ${{ env.TYPESENSE_VERSION }}
+ typesense-api-key: xyz
+
- name: Checkout
uses: actions/checkout@v4
diff --git a/Makefile b/Makefile
index f8349682ef..59fc204670 100644
--- a/Makefile
+++ b/Makefile
@@ -99,16 +99,16 @@ zmd-stop: ## Stop the zmarkdown server
node ./zmd/node_modules/pm2/bin/pm2 kill
##
-## ~ Elastic Search
+## ~ Search Engine
-run-elasticsearch: ## Run the Elastic Search server
- elasticsearch || echo 'No Elastic Search installed (you can add it locally with `./scripts/install_zds.sh +elastic-local`)'
+run-search-engine: ## Run the search server
+ ./.local/typesense/typesense-server --data-dir=.local/typesense/typesense-data --api-key=xyz || echo 'No Typesense installed (you can add it locally with `./scripts/install_zds.sh +typesense-local`)'
-index-all: ## Index the database in a new Elastic Search index
- python manage.py es_manager index_all
+index-all: ## Index the whole database in the search engine
+ python manage.py search_engine_manager index_all
-index-flagged: ## Index the database in the current Elastic Search index
- python manage.py es_manager index_flagged
+index-flagged: ## Index new content in the search engine
+ python manage.py search_engine_manager index_flagged
##
## ~ PDF
diff --git a/assets/scss/base/_base.scss b/assets/scss/base/_base.scss
index f4f4f52441..a5676bbc59 100644
--- a/assets/scss/base/_base.scss
+++ b/assets/scss/base/_base.scss
@@ -138,6 +138,10 @@ nav {
}
}
+.align-center {
+ text-align: center;
+}
+
@include desktop {
body {
min-height: 100%;
diff --git a/doc/source/back-end-code/searchv2.rst b/doc/source/back-end-code/search.rst
similarity index 61%
rename from doc/source/back-end-code/searchv2.rst
rename to doc/source/back-end-code/search.rst
index a31835f3f3..225dd48a4f 100644
--- a/doc/source/back-end-code/searchv2.rst
+++ b/doc/source/back-end-code/search.rst
@@ -1,19 +1,19 @@
============================
-La recherche (``searchv2/``)
+La recherche (``search/``)
============================
-Module situé dans ``zds/searchv2/``.
+Module situé dans ``zds/search/``.
.. contents:: Fichiers documentés :
Modèles (``models.py``)
=======================
-.. automodule:: zds.searchv2.models
+.. automodule:: zds.search.models
:members:
Vues (``views.py``)
===================
-.. automodule:: zds.searchv2.views
+.. automodule:: zds.search.views
:members:
diff --git a/doc/source/back-end/search.rst b/doc/source/back-end/search.rst
new file mode 100644
index 0000000000..110244e099
--- /dev/null
+++ b/doc/source/back-end/search.rst
@@ -0,0 +1,394 @@
+============
+La recherche
+============
+
+Principe
+========
+
+Comment faire une recherche ?
+-----------------------------
+
+La recherche se découpe en deux parties distinctes :
+
+ - L'indexation des données
+ - La recherche par l'utilisateur
+
+L'indexation des données
+++++++++++++++++++++++++
+
+**L'indexation** des données consiste à **rassembler toutes les données** dans
+lesquelles l'utilisateur va **pouvoir rechercher**. Elle est faite au
+préalable. Celle-ci est faite de telle façon qu'on puisse rechercher dans les
+éléments suivants :
+
+ - Les contenus (articles, tutoriels et billets) ainsi que leurs chapitres (s'il
+ s'agit d'un moyen ou *big*-tuto) ;
+ - Les sujets ;
+ - Les réponses aux sujets.
+
+Cette indexation est réalisée à intervalle régulier (et de manière à n'indexer
+que les données qui ont changé).
+
+La recherche
+++++++++++++
+
+L'utilisateur peut lancer une recherche depuis la barre de recherche de
+`l'en-tête <../front-end/structure-du-site.html#l-en-tete>`_ ou depuis la page
+d'accueil, si elle est disponible.
+
+ .. figure:: ../images/design/en-tete.png
+ :align: center
+
+Des critères de recherche peuvent être ajoutés sur la page de recherche. Le
+seul critère de recherche disponible actuellement est le type de résultat
+(contenu, sujet du forum ou message du forum).
+
+ .. figure:: ../images/search/search-filters.png
+ :align: center
+
+Quelques mots sur Typesense
+-------------------------------
+
+`Typesense <https://typesense.org/>`_ est un moteur de recherche qui permet
+d’indexer et de rechercher des données. Typesense offre une interface de type
+REST pour interroger son index, mais nous utilisons plutôt le module Python
+dédié.
+
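+À titre d'illustration, voici une esquisse (hypothétique, mais reposant sur
+l'API documentée du client Python ``typesense`` et sur les paramètres de
+connexion par défaut présentés plus bas) :
+
+.. sourcecode:: python
+
+    import typesense
+
+    # Connexion au serveur Typesense local (mêmes valeurs que la variable
+    # SEARCH_CONNECTION décrite plus bas).
+    client = typesense.Client(
+        {
+            "nodes": [
+                {"host": "localhost", "port": "8108", "protocol": "http"}
+            ],
+            "api_key": "xyz",
+            "connection_timeout_seconds": 2,
+        }
+    )
+
+    # Recherche dans une collection « topic » (nom donné à titre d'exemple).
+    results = client.collections["topic"].documents.search(
+        {"q": "moteur de recherche", "query_by": "title"}
+    )
+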
+Phase d'indexation
+++++++++++++++++++
+
+Typesense organise les données sous forme de documents, regroupés dans des
+collections. On peut avoir différents types de collections (par exemple, pour
+Zeste de Savoir : *topics*, *posts*, contenus, chapitres, etc.).
+
+La phase d'indexation est réalisée à l'aide de la commande ``python manage.py
+search_engine_manager`` (voir ci-dessous).
+
+Phase de recherche
+++++++++++++++++++
+
+Durant la phase de recherche, les documents sont classés par ``text_match``,
+valeur qui représente le score de correspondance avec le texte recherché. Ce
+score dépend des champs que l'on souhaite indexer ; il est calculé selon
+plusieurs métriques :
+
++ *Fréquence* : elle correspond au nombre de fois qu’un terme apparaît dans un
+ document ;
++ *Distance d'édition* : si un terme de la requête n'est pas trouvé dans les
+ documents, Typesense recherchera des mots qui diffèrent de la requête d'un
+ certain nombre de caractères (``num_typos``) en ajoutant, supprimant ou
+ remplaçant des caractères ;
++ *Proximité* : si la requête est constituée de plusieurs termes et que ces
+  termes sont proches, alors le score sera plus élevé. Par exemple, si la
+  requête est "moteur de recherche", le titre *Typesense est un moteur de
+  recherche* aura un meilleur score que le titre *La recherche d'un nouveau
+  moteur thermique à pistons rotatifs* ;
++ *Ordre des champs* : si on a indiqué qu'on recherche selon les champs *titre*
+ et *description* (dans cet ordre), alors le score sera plus important si le
+ terme est trouvé dans le champ *titre* ;
++ *Pondération des champs* : si un document possède un champ *titre* et un
+  champ *description*, alors avec un poids supérieur pour le champ *titre*, le
+ score sera plus élevé si le terme est trouvé dans le titre.
+
+Les différents poids sont modifiables directement dans les paramètres de Zeste
+de Savoir (voir ci-dessous).
+
+Il est possible de rechercher dans plusieurs collections en une seule requête,
+avec un mécanisme que Typesense appelle le `Federated Multi-Search
+`_.
+
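+Voici une esquisse (hypothétique, les noms de collections et de champs étant
+donnés à titre d'exemple) d'une telle requête avec le client Python ; elle
+illustre aussi la pondération des champs via ``query_by_weights`` :
+
+.. sourcecode:: python
+
+    # Une seule requête HTTP pour interroger deux collections à la fois.
+    results = client.multi_search.perform(
+        {
+            "searches": [
+                {
+                    "collection": "topic",
+                    "q": "api rest",
+                    "query_by": "title,subtitle",
+                    "query_by_weights": "6,4",
+                },
+                {"collection": "post", "q": "api rest", "query_by": "text"},
+            ]
+        },
+        {},
+    )
+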
+En pratique
+===========
+
+Configuration
+-------------
+
+La configuration de la connexion se fait dans le fichier
+``settings/abstract_base/zds.py``, à l'aide des deux variables suivantes :
+
+.. sourcecode:: python
+
+ SEARCH_ENABLED = True
+
+ SEARCH_CONNECTION = {
+ "nodes": [
+ {
+ "host": "localhost",
+ "port": "8108",
+ "protocol": "http",
+ }
+ ],
+ "api_key": "xyz",
+ "connection_timeout_seconds": 2,
+ }
+
+
+La première active la recherche ; la seconde configure la connexion au moteur
+de recherche.
+
+Pour indiquer les poids associés à chacune des collections, il faut modifier
+les variables suivantes dans ``settings/abstract_base/zds.py`` :
+
+.. sourcecode:: python
+
+    global_weight_publishedcontent = 3 # contenus publiés (billets, tutoriels, articles)
+ global_weight_topic = 2 # sujets de forum
+ global_weight_chapter = 1.5 # chapitres
+ global_weight_post = 1 # messages d'un sujet de forum
+
+
+Il est possible de modifier les différents paramètres de la recherche dans
+``settings/abstract_base/zds.py`` :
+
+.. sourcecode:: python
+
+ "search": {
+ "mark_keywords": ["javafx", "haskell", "groovy", "powershell", "latex", "linux", "windows"],
+ "results_per_page": 20,
+ "search_groups": {
+ "publishedcontent": (_("Contenus publiés"), ["publishedcontent", "chapter"]),
+ "topic": (_("Sujets du forum"), ["topic"]),
+ "post": (_("Messages du forum"), ["post"]),
+ },
+ "search_content_type": {
+ "tutorial": (_("Tutoriels"), ["tutorial"]),
+ "article": (_("Articles"), ["article"]),
+ "opinion": (_("Billet"), ["opinion"]),
+ },
+ "search_validated_content": {
+ "validated": (_("Contenus validés"), ["validated"]),
+ "no_validated": (_("Contenus libres"), ["no_validated"]),
+ },
+ "boosts": {
+ "publishedcontent": {
+ "global": global_weight_publishedcontent,
+ "if_article": 2.0, # s'il s'agit d'un article
+ "if_tutorial": 2.0, # s'il s'agit d'un tuto
+ "if_medium_or_big_tutorial": 2.5, # s'il s'agit d'un tuto d'une taille plutôt importante
+ "if_opinion": 1.66, # s'il s'agit d'un billet
+ "if_opinion_not_picked": 1.5, # s'il s'agit d'un billet pas mis en avant
+
+ # poids des différents champs :
+ "title": global_weight_publishedcontent * 3,
+ "description": global_weight_publishedcontent * 2,
+ "categories": global_weight_publishedcontent * 1,
+ "subcategories": global_weight_publishedcontent * 1,
+ "tags": global_weight_publishedcontent * 1,
+ "text": global_weight_publishedcontent * 2,
+ },
+ "topic": {
+ "global": global_weight_topic,
+ "if_solved": 1.1, # s'il s'agit d'un sujet résolu
+ "if_sticky": 1.2, # s'il s'agit d'un sujet épinglé
+ "if_locked": 0.1, # s'il s'agit d'un sujet fermé
+
+ # poids des différents champs :
+ "title": global_weight_topic * 3,
+ "subtitle": global_weight_topic * 2,
+ "tags": global_weight_topic * 1,
+ },
+ "chapter": {
+ "global": global_weight_chapter,
+
+ # poids des différents champs :
+ "title": global_weight_chapter * 3,
+ "text": global_weight_chapter * 2,
+ },
+ "post": {
+ "global": global_weight_post,
+ "if_first": 1.2, # s'il s'agit d'un message en première position
+ "if_useful": 1.5, # s'il s'agit d'un message jugé utile
+ "ld_ratio_above_1": 1.05, # si le ratio pouce vert/rouge est supérieur à 1
+ "ld_ratio_below_1": 0.95, # si le ratio pouce vert/rouge est inférieur à 1
+ "text_html": global_weight_post, # poids du champ
+ },
+ },
+
+
++ ``results_per_page`` est le nombre de résultats affichés,
++ ``search_groups`` définit les différents types de documents indexés et la
+ manière dont ils sont groupés sur le formulaire de recherche,
++ ``search_content_type`` définit les différents types de contenus publiés et
+ la manière dont ils sont groupés sur le formulaire de recherche,
++ ``search_validated_content`` définit les différentes validations des contenus
+ publiés et la manière dont elles sont groupées sur le formulaire de recherche,
++ ``boosts`` contient les différents facteurs de *boost* appliqués aux
+ différentes situations. Modifier ces valeurs permet de changer l'ordre des
+  résultats retournés lors d'une recherche.
+
+
+Indexer les données
+-------------------
+
+Une fois Typesense `installé <../install/extra-install-search-engine.html>`_, configuré et lancé, la commande suivante est utilisée :
+
+.. sourcecode:: bash
+
+    python manage.py search_engine_manager <action>
+
+où ``<action>`` peut être :
+
++ ``setup`` : crée et configure le *client* Typesense (y compris la création des
+  *collections* avec leurs *schémas*) ;
++ ``clear`` : supprime toutes les *collections* du *client* Typesense et marque
+ toutes les données comme "à indexer" ;
++ ``index_flagged`` : indexe les données marquées comme "à indexer" ;
++ ``index_all`` : invoque ``setup`` puis indexe toutes les données (qu'elles
+ soient marquées comme "à indexer" ou non).
+
+
+La commande ``index_flagged`` peut donc être lancée de manière régulière afin
+d'indexer les nouvelles données ou les données modifiées.
+
+.. note::
+
+ Le caractère "à indexer" est fonction des actions effectuées sur l'objet
+ Django (par défaut, à chaque fois que la méthode ``save()`` du modèle est
+ appelée, l'objet est marqué comme "à indexer").
+   Cette information est stockée dans la base de données MySQL.
+
+Aspects techniques
+==================
+
+Indexation d'un modèle
+----------------------
+
+
+Afin d'être indexable, un modèle Django doit dériver de
+``AbstractSearchIndexableModel`` (qui dérive de ``models.Model`` et de
+``AbstractSearchIndexable``). Par exemple :
+
+.. sourcecode:: python
+
+ class Post(Comment, AbstractSearchIndexableModel):
+ # ...
+
+
+.. note::
+
+ Le code est écrit de manière à ce que l'id utilisé par Typesense (champ
+ ``id``) corresponde à la *pk* du modèle (via la variable
+   ``search_engine_id``). De cette façon, si l'on connaît la *pk* d'un objet
+   Django, il est possible de récupérer l'objet Typesense correspondant à
+   l'aide de ``GET /collections/<collection>/documents/<pk>``.
+
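+Par exemple, avec le client Python (esquisse ; le nom de la collection est
+hypothétique) :
+
+.. sourcecode:: python
+
+    # Récupère le document Typesense correspondant au Post de pk 42.
+    document = client.collections["post"].documents["42"].retrieve()
+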
+Différentes méthodes de la classe ``AbstractSearchIndexableModel`` peuvent ou
+doivent ensuite être surchargées :
+
++ ``get_search_document_schema()`` permet de définir le *schéma* d'un document,
+  c'est-à-dire quels champs seront indexés avec quels types. Par exemple :
+
+ .. sourcecode:: python
+
+ @classmethod
+ def get_search_document_schema(cls):
+ search_engine_schema = super().get_search_document_schema()
+
+ search_engine_schema["fields"] = [
+ {"name": "topic_pk", "type": "int64"},
+ {"name": "forum_pk", "type": "int64"},
+ {"name": "topic_title", "type": "string", "facet": True},
+ # ...
+
+ Les schémas Typesense sont des `dictionnaires
+ `_.
+ On indique également dans les schémas un poids de recherche qui est
+   calculé selon différents critères ; ce champ correspond au *boost* que reçoit
+ le contenu lors de la phase de recherche.
+
++ ``get_indexable_objects`` permet de définir quels objets doivent être
+ récupérés et indexés. Cette fonction permet également d'utiliser
+ ``prefetch_related()`` ou ``select_related()`` pour minimiser le nombre de
+ requêtes SQL. Par exemple :
+
+ .. sourcecode:: python
+
+ @classmethod
+ def get_indexable_objects(cls, force_reindexing=False):
+ q = super(Post, cls).get_indexable_objects(force_reindexing)\
+ .prefetch_related('topic')\
+              .prefetch_related('topic__forum')
+
+          return q
+
+ où ``q`` est un *queryset* Django.
+
++ ``get_document_source()`` permet de gérer des cas où le champ n'est pas
+ directement une propriété de la classe, ou si cette propriété ne peut pas
+ être indexée directement :
+
+ .. sourcecode:: python
+
+ def get_document_source(self, excluded_fields=None):
+ excluded_fields = excluded_fields or []
+ excluded_fields.extend(["tags", "forum_pk", "forum_title", "forum_get_absolute_url", "pubdate", "weight"])
+
+ data = super().get_document_source(excluded_fields=excluded_fields)
+ data["tags"] = [tag.title for tag in self.tags.all()]
+ data["forum_pk"] = self.forum.pk
+ data["forum_title"] = self.forum.title
+ data["forum_get_absolute_url"] = self.forum.get_absolute_url()
+ data["pubdate"] = date_to_timestamp_int(self.pubdate)
+ data["text"] = clean_html(self.text_html)
+ data["weight"] = self._compute_search_weight()
+
+ return data
+
+ Dans cet exemple (issu de la classe ``Post``), on voit que certains
+ champs ne peuvent être directement indexés car ils appartiennent au
+   *topic* et au *forum* parent. Ils sont donc exclus du mécanisme par défaut
+ (via la variable ``excluded_fields``) et leur valeur est récupérée et
+ définie dans la suite de la méthode.
+
+   Cet exemple permet également de remarquer que le contenu indexé ne
+ contient jamais de balises HTML (c'est le rôle de la fonction
+ ``clean_html()``). Il est ainsi possible d'afficher de façon sûre le
+ contenu renvoyé par Typesense (utile en particulier pour afficher les
+   balises ``<mark>`` pour surligner les termes recherchés).
+
+
+Finalement, il est important **pour chaque type de document** d'attraper le
+signal de pré-suppression en base de données, afin que le document soit
+également supprimé du moteur de recherche.
+
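+Une esquisse de ce mécanisme (les noms du récepteur et de la fonction de
+suppression sont hypothétiques, le principe étant celui de l'ancienne
+implémentation) :
+
+.. sourcecode:: python
+
+    from django.db.models.signals import pre_delete
+    from django.dispatch import receiver
+
+    # À définir pour chaque modèle indexé, ici Post :
+    @receiver(pre_delete, sender=Post)
+    def delete_post_in_search_engine(sender, instance, **kwargs):
+        # Supprime le document correspondant dans Typesense (fonction
+        # utilitaire hypothétique).
+        delete_document_in_search_engine(instance)
+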
+Plus d'informations sur les méthodes qui peuvent être surchargées sont
+disponibles `dans la documentation technique
+<../back-end-code/search.html>`_.
+
+.. attention::
+
+   À chaque fois que vous modifiez le schéma d'une collection dans
+   ``get_search_document_schema()``, toutes les données doivent
+ être réindexées.
+
+Le cas particulier des contenus
+-------------------------------
+
+La plupart des informations des contenus, en particulier les textes, `ne sont
+pas stockées dans la base de données
+`_.
+
+Il a été choisi de n'inclure dans le moteur de recherche que les chapitres de
+ces contenus (anciennement, les introductions et conclusions des parties
+étaient également incluses). Ce sont les contenus HTML qui sont indexés et non
+leur version écrite en Markdown, afin de rester cohérent avec ce qui se fait
+pour les *posts*. Les avantages de cette décision sont multiples :
+
++ Le *parsing* est déjà effectué et n'a pas à être refait durant l'indexation ;
++ Moins de fichiers à lire (pour rappel, les différentes parties d'un contenu
+ `sont rassemblées en un seul fichier
+ `_ à la publication) ;
++ Pas besoin d'utiliser Git durant le processus d'indexation.
+
+
+L'indexation des chapitres (représentés par la classe ``FakeChapter``, `voir
+ici
+<../back-end-code/tutorialv2.html#zds.tutorialv2.models.database.FakeChapter>`_)
+est effectuée en même temps que l'indexation des contenus publiés
+(``PublishedContent``). En particulier, c'est la méthode ``get_indexable()``
+qui est surchargée, profitant du fait que cette méthode peut renvoyer n'importe
+quel type de document à indexer.
+
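+À titre indicatif, le principe ressemble à l'esquisse suivante (simplifiée et
+hypothétique, adaptée de l'ancienne implémentation) :
+
+.. sourcecode:: python
+
+    @classmethod
+    def get_indexable(cls, force_reindexing=False):
+        # En plus des PublishedContent, on produit un document par chapitre
+        # pour les moyens et big-tutos.
+        for content in super().get_indexable(force_reindexing):
+            versioned = content.load_public_version()
+            if versioned.has_sub_containers():
+                for chapter in versioned.get_list_of_chapters():
+                    yield FakeChapter(chapter, versioned, content.search_engine_id)
+            yield content
+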
+Le code tient aussi compte du fait que la classe ``PublishedContent`` `gère le
+changement de slug `_ afin de
+maintenir le SEO. Ainsi, la méthode ``save()`` est modifiée de manière à
+supprimer toute référence à elle-même et aux chapitres correspondants si un
+objet correspondant au même contenu mais avec un nouveau slug est créé.
diff --git a/doc/source/back-end/searchv2.rst b/doc/source/back-end/searchv2.rst
deleted file mode 100644
index 4585581806..0000000000
--- a/doc/source/back-end/searchv2.rst
+++ /dev/null
@@ -1,362 +0,0 @@
-============
-La recherche
-============
-
-Principe
-========
-
-Comment faire une recherche ?
------------------------------
-
-La recherche se découpe en deux parties distinctes :
-
- - L'indexation des données
- - La recherche par l'utilisateur
-
-L'indexation des données
-++++++++++++++++++++++++
-
-**L'indexation** des données consiste à **rassembler toutes les données** dans lesquelles l'utilisateur va **pouvoir rechercher**. Elle est faite au préalable.
-Celle-ci est faite de telle façon qu'on puisse rechercher dans les éléments suivants :
-
- - Les contenus (article et tutoriels) ainsi que leurs chapitres (s'il s'agit d'un moyen ou *big*-tuto);
- - Les sujets ;
- - Les réponses aux sujets.
-
-Cette indexation est réalisée à intervalle régulier (et de manière à n'indexer que les données qui ont changées).
-
-La recherche
-++++++++++++
-
-L'utilisateur peut utiliser la recherche, en utilisant la recherche de `l'en-tête <../front-end/structure-du-site.html#l-en-tete>`_, ou par la page d'accueil, si elle est disponible.
-
- .. figure:: ../images/design/en-tete.png
- :align: center
-
-Des critères de recherche peuvent être ajoutés sur la page de recherche.
-Le seul critère de recherche disponible actuellement est le type de résultat (contenu, sujet du forum ou message du forum).
-
- .. figure:: ../images/search/search-filters.png
- :align: center
-
-Quelques mots sur Elasticsearch
--------------------------------
-
-`Elasticsearch `_ (ES) est un serveur utilisant `Lucene `_ (bibliothèque d'indexation et de recherche de texte) et permet d'indexer et de rechercher des données.
-Il est possible de l'interroger à travers une interface de type REST à laquelle on communique via des requêtes écrites en JSON.
-Ce projet propose également des API `bas `_ et `plus haut `_ niveau en python pour interagir avec le serveur, maintenues par l'équipe d'Elasticsearch.
-
-Précédemment, ZdS utilisait `Haystack `_ pour communiquer avec `Solr `_ (équivalent à Elasticsearch) mais ces solutions ont été abandonnées par manque d'activité sur le dépôt de Haystack.
-
-Phase d'indexation
-++++++++++++++++++
-
-ES classe ses données sous forme de documents, rassemblés dans un *index*. On peut avoir différent types de documents (*topics*, *posts*, contenus, chapitres dans ce cas-ci).
-
-Lorsque les documents sont indexés, ils sont analysés afin d'en extraire les termes importants et de les simplifier (par défaut, "table" et "tables" ne sont pas le même mot, mais il est possible de faire en sorte que si).
-Ce processus est effectué par l'*analyzer*, découpé en trois étapes:
-
-.. sourcecode:: none
-
- Entrée > character filter > tokenizer > token filter > sortie
-
-On retrouve:
-
-+ *character filter*: tâche de nettoyage basique, telle qu'enlever les tags HTML. Il y en a `trois `_ qui sont disponibles par défaut ;
-+ *tokenizer*: découpe le texte en différents *tokens*. `Énormément `_ de *tokenizer* sont disponibles.
-+ *token filter*: altère la liste de *tokens* obtenue pour les "normaliser" en modifiant, supprimant ou rajoutant des *tokens*. Typiquement: enlever certains mots (par exemple les *stopwords* "le", "la", "les" et autres en français), convertir le tout en minuscule, et ainsi de suite. Il en existe également `une pléthore `_.
-
-Ces différents filtres permettent d'éliminer le superflu afin de se concentrer sur l'essentiel : les *tokens* obtenus.
-Par la suite, ES construit une table (un *index inversé*) reliant ces *tokens* aux documents qui les contiennent, qu'il utilise pour la recherche.
-
-Sans entrer dans les détails, l'*analyzer* utilisé par ES pour ZdS :
-
-+ Enlève les tags HTML (en pratique, l'indexation du texte se fait systématiquement sur le contenu converti en HTML et non sur le texte en *markdown*) ;
-+ N'utilise par le *tokenizer* par défaut (découper en *token* après tout caractère non alpha-numérique, en gros) afin de conserver "c++" intact, par exemple ;
-+ Utilise une série de *token filter*s orientés pour comprendre le français, parmi lesquels un *stopper* (pour enlever les prépositions, déterminants, ...) et un *stemmer* (qui se charge, à partir d'un mot, d'en extraire la racine. Par exemple "programmation", "programmer" ou "programmes" seront tout les trois interprétés et indexés de la même manière car ils partagent la même racine).
-
-Les différents *tokens* qui resortent de cette phase d'analyse sont alors indexés, et c'est de ces *tokens* dont ES se servira ensuite pour la recherche, plutôt que de réaliser des recherches *full-text*.
-
-La phase d'indexation est réalisée à l'aide de la commande ``python manage.py es_manager`` (voir ci-dessous).
-
-Phase de recherche
-++++++++++++++++++
-
-Durant la phase de recherche, les documents sont classés par **score**, valeur que ES calcule comme étant le produit ``TF * IDF``, où la partie TF (*term frequencies*) est le nombre de fois qu'un terme apparait dans un document et IDF (*inverse document frequencies*) est la fréquence à laquelle ce terme apparait dans l'ensemble des documents indexés.
-
-C'est en fonction de ce score que seront ensuite classés les résultats, du plus important au plus faible.
-Il est possible de manipuler ce score afin d'obtenir les résultats les plus pertinents possible :
-
-+ *Booster* le champ (à priori) : si le terme recherché est contenu dans un champ donné (par exemple le titre, ou une note de bas de page), le score est multiplié par le facteur de *boost* du champ.
-+ *Booster* le score (à postériori): si le document obtenu possède d'autres propriétés (par exemple, *booster* le score si le *post* trouvé à "aidé l'auteur du sujet").
-+ *Booster* un type de document par rapport à un autre : cas particulier du précédent.
-
-Ces facteurs de *boost* sont modifiables soit directement dans le code de ZdS pour ce qui concerne les facteurs de *boost* sur les champs (voir ci-dessous), soit dans le ``settings.py`` en ce qui concerne les *boosts* à postériori (voir ci-dessous).
-
-
-En pratique
-===========
-
-Configuration
--------------
-
-La configuration de la connexion et de l'*index* se fait dans le ``settings.py``, à l'aide des trois variables suivantes :
-
-.. sourcecode:: python
-
- ES_ENABLED = True
-
- ES_CONNECTIONS = {
- 'default': {
- 'hosts': ['localhost:9200'],
- }
- }
-
- ES_SEARCH_INDEX = {
- 'name': 'zds_search',
- 'shards': 5,
- 'replicas': 0,
- }
-
-
-La première active Elasticsearch pour ZdS.
-La seconde permet de configurer la connexion à Elasticsearch. ``default`` est l'*alias* de la connexion, au cas où il serait nécessaire d'utiliser plusieurs *clusters*.
-La troisième est la configuration de l'*index* avec son nom, son nombre de *shards* et de *replicas*.
-
-Pour modifier les différents paramètres d'une recherche, c'est cette fois dans la variable ``ZDS_APP`` que ça se passe:
-
-.. sourcecode:: python
-
- 'search': {
- 'mark_keywords': ['javafx', 'haskell', 'groovy', 'powershell', 'latex', 'linux', 'windows'],
- 'results_per_page': 20,
- 'search_groups': {
- 'content': (
- _(u'Contenus publiés'), ['publishedcontent', 'chapter']
- ),
- 'topic': (
- _(u'Sujets du forum'), ['topic']
- ),
- 'post': (
- _(u'Messages du forum'), ['post']
- ),
- },
- 'boosts': {
- 'publishedcontent': {
- 'global': 3.0,
- 'if_article': 1.0, # s'il s'agit d'un article
- 'if_tutorial': 1.0, # … d'un tuto
- },
- 'topic': {
- 'global': 2.0,
- 'if_solved': 1.1, # si le sujet est résolu
- 'if_sticky': 1.2, # si le sujet est en post-it
- 'if_locked': 0.1, # si le sujet est fermé
- },
- 'chapter': {
- 'global': 1.5,
- },
- 'post': {
- 'global': 1.0,
- 'if_first': 1.2, # si le post est le premier du topic
- 'if_useful': 1.5, # si le post a été marqué comme étant utile
- 'ld_ratio_above_1': 1.05, # si le ratio pouce vert/rouge est supérieur à 1
- 'ld_ratio_below_1': 0.95, # ... inférieur à 1.
- }
- }
- }
-
-où ``'mark_keywords'`` liste les mots qui ne doivent pas être découpés par le *stemmer* (souvent des noms propres),
-``'results_per_page'`` est le nombre de résultats affichés,
-``'search_groups'`` définit les différents types de documents indexé et la manière dont il sont groupés quand recherchés (sur le formulaire de recherche),
-et ``'boosts'`` les différents facteurs de *boost* appliqués aux différentes situations.
-
-Puisque la phase de *stemming* advient à la fin de l'analyse, tous les mots listés dans ``'mark_keywords'`` doivent être en minuscule et sans éventuels déterminants.
-
-Dans ``'boosts'``, on peut ensuite modifier le comportement de la recherche en choisissant différents facteurs de *boost*.
-Chacune des valeurs multiplie le score (donc l'agrandit si elle est supérieure à 1 et le diminue si elle est inférieure à 1).
-Un *boost global* (dans chacune des variables ``'global'``) est tout d'abord présent et permet de mettre en avant un type de document par rapport à un autre.
-Ensuite, différentes situations peuvent modifier le score.
-
-.. note::
-
- Ces valeurs sont données à titre indicatif et doivent être adaptées à la situation.
-
-.. attention::
-
- Pour que les changements dans ``'mark_keywords'`` soient pris en compte, il est nécessaire de réindexer **tout** le contenu
- (grâce à ``python manage.py es_manager index_all``).
-
-Indexer les données de ZdS
---------------------------
-
-Une fois Elasticsearch `installé <../install/install-es.html>`_ puis configuré et lancé, la commande suivante est utilisée :
-
-.. sourcecode:: bash
-
- python manage.py es_manager
-
-où ```` peut être
-
-+ ``setup`` : crée et configure l'*index* (y compris le *mapping* et l'*analyzer*) dans le *cluster* d'ES ;
-+ ``clear`` : supprime l'*index* du *cluster* d'ES et marque toutes les données comme "à indexer" ;
-+ ``index_flagged`` : indexe les données marquées comme "à indexer" ;
-+ ``index_all`` : invoque ``setup`` puis indexe toute les données (qu'elles soient marquées comme "à indexer" ou non).
-
-
-La commande ``index_flagged`` peut donc être lancée de manière régulière (via un *cron* ou un timer *systemd*) afin d'indexer les nouvelles données ou les données modifiées de manière régulière.
-
-.. note::
-
- Le caractère "à indexer" est fonction des actions effectuées sur l'objet Django (par défaut, à chaque fois que la méthode ``save()`` du modèle est appelée, l'objet est marqué comme "à indexer").
- Cette information est stockée dans la base de donnée MySQL.
-
-Aspects techniques
-==================
-
-Indexation d'un modèle
-----------------------
-
-
-Afin d'être indexable, un modèle Django doit dériver de ``AbstractESDjangoIndexable`` (qui dérive de ``models.Model`` et de ``AbstractESIndexable``). Par exemple,
-
-.. sourcecode:: python
-
- class Post(Comment, AbstractESDjangoIndexable):
- # ...
-
-
-.. note::
-
- Le code est écrit de telle manière à ce que l'id utilisé par ES (champ ``_id``) corresponde à la *pk* du modèle (via la variable ``es_id``).
- Il est donc facile de récupérer un objet dans ES si on en connait la *pk*, à l'aide de ``GET ///``.
-
-Différentes méthodes d'``AbstractESDjangoIndexable`` peuvent ou doivent ensuite être surchargées. Parmi ces dernières,
-
-+ ``get_es_mapping()`` permet de définir le *mapping* d'un document, c'est à dire quels champs seront indexés avec quels types. Par exemple,
-
- .. sourcecode:: python
-
- @classmethod
- def get_es_mapping(cls):
- es_mapping = super(Post, cls).get_es_mapping()
-
- es_mapping.field('text_html', Text())
- es_mapping.field('is_useful', Boolean())
- es_mapping.field('position', Integer())
- # ...
-
- ``Mapping`` est un type de donnée défini par ``elasticsearch_dsl`` (voir à ce sujet `la documentation `_). Si le champ a le même nom qu'une propriété de votre classe, sa valeur sera automatiquement récupérée et indexée. À noter que vous pouvez également marquer une variable comme "à ne pas analyser" avec la variable ``index`` (par exemple, ``Text(index='not_analyzed')``) si vous voulez simplement stocker cette valeur mais ne pas l'utiliser pour effectuer une recherche dessus. On peut également indiquer la valeur du facteur de *boost* avec ``boost`` (par exemple, ``Text(boost=2.0)``).
-
- .. note::
-
- Elasticsearch requiert que deux champs portant le même nom dans le même *index* (même si ils sont issus de types de document différents) aient le même *mapping*.
- Ainsi, tous les champs ``title`` doivent être de type ``Text(boost=1.5)`` et ``tags`` de type ``Keyword(boost=2.0)``.
-
-+ ``get_es_django_indexable()`` permet de définir quels objets doivent être récupérés et indexés. Cette fonction permet également d'utiliser ``prefetch_related()`` ou ``select_related()`` pour éviter les requêtes inutiles. Par exemple,
-
- .. sourcecode:: python
-
- @classmethod
- def get_es_django_indexable(cls, force_reindexing=False):
- q = super(Post, cls).get_es_django_indexable(force_reindexing)\
- .prefetch_related('topic')\
- .prefetch_related('topic__forum')
-
- où ``q`` est un *queryset* Django.
-
-+ ``get_es_document_source()`` permet de gérer des cas où le champ n'est pas directement une propriété de la classe, ou si cette propriété ne peut pas être indexée directement :
-
- .. sourcecode:: python
-
- def get_es_document_source(self, excluded_fields=None):
- excluded_fields = excluded_fields or []
- excluded_fields.extend(
- ['topic_title', 'forum_title', 'forum_pk', 'forum_get_absolute_url'])
-
- data = super(Post, self).get_es_document_source(excluded_fields=excluded_fields)
-
- data['topic_title'] = self.topic.title
- data['forum_pk'] = self.topic.forum.pk
- data['forum_title'] = self.topic.forum.title
- data['forum_get_absolute_url'] = self.topic.forum.get_absolute_url()
-
- return data
-
- Dans cet exemple (issu de la classe ``Post``), on voit que certains champs ne peuvent être directement indexés car ils appartiennent au *topic* et au *forum* parent. Il sont donc exclus du mécanisme par défaut (via la variable ``excluded_fields``), leur valeur est récupérée et définie par après.
-
-
-Finalement, il est important **pour chaque type de document** d'attraper le signal de pré-suppression avec la fonction ``delete_document_in_elasticsearch()``, afin qu'un document supprimé par Django soit également supprimé de Elasticsearch.
-Cela s'effectue comme suit (par exemple pour la classe ``Post``):
-
-.. sourcecode:: python
-
- @receiver(pre_delete, sender=Post)
- def delete_post_in_elasticsearch(sender, instance, **kwargs):
- return delete_document_in_elasticsearch(instance)
-
-Plus d'informations sur les méthodes qui peuvent être surchargées sont disponibles `dans la documentation technique <../back-end-code/searchv2.html>`_.
-
-.. attention::
-
- À chaque fois que vous modifiez le *mapping* d'un document dans ``get_es_mapping()``, tout l'*index* **doit** être reconstruit **et** indexé.
- N'oubliez donc pas de mentionner cette action à lancer manuellement dans le *update.md*.
-
-Le cas particulier des contenus
--------------------------------
-
-La plupart des informations des contenus, en particulier les textes, `ne sont pas indexés dans la base de donnée `_.
-
-Il a été choisi de n'inclure dans Elasticsearch que les chapitres de ces contenus (anciennement, les introductions et conclusions des parties étaient également incluses).
-Ce sont les contenus HTML qui sont indexés et non leur version écrite en *markdown*, afin de rester cohérent avec ce qui se fait pour les *posts*.
-Les avantages de cette décision sont multiples :
-
-+ Le *parsing* est déjà effectué et n'a pas à être refait durant l'indexation ;
-+ Moins de fichiers à lire (pour rappel, les différentes parties d'un contenu `sont rassemblées en un seul fichier `_ à la publication) ;
-+ Pas besoin d'utiliser Git durant le processus d'indexation ;
-
-
-L'indexation des chapitres (représentés par la classe ``FakeChapter``, `voir ici <../back-end-code/tutorialv2.html#zds.tutorialv2.models.database.FakeChapter>`_) est effectuée en même temps que l'indexation des contenus publiés (``PublishedContent``).
-En particulier, c'est la méthode ``get_es_indexable()`` qui est surchargée, profitant du fait que cette méthode peut renvoyer n'importe quel type de document à indexer.
-
-.. sourcecode:: python
-
- @classmethod
- def get_es_indexable(cls, force_reindexing=False):
- """Overridden to also include
- """
-
- index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
- last_pk = 0
- objects_source = super(PublishedContent, cls).get_es_indexable(force_reindexing)
- objects = list(objects_source.filter(pk__gt=last_pk)[:PublishedContent.objects_per_batch])
- while objects:
- chapters = []
-
- for content in objects:
- versioned = content.load_public_version()
-
- if versioned.has_sub_containers(): # chapters are only indexed for middle and big tuto
-
- # delete possible previous chapters
- if content.es_already_indexed:
- index_manager.delete_by_query(
- FakeChapter.get_es_document_type(), ES_Q('match', _routing=content.es_id))
-
- # (re)index the new one(s)
- for chapter in versioned.get_list_of_chapters():
- chapters.append(FakeChapter(chapter, versioned, content.es_id))
- last_pk = objects[-1].pk
- objects = list(objects_source.filter(pk__gt=last_pk)[:PublishedContent.objects_per_batch])
- yield chapters
- yield objects
-
-
-
-Le code tient aussi compte du fait que la classe ``PublishedContent`` `gère le changement de slug `_ afin de maintenir le SEO.
-Ainsi, la méthode ``save()`` est modifiée de manière à supprimer toute référence à elle même et aux chapitres correspondants si un objet correspondant au même contenu mais avec un nouveau slug est créé.
-
-.. note::
-
- Dans ES, une relation de type parent-enfant (`cf. documentation `_) est définie entre les contenus et les chapitres correspondants.
- Cette relation est utilisée pour la suppression, mais il est possible de l'exploiter à d'autres fins.
diff --git a/doc/source/front-end/template-tags.rst b/doc/source/front-end/template-tags.rst
index e51fe356e5..9c9d1d7904 100644
--- a/doc/source/front-end/template-tags.rst
+++ b/doc/source/front-end/template-tags.rst
@@ -105,13 +105,6 @@ sera rendu :
…si le contenu de ``date_epoch`` était de ``122``.
-``from_elasticsearch_date``
----------------------------
-
-Par défaut, Elasticsearch stocke ces dates au format ``yyyy-MM-dd'T'HH:mm:ss.SSSZ``
-(il s'agit du format ``strict_date_time``, voir à ce sujet `la documentation d'Elasticsearch `_).
-Ce filtre transforme cette date en une date que les autres filtres de ce module peuvent exploiter.
-
Le module ``email_obfuscator``
==============================
@@ -572,21 +565,6 @@ Exemple :
{% endfor %}
-Le module ``elasticsearch``
-===========================
-
-``highlight``
-
-Permet de mettre en surbrillance les résultats d'une recherche.
-
-Exemple :
-
-.. sourcecode:: html+django
-
- {% if search_result.text %}
- {% highlight search_result "text" %}
- {% endif %}
-
Le module ``joinby``
====================
diff --git a/doc/source/guides/install-linux.rst b/doc/source/guides/install-linux.rst
index 0c2b71fe56..1daa087a97 100644
--- a/doc/source/guides/install-linux.rst
+++ b/doc/source/guides/install-linux.rst
@@ -77,7 +77,7 @@ Installation complète
L'installation de base suffit grandement pour lancer le site web et découvrir le projet, mais elle est incomplète. Deux outils manquent à l'appel :
- LaTeX, nécessaire pour la génération des PDFs des contenus ;
-- ElasticSearch, nécessaire pour la page de recherche.
+- Typesense, nécessaire pour la page de recherche.
Là encore, une seule commande suffit :
diff --git a/doc/source/images/search/no-connection.png b/doc/source/images/search/no-connection.png
index 050d8c9005..97913d91a1 100644
Binary files a/doc/source/images/search/no-connection.png and b/doc/source/images/search/no-connection.png differ
diff --git a/doc/source/install/extra-install-es.rst b/doc/source/install/extra-install-es.rst
deleted file mode 100644
index dc4db866a5..0000000000
--- a/doc/source/install/extra-install-es.rst
+++ /dev/null
@@ -1,163 +0,0 @@
-=================================================
-Installation de Elasticsearch (pour la recherche)
-=================================================
-
-Zeste de Savoir utilise **Elasticsearch 5**, un moteur de recherche très performant.
-Installer Elasticsearch est nécessaire pour faire fonctionner la recherche.
-
-
-Installation
-============
-
-.. attention::
-
- Par défaut, Elasticsearch requiert au moins 2 Gio de mémoire pour démarrer.
-
- Si vous ne souhaitez pas utiliser autant de mémoire, modifiez le fichier ``config/jvm.option``, en particuliez les options ``-Xms`` et ``-Xmx``.
- Par exemple, vous pouvez passer la valeur de ces variables à 512 Mio grâce à:
-
- .. sourcecode:: none
-
- -Xms512m
- -Xmx512m
-
- Plus d'informations sont disponibles `dans la documentation officielle `_.
-
-Sous Linux
-----------
-
-Installer java 8
-++++++++++++++++
-
-Il est nécessaire d'utiliser au moins **la version 8** de Java pour faire tourner Elasticsearch, mais ce n'est probablement pas la version par défaut de votre système d'exploitation.
-
-**Sous Debian et dérivés**, le package à installer est ``openjdk-8-jdk`` :
-
-+ Sous Ubuntu (et dérivés), s'il n'est pas disponible pour votre système, ajoutez le PPA suivant : ``add-apt-repository ppa:openjdk-r/ppa`` (**en root**).
-+ Sous Debian, il est disponible dans le dépôt ``jessie-backports`` (ajoutez ``deb http://ftp.fr.debian.org/debian jessie-backports main`` dans ``/etc/apt/sources.list``).
-
-Une fois installé, passez sur la version 8 de java à l'aide de ``update-alternatives --config java`` (**en root**).
-
-**Sous Fedora et dérivés** (CentOS, OpenSuse, ...), le paquet à installer est ``java-1.8.0-openjdk``.
-Passez ensuite à la version 8 de java à l'aide de la commande ``alternatives --config java`` (**en root**).
-
-Installer Elasticsearch
-+++++++++++++++++++++++
-
-La procédure d'installation, si vous souhaitez utiliser Elasticsearch sans l'installer via le gestionnaire de paquets de votre distribution, est d'entrer les commandes suivantes dans votre *shell* préféré. Remplacez ``X.Y.Z`` par la version spécifiée dans ``requirements.txt`` !
-
-.. sourcecode:: bash
-
- wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-X.Y.Z.zip
- unzip elasticsearch-X.Y.Z.zip
- cd elasticsearch-X.Y.Z/
-
-Pour démarrer Elasticsearch, utilisez
-
-.. sourcecode:: bash
-
- ./bin/elasticsearch
-
-Vous pouvez arrêter Elasticsearch grâce à CTRL+C.
-
-.. note::
-
- Vous pouvez également installer Elasticsearch comme *daemon* de votre système.
- Rendez-vous `sur la page d'installation d'Elasticsearch `_ pour plus d'informations
-
-Sous macOS
-----------
-
-Utilisez les commandes suivantes pour installer Java 8 et Elasticsearch:
-
-.. sourcecode:: bash
-
- brew update
- brew cask install java
- brew install elasticsearch
-
-
-Pour démarrer Elasticsearch, utilisez la commande suivante:
-
-.. sourcecode:: bash
-
- elasticsearch --config=/usr/local/opt/elasticsearch/config/elasticsearch.yml
-
-.. note::
-
- Vous pouvez également le démarrer comme *daemon*, comme sous Linux.
- Plus d'infos `ici `_.
-
-Sous Windows
-------------
-
-Elasticsearch requiert **la version 8** de Java, que vous pouvez trouver `sur la page officielle de java `_. Prenez la version correspondante à votre système d'exploitation.
-
-Pour télécharger Elasticsearch :
-
-* rendez-vous à l'adresse suivante : `https://www.elastic.co/fr/downloads/past-releases#elasticsearch `_ ;
-* choisissez la version spécifiée dans le fichier `requirements.txt` (désignée ci-après par ``X.Y.Z``);
-* téléchargez la version Windows ;
-* extrayez le dossier ``elasticsearch-X.Y.Z`` du zip à l'aide de votre outil préféré.
-
-Pour démarer Elasticsearch, ouvrez un *shell* (ou un *powershell*) et rendez-vous dans le dossier ``elasticsearch-X.Y.Z``.
-Exécutez ensuite la commande suivante :
-
-.. sourcecode:: bash
-
- bin\elasticsearch
-
-
-Vous pouvez arrêter Elasticsearch grâce à CTRL+C, puis en répondant "o" lorsqu'il vous est demandé ``Terminer le programme de commandes (O/N) ?``.
-
-.. note::
-
- Vous pouvez également le démarrer comme *daemon*, comme sous Linux.
- Plus d'informations `dans la documentation `_.
-
-Indexation et recherche
-=======================
-
-Pour tester que tout fonctionne, quand Elasticsearch est démarré, rendez-vous sur la page `http://localhost:9200/ `_.
-Vous devriez observer une réponse du même genre que celle-ci :
-
-.. sourcecode:: none
-
- {
- "name" : "p0bcxqN",
- "cluster_name" : "elasticsearch",
- "cluster_uuid" : "649S5bMUQOyRzYmQFVPA1A",
- "version" : {
- "number" : "X.Y.Z",
- "build_hash" : "19c13d0",
- "build_date" : "2017-07-18T20:44:24.823Z",
- "build_snapshot" : false,
- "lucene_version" : "6.6.0"
- },
- "tagline" : "You Know, for Search"
- }
-
-
-Si ce n'est pas le cas, vérifiez que vous avez démarré Elasticsearch.
-
-Si c'est le cas, vous pouvez indexer les données à l'aide de la commande ``es_manager``, comme suit :
-
-.. sourcecode:: bash
-
- python manage.py es_manager index_all
-
-Une fois que c'est fait, en vous rendant sur la page de recherche, `http://localhost:8000/rechercher/ `_, vous devriez être capable d'utiliser la recherche.
-En particulier, vous ne devriez pas observer de message d'erreur :
-
-.. figure:: ../images/search/no-connection.png
- :align: center
-
- Si Elasticsearch n'est pas démarré, le message suivant apparait.
-
-Pour réindexer les nouvelles données, utilisez la commande suivante :
-
-.. sourcecode:: bash
-
- python manage.py es_manager index_flagged
-
-Plus d'informations sur la commande ``es_manager`` sont disponibles sur la page `concernant la recherche sur ZdS <../back-end/searchv2.html#indexer-les-donnees-de-zds>`_.
diff --git a/doc/source/install/extra-install-search-engine.rst b/doc/source/install/extra-install-search-engine.rst
new file mode 100644
index 0000000000..1c5205b65c
--- /dev/null
+++ b/doc/source/install/extra-install-search-engine.rst
@@ -0,0 +1,86 @@
+===================================
+Installation du moteur de recherche
+===================================
+
+Zeste de Savoir utilise **Typesense** comme moteur de recherche. L'installer
+est nécessaire pour faire fonctionner la recherche.
+
+
+Installation
+============
+
+La version de Typesense utilisée par ZdS est définie par la variable
+``$ZDS_TYPESENSE_VERSION`` dans ``scripts/define_variable.sh``.
+
+Il est possible d'installer Typesense de plusieurs façons, comme indiqué dans
+`la documentation officielle
+<https://typesense.org/docs/guide/install-typesense.html>`_ (*Option 2: Local
+Machine / Self-Hosting*).
+
+Depuis le script d'installation de ZdS
+--------------------------------------
+
+Cette méthode fonctionne uniquement sur un système Linux avec un processeur
+d'architecture amd64.
+
+Exécutez :
+
+.. sourcecode:: bash
+
+ ./scripts/install_zds.sh +typesense-local
+
+Référez-vous à `la documentation correspondante
+<./install-linux.html#composant-typesense-local>`_ pour savoir ce que fait
+cette commande.
+
+Il faut ensuite lancer Typesense avec la commande suivante :
+
+.. sourcecode:: bash
+
+ make run-search-engine
+
+Avec Docker
+-----------
+
+Cette méthode a l'avantage de fonctionner sur n'importe quel système qui dispose de Docker :
+
+.. sourcecode:: bash
+
+ docker run -p 8108:8108 typesense/typesense:$ZDS_TYPESENSE_VERSION --api-key=xyz --data-dir=/tmp
+
+Vérifier le bon lancement de Typesense
+======================================
+
+Pour tester que tout fonctionne, quand Typesense est démarré, rendez-vous sur
+la page `http://localhost:8108/health <http://localhost:8108/health>`_. Vous
+devriez observer une réponse du même genre que celle-ci :
+
+.. sourcecode:: json
+
+ {"ok":true}
+
+Si ce n'est pas le cas, vérifiez que Typesense est correctement démarré.
+
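+Il est aussi possible de faire cette vérification depuis Python (esquisse
+utilisant la bibliothèque ``requests``) :
+
+.. sourcecode:: python
+
+    import requests
+
+    # Doit afficher {'ok': True} si Typesense est démarré.
+    print(requests.get("http://localhost:8108/health").json())
+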
+Indexation et recherche
+=======================
+
+Une fois que Typesense est installé et démarré, vous pouvez indexer les données
+à l'aide de la commande ``search_engine_manager``, comme suit :
+
+.. sourcecode:: bash
+
+ python manage.py search_engine_manager index_all
+
+Une fois que c'est fait, en vous rendant sur la page de recherche de Zeste de
+Savoir, `http://localhost:8000/rechercher/
+<http://localhost:8000/rechercher/>`_, vous devriez pouvoir utiliser la
+recherche.
+
+Pour indexer uniquement les nouvelles données, utilisez la commande suivante :
+
+.. sourcecode:: bash
+
+ python manage.py search_engine_manager index_flagged
+
+Plus d'informations sur la commande ``search_engine_manager`` sont disponibles
+sur la page `concernant la recherche sur Zeste de Savoir
+<../back-end/search.html>`_.
diff --git a/doc/source/install/install-linux.rst b/doc/source/install/install-linux.rst
index b6642ad0be..729228638e 100644
--- a/doc/source/install/install-linux.rst
+++ b/doc/source/install/install-linux.rst
@@ -24,7 +24,7 @@ Après avoir cloné le dépôt du code source, installer ZdS sous Linux est rela
make install-linux
-Notez que si vous voulez installer une version complète (avec une version locale `de LaTeX <#composant-tex-local-et-latex-template>`_ et `de Elasticsearch <#composant-elastic-local>`_, plus d'infos ci-dessous), utilisez plutôt
+Notez que si vous voulez installer une version complète (avec une version locale `de LaTeX <#composant-tex-local-et-latex-template>`_ et `de Typesense <#composant-typesense-local>`_, plus d'infos ci-dessous), utilisez plutôt
.. sourcecode:: bash
@@ -72,7 +72,7 @@ Notez que si vous ne souhaitez pas un de ces compsants, vous pouvez utiliser la
Composants ``full``
===================
-Équivalent à ``+base +elastic-local +tex-local +latex-template`` (plus de détails ci-dessous).
+Équivalent à ``+base +typesense-local +tex-local +latex-template`` (plus de détails ci-dessous).
De même que pour `base <#composants-base>`_, vous pouvez agrémenter de ``-composant`` pour ne pas installer un composant donné.
@@ -164,14 +164,13 @@ Strictement équivalent à la commande suivantes:
Plus d'info sur cette fonctionalité `sur la page dédiée <../utils/fixture_loaders.html>`_.
-Composant ``elastic-local``
-===========================
+Composant ``typesense-local``
+=============================
-Installe une version **locale** d'Elasticsearch dans un dossier ``.local`` situé dans le dossier de ZdS.
-La commande ``elasticsearch`` est ensuite ajoutée dans le *virtualenv*, de telle sorte à ce que ce soit cette version locale qui soit utilisée.
-La version d'Elasticsearch installée est controlée par la variable d'environnement ``ZDS_ELASTIC_VERSION`` (voir ``scripts/define_variable.sh`` pour la valeur par défaut).
+Installe une version **locale** de Typesense dans un dossier ``.local`` situé dans le dossier de ZdS.
+La version de Typesense installée est contrôlée par la variable d'environnement ``ZDS_TYPESENSE_VERSION`` (voir ``scripts/define_variable.sh`` pour la valeur par défaut).
-Notez que vous pouvez choisir d'installer Elasticsearch manuellement, `comme décrit ici <./extra-install-es.html#sous-linux>`_.
+Notez que vous pouvez choisir d'installer Typesense manuellement, `comme décrit ici <./extra-install-search-engine.html#installation>`_.
Composant ``tex-local`` et ``latex-template``
=============================================
diff --git a/doc/source/install/install-macos.rst b/doc/source/install/install-macos.rst
index 07bbaf5bcd..93c762727d 100644
--- a/doc/source/install/install-macos.rst
+++ b/doc/source/install/install-macos.rst
@@ -205,10 +205,8 @@ lequel vous travaillerez avant d’envoyer des *Pull-Requests*.
.. Attention::
La version complète **ne peut être automatiquement installée pour le moment**
- car l’installeur télécharge une version de Java (pour ElasticSearch) spécifique
- à Linux. Cependant, vous pouvez remplacer la version de Java installée dans
- ``zds-site/zdsenv/lib/jdk`` par une version fonctionnant sous macOS et ElasticSearch
- fonctionnera.
+ car l’installeur télécharge une version de Typesense (le moteur de recherche) spécifique
+ à Linux.
Le système de génération et d’export des PDF devrait fonctionner normalement.
@@ -216,7 +214,7 @@ lequel vous travaillerez avant d’envoyer des *Pull-Requests*.
.. seealso::
- - `Installation d’Elasticsearch `_ ;
+   - `Installation de Typesense <extra-install-search-engine.html>`_ ;
- `installation de LaTeX `_.
Pour installer la version minimale, exécutez depuis la racine du dépôt que vous
@@ -261,9 +259,7 @@ Si vous voulez la version complète :
Si vous installez la version complète, le script va, en plus :
- - installer une version de Java pour Linux dans le dossier ``zds-site/zdsenv/lib/jdk``
- et modifier l’environnement virtuel pour que cette version de Java soit utilisée ;
- - installer ElasticSearch dans le dossier ``zds-site/.local/elasticsearch`` ;
+ - installer Typesense dans le dossier ``zds-site/.local/typesense`` ;
- installer TeXLive (permettant de compiler du LaTeX en PDF, utilisé pour les
exports PDF) dans le dossier ``zds-site/.local/texlive`` ;
- cloner le dépôt contenant le modèle LaTeX utilisé par l’export PDF dans le
diff --git a/doc/source/install/install-windows.rst b/doc/source/install/install-windows.rst
index b957d11bd2..8a47312189 100644
--- a/doc/source/install/install-windows.rst
+++ b/doc/source/install/install-windows.rst
@@ -140,9 +140,9 @@ On peut finalement lancer ZdS:
Aller plus loin
===============
-Pour faire fonctionner ZdS dans son ensemble vous devez installer les outils LateX et Elasticsearch:
+Pour faire fonctionner ZdS dans son ensemble vous devez installer les outils LaTeX et Typesense :
-- `Installez Elasticsearch `_ ;
+- `Installez Typesense <extra-install-search-engine.html>`_ ;
- `Installez LaTeX `_.
Vous pouvez également `indiquer à Git de ne pas effectuer de commit s'il y a des erreurs de formatage dans le code <../utils/git-pre-hook.html>`__.
diff --git a/requirements.txt b/requirements.txt
index 2c28ad1de3..12e87843ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,5 @@
# Implicit dependencies (optional dependencies of dependencies)
crispy-forms-bootstrap2==2024.1
-elasticsearch-dsl==5.4.0
-elasticsearch==5.5.3
social-auth-app-django==5.4.0
# Explicit dependencies (references in code)
@@ -19,6 +17,7 @@ lxml==5.1.0
Pillow==10.2.0
pymemcache==4.0.0
requests==2.31.0
+typesense==0.19.0
ua-parser==0.18.0
# Api dependencies
diff --git a/scripts/define_variable.sh b/scripts/define_variable.sh
old mode 100755
new mode 100644
index 152f813a1b..6286b37c35
--- a/scripts/define_variable.sh
+++ b/scripts/define_variable.sh
@@ -14,8 +14,12 @@ if [[ $ZDS_NVM_VERSION == "" ]]; then
ZDS_NVM_VERSION="0.39.5"
fi
-if [[ $ZDS_ELASTIC_VERSION == "" ]]; then
- ZDS_ELASTIC_VERSION="5.5.2"
+if [[ $ZDS_TYPESENSE_VERSION == "" ]]; then
+    ZDS_TYPESENSE_VERSION="27.0" # also needs to be updated in .github/workflows/ci.yml
+fi
+
+if [[ $ZDS_TYPESENSE_API_KEY == "" ]]; then
+ ZDS_TYPESENSE_API_KEY="xyz"
fi
if [[ $ZDS_LATEX_REPO == "" ]]; then
diff --git a/scripts/install_zds.sh b/scripts/install_zds.sh
index a9e45065fc..a778ff8e18 100755
--- a/scripts/install_zds.sh
+++ b/scripts/install_zds.sh
@@ -242,86 +242,36 @@ fi
export ZDS_ENV=$(realpath $ZDS_VENV)
-# local jdk
-if ! $(_in "-jdk-local" $@) && ( $(_in "+jdk-local" $@) || $(_in "+full" $@) ); then
- print_info "* [+jdk-local] installing a local version of JDK (v$ZDS_JDK_VERSION)" --bold
-
- mkdir -p $ZDS_VENV/lib/
- cd $ZDS_VENV/lib/
-
- jdk_path=$(realpath jdk)
-
- if [ -d "$jdk_path" ]; then # remove previous install
- rm -rf "$jdk_path"
- fi
-
- baseURL="https://github.com/adoptium/temurin11-binaries/releases/download/"
- foldername="jdk-${ZDS_JDK_VERSION}+${ZDS_JDK_REV}"
- folderPATH="${foldername}/OpenJDK11U-jdk_x64_linux_hotspot_${ZDS_JDK_VERSION}_${ZDS_JDK_REV}.tar.gz"
-
- echo "GET ${baseURL}${folderPATH}"
- wget -O ${foldername}.tar.gz ${baseURL}${folderPATH} -q --show-progress
- tar xf ${foldername}.tar.gz
-
- if [[ $? == 0 ]]; then
- rm ${foldername}.tar.gz
- mv ${foldername} "$jdk_path"
-
- echo $($jdk_path/bin/java -version)
-
- export PATH="$PATH:$jdk_path/bin"
- export JAVA_HOME="$jdk_path"
- export ES_JAVA_OPTS="-Xms512m -Xmx512m"
-
- if [[ $(grep -c -i "export JAVA_HOME" $ZDS_ENV/bin/activate) == "0" ]]; then # add java to venv activate's
- ACTIVATE_JAVA="export PATH=\"\$PATH:$jdk_path/bin\"\nexport JAVA_HOME=\"$jdk_path\"\nexport ES_JAVA_OPTS=\"-Xms512m -Xmx512m\""
-
- echo -e $ACTIVATE_JAVA >> $ZDS_ENV/bin/activate
- echo -e $ACTIVATE_JAVA >> $ZDS_ENV/bin/activate.csh
- echo -e $ACTIVATE_JAVA >> $ZDS_ENV/bin/activate.fish
- fi
- else
- print_error "!! Cannot get or extract jdk ${ZDS_JDK_VERSION}"
- exit 1
- fi
- cd $ZDSSITE_DIR
-fi
-
-
-# local elasticsearch
-if ! $(_in "-elastic-local" $@) && ( $(_in "+elastic-local" $@) || $(_in "+full" $@) ); then
- print_info "* [+elastic-local] installing a local version of elasticsearch (v$ZDS_ELASTIC_VERSION)" --bold
+# local Typesense
+if ! $(_in "-typesense-local" $@) && ( $(_in "+typesense-local" $@) || $(_in "+full" $@) ); then
+ print_info "* [+typesense-local] installing a local version of typesense (v$ZDS_TYPESENSE_VERSION)" --bold
mkdir -p .local
cd .local
- es_path=$(realpath elasticsearch)
+ readonly typesense_path=$(realpath typesense)
- if [ -d "$es_path" ]; then # remove previous install
- rm -r "$es_path"
+ if [ -d "$typesense_path" ]; then # remove previous install
+ rm -r "$typesense_path"
fi
- wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ZDS_ELASTIC_VERSION}.zip -q --show-progress
- if [[ $? == 0 ]]; then
- unzip -q elasticsearch-${ZDS_ELASTIC_VERSION}.zip
- rm elasticsearch-${ZDS_ELASTIC_VERSION}.zip
- mv elasticsearch-${ZDS_ELASTIC_VERSION} elasticsearch
+ mkdir $typesense_path
+ cd $typesense_path
- # add options to reduce memory consumption
- print_info "#Options added by install_zds.sh" >> "$es_path/config/jvm.options"
- print_info "-Xms512m" >> "$es_path/config/jvm.options"
- print_info "-Xmx512m" >> "$es_path/config/jvm.options"
+ readonly archive_name=typesense-server-$ZDS_TYPESENSE_VERSION-linux-amd64.tar.gz
- # symbolic link to elastic start script
- ln -s "$es_path/bin/elasticsearch" $ZDS_ENV/bin/
+ wget -q https://dl.typesense.org/releases/$ZDS_TYPESENSE_VERSION/$archive_name --show-progress
+ if [[ $? == 0 ]]; then
+ tar -xzf $archive_name
+ rm $archive_name
+ mkdir typesense-data
else
- print_error "!! Cannot get elasticsearch ${ZDS_ELASTIC_VERSION}"
+ print_error "!! Cannot get typesense ${ZDS_TYPESENSE_VERSION}"
exit 1
fi
cd $ZDSSITE_DIR
fi
-
# local texlive
if ! $(_in "-tex-local" $@) && ( $(_in "+tex-local" $@) || $(_in "+full" $@) ); then
print_info "* [+tex-local] install texlive" --bold
diff --git a/templates/searchv2/includes/chapter.part.html b/templates/search/includes/chapter.part.html
similarity index 66%
rename from templates/searchv2/includes/chapter.part.html
rename to templates/search/includes/chapter.part.html
index 6c6647df54..e25d6d6f56 100644
--- a/templates/searchv2/includes/chapter.part.html
+++ b/templates/search/includes/chapter.part.html
@@ -1,9 +1,7 @@
-{% load emarkdown %}
{% load i18n %}
{% load date %}
-{% load elasticsearch %}
-
+
{% if search_result.thumbnail %}
{% endif %}
@@ -17,9 +15,13 @@
+ {% if has_query %}
+ {% include "misc/paginator.html" with position="top" %}
+
+ {% if object_list %}
+
+ {% for result in object_list %}
+
+ {% if result.collection == 'chapter' %}
+ {% include "search/includes/chapter.part.html" with search_result=result.document %}
+ {% elif result.collection == 'publishedcontent' %}
+ {% include "search/includes/publishedcontent.part.html" with search_result=result.document %}
+ {% elif result.collection == 'topic' %}
+ {% include "search/includes/topic.part.html" with search_result=result.document %}
+ {% elif result.collection == 'post' %}
+ {% include "search/includes/post.part.html" with search_result=result.document %}
+ {% endif %}
+
+ {% endfor %}
+
+
+ {% if paginator.num_pages > 1 and not page_obj.has_next and has_more_results %}
+
+ {% trans "Vous êtes sur la dernière page des résultats. Il y a plus de résultats, mais nous vous conseillons plutôt d'affiner les termes de votre recherche." %}
+
+ {% endif %}
+ {% else %}
+
+ {% trans "Aucun résultat trouvé." %}
+
+ {% endif %}
+
+
+ {% include "misc/paginator.html" with position="bottom" %}
+ {% endif %}
+
- {% if query %}
- {% include "misc/paginator.html" with position="top" %}
-
- {% if object_list %}
-
- {% for result in object_list %}
-
- {% if result.meta.doc_type == 'chapter' %}
- {% include "searchv2/includes/chapter.part.html" with search_result=result %}
- {% elif result.meta.doc_type == 'publishedcontent' %}
- {% include "searchv2/includes/publishedcontent.part.html" with search_result=result %}
- {% elif result.meta.doc_type == 'topic' %}
- {% include "searchv2/includes/topic.part.html" with search_result=result %}
- {% elif result.meta.doc_type == 'post' %}
- {% include "searchv2/includes/post.part.html" with search_result=result %}
- {% else %}
- {% trans "type d’objet inconnu" %} : {{ result.meta.doc_type }}
- {% endif %}
-
- {% empty %}
-
- {% trans "Aucun résultat trouvé." %}
-
- {% endfor %}
-
- {% else %}
-
- {% trans "Aucun résultat trouvé." %}
-
- {% endif %}
-
-
- {% include "misc/paginator.html" with position="bottom" %}
- {% endif %}
-
-{% endblock %}
diff --git a/zds/forum/commons.py b/zds/forum/commons.py
index eea9352002..83ed110f78 100644
--- a/zds/forum/commons.py
+++ b/zds/forum/commons.py
@@ -71,8 +71,6 @@ def perform_move(self):
raise Http404("Forum not found", e)
forum = get_object_or_404(Forum, pk=forum_pk)
self.object.forum = forum
-
- # Save topic to update update_index_date
self.object.save()
signals.topic_moved.send(sender=self.object.__class__, topic=self.object)
diff --git a/zds/forum/managers.py b/zds/forum/managers.py
index 3786ccee02..9cf673207a 100644
--- a/zds/forum/managers.py
+++ b/zds/forum/managers.py
@@ -42,6 +42,22 @@ def get_private_forums_of_category(self, category, user):
.all()
)
+ def get_authorized_forums_pk(self, user):
+ """
+ Find forums the user is allowed to visit.
+
+ :param user: concerned user.
+ :return: pk of authorized forums
+ """
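+        # Public forums have no associated group; private forums are visible
+        # only to users belonging to at least one of their groups.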
+ forums_pub = self.filter(groups__isnull=True).all()
+ if user and user.is_authenticated:
+ forums_private = self.filter(groups__isnull=False, groups__in=user.groups.all()).all()
+ list_forums = list(forums_pub | forums_private)
+ else:
+ list_forums = list(forums_pub)
+
+ return [f.pk for f in list_forums]
+
class TopicManager(models.Manager):
"""
diff --git a/zds/forum/migrations/0024_rename_search_fields.py b/zds/forum/migrations/0024_rename_search_fields.py
new file mode 100644
index 0000000000..b53f7a2622
--- /dev/null
+++ b/zds/forum/migrations/0024_rename_search_fields.py
@@ -0,0 +1,43 @@
+# Generated by Django 4.2.11 on 2024-04-15 22:53
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ("forum", "0023_allow_blank_solved_by_topic_field"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="post",
+ name="es_already_indexed",
+ ),
+ migrations.RemoveField(
+ model_name="post",
+ name="es_flagged",
+ ),
+ migrations.RemoveField(
+ model_name="topic",
+ name="es_already_indexed",
+ ),
+ migrations.RemoveField(
+ model_name="topic",
+ name="es_flagged",
+ ),
+ migrations.AddField(
+ model_name="post",
+ name="search_engine_requires_index",
+ field=models.BooleanField(
+ db_index=True, default=True, verbose_name="Doit être (ré)indexé par le moteur de recherche"
+ ),
+ ),
+ migrations.AddField(
+ model_name="topic",
+ name="search_engine_requires_index",
+ field=models.BooleanField(
+ db_index=True, default=True, verbose_name="Doit être (ré)indexé par le moteur de recherche"
+ ),
+ ),
+ ]
diff --git a/zds/forum/models.py b/zds/forum/models.py
index 4592b60075..7250513c97 100644
--- a/zds/forum/models.py
+++ b/zds/forum/models.py
@@ -7,21 +7,26 @@
from django.urls import reverse
from django.db import models
from django.dispatch import receiver
-from django.db.models.signals import pre_delete
+from django.db.models.signals import pre_delete, post_save
-from elasticsearch_dsl.field import Text, Keyword, Integer, Boolean, Float, Date
-
-from zds.forum.managers import TopicManager, ForumManager, PostManager, TopicReadManager
from zds.forum import signals
-from zds.searchv2.models import AbstractESDjangoIndexable, delete_document_in_elasticsearch, ESIndexManager
+from zds.forum.managers import TopicManager, ForumManager, PostManager, TopicReadManager
+from zds.search.models import AbstractSearchIndexableModel
+from zds.search.utils import (
+ SearchFilter,
+ SearchIndexManager,
+ date_to_timestamp_int,
+ clean_html,
+)
from zds.utils import get_current_user, old_slugify
from zds.utils.models import Comment, Tag
-def sub_tag(tag):
- start = tag.group("start")
- end = tag.group("end")
- return f"{start + end}"
+def get_search_filter_authorized_forums(user: User) -> SearchFilter:
+ filter_by = SearchFilter()
+ filter_by.add_exact_filter("forum_pk", Forum.objects.get_authorized_forums_pk(user))
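+    # Assuming SearchFilter renders Typesense filter syntax, str(filter_by)
+    # looks like "forum_pk:=[1,3]" for a user allowed in forums 1 and 3.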
+
+ return filter_by
class ForumCategory(models.Model):
@@ -169,7 +174,7 @@ def has_group(self):
return self._nb_group > 0
-class Topic(AbstractESDjangoIndexable):
+class Topic(AbstractSearchIndexableModel):
"""
A Topic is a thread of posts.
A topic has several states, witch are all independent:
@@ -178,7 +183,7 @@ class Topic(AbstractESDjangoIndexable):
- Sticky: sticky topics are displayed on top of topic lists (ex: on forum page).
"""
- objects_per_batch = 1000
+ initial_search_index_batch_size = 256
class Meta:
verbose_name = "Sujet"
@@ -448,46 +453,61 @@ def is_read_by_user(self, user=None, check_auth=True):
return TopicRead.objects.is_topic_last_message_read(self, user, check_auth)
@classmethod
- def get_es_mapping(cls):
- es_mapping = super().get_es_mapping()
-
- es_mapping.field("title", Text(boost=1.5))
- es_mapping.field("tags", Text(boost=2.0))
- es_mapping.field("subtitle", Text())
- es_mapping.field("is_solved", Boolean())
- es_mapping.field("is_locked", Boolean())
- es_mapping.field("is_sticky", Boolean())
- es_mapping.field("pubdate", Date())
- es_mapping.field("forum_pk", Integer())
-
- # not indexed:
- es_mapping.field("get_absolute_url", Keyword(index=False))
- es_mapping.field("forum_title", Text(index=False))
- es_mapping.field("forum_get_absolute_url", Keyword(index=False))
-
- return es_mapping
+ def get_search_document_schema(cls):
+ search_engine_schema = super().get_search_document_schema()
+
+ search_engine_schema["fields"] = [
+ {"name": "forum_pk", "type": "int32", "facet": False}, # we can filter on it
+ {"name": "title", "type": "string"}, # we search on it
+ {"name": "subtitle", "type": "string", "optional": True}, # we search on it
+ {"name": "forum_title", "type": "string", "index": False},
+ {"name": "tags", "type": "string[]", "facet": True}, # we search on it
+ {"name": "tag_slugs", "type": "string[]", "optional": True, "index": False},
+ {"name": "pubdate", "type": "int64", "index": False},
+ {"name": "get_absolute_url", "type": "string", "index": False},
+ {"name": "forum_get_absolute_url", "type": "string", "index": False},
+ {"name": "weight", "type": "float"}, # we sort on it
+ ]
+
+ return search_engine_schema
@classmethod
- def get_es_django_indexable(cls, force_reindexing=False):
+ def get_indexable_objects(cls, force_reindexing=False):
"""Overridden to prefetch tags and forum"""
- query = super().get_es_django_indexable(force_reindexing)
+ query = super().get_indexable_objects(force_reindexing)
return query.prefetch_related("tags").select_related("forum")
- def get_es_document_source(self, excluded_fields=None):
- """Overridden to handle the case of tags (M2M field)"""
+    def get_document_source(self, excluded_fields=None):
+        """Overridden to handle tags (M2M), dates and the computed weight."""
+        excluded_fields = (excluded_fields or []) + ["tags", "pubdate"]
- excluded_fields = excluded_fields or []
- excluded_fields.extend(["tags", "forum_pk", "forum_title", "forum_get_absolute_url"])
-
- data = super().get_es_document_source(excluded_fields=excluded_fields)
- data["tags"] = [tag.title for tag in self.tags.all()]
+ data = super().get_document_source(excluded_fields=excluded_fields)
+ data["tags"] = []
+ data["tag_slugs"] = []
+ for tag in self.tags.all():
+ data["tags"].append(tag.title)
+            data["tag_slugs"].append(tag.slug)  # also store slugs so they are available in search results
data["forum_pk"] = self.forum.pk
data["forum_title"] = self.forum.title
data["forum_get_absolute_url"] = self.forum.get_absolute_url()
+ data["pubdate"] = date_to_timestamp_int(self.pubdate)
+ data["weight"] = self._compute_search_weight()
return data
+ @classmethod
+ def get_search_query(cls, user):
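+        # Typesense search parameters: the fields to search, their relative
+        # weights (from the boost settings), a filter restricting hits to
+        # forums the user may read, and the sort order. E.g. boosts of 3, 2
+        # and 2 (illustrative values) yield query_by_weights == "3,2,2".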
+ return {
+ "query_by": "title,subtitle,tags",
+ "query_by_weights": "{},{},{}".format(
+ settings.ZDS_APP["search"]["boosts"]["topic"]["title"],
+ settings.ZDS_APP["search"]["boosts"]["topic"]["subtitle"],
+ settings.ZDS_APP["search"]["boosts"]["topic"]["tags"],
+ ),
+ "filter_by": str(get_search_filter_authorized_forums(user)),
+ "sort_by": "weight:desc",
+ }
+
def save(self, *args, **kwargs):
"""Overridden to handle the displacement of the topic to another forum"""
@@ -496,25 +516,70 @@ def save(self, *args, **kwargs):
except Topic.DoesNotExist:
pass
else:
- if old_self.forum.pk != self.forum.pk or old_self.title != self.title:
- Post.objects.filter(topic__pk=self.pk).update(es_flagged=True)
+ is_moved = old_self.forum.pk != self.forum.pk
+ posts = Post.objects.filter(topic__pk=self.pk)
+ if is_moved or old_self.title != self.title:
+ posts.update(search_engine_requires_index=True)
+
+            if is_moved and self.forum.groups.exists():
+                # Moved to a restricted forum: remove the topic and its posts
+                # from the search engine; they will be reindexed properly
+                # later (this is simpler than updating everything in the
+                # search engine)
+ search_engine_manager = SearchIndexManager()
+
+ filter_by = SearchFilter()
+ filter_by.add_exact_filter("topic_pk", [self.pk])
+
+ search_engine_manager.delete_by_query(Post.get_search_document_type(), {"filter_by": str(filter_by)})
+ search_engine_manager.delete_document(self)
+
return super().save(*args, **kwargs)
+ def _compute_search_weight(self):
+ """
+ This function calculates a weight for topics in order to sort them according to different boosts.
+ There is a boost according to the state of the topic:
+ - Solved: it was a question, and this question has been answered. The "solved" state is set at author's discretion.
+ - Locked: nobody can write on a locked topic.
+ - Sticky: sticky topics are displayed on top of topic lists (ex: on forum page).
+ """
+ weight_solved = settings.ZDS_APP["search"]["boosts"]["topic"]["if_solved"]
+ weight_sticky = settings.ZDS_APP["search"]["boosts"]["topic"]["if_sticky"]
+ weight_locked = settings.ZDS_APP["search"]["boosts"]["topic"]["if_locked"]
+ weight_global = settings.ZDS_APP["search"]["boosts"]["topic"]["global"]
+        # if the topic isn't in one of these states (solved, locked, sticky), it still needs a weight: the global one
+ is_global = 0 if self.is_solved or self.is_sticky or self.is_locked else 1
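+        # e.g. with illustrative boosts if_solved=1.1, if_sticky=1.2 and
+        # global=1.0, a solved sticky topic gets max(1.1, 1.2, 0, 0) == 1.2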
+ return max(
+ weight_solved * self.is_solved,
+ weight_sticky * self.is_sticky,
+ weight_locked * self.is_locked,
+ is_global * weight_global,
+ )
-@receiver(pre_delete, sender=Topic)
-def delete_topic_in_elasticsearch(sender, instance, **kwargs):
- """catch the pre_delete signal to ensure the deletion in ES"""
- return delete_document_in_elasticsearch(instance)
+@receiver(post_save, sender=Tag)
+def topic_tags_changed(instance, created, **kwargs):
+ if not created:
+ # It is an update of an existing object
+ Topic.objects.filter(tags=instance.pk).update(search_engine_requires_index=True)
-class Post(Comment, AbstractESDjangoIndexable):
+
+@receiver(post_save, sender=Forum)
+def forum_title_changed(instance, created, **kwargs):
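+    # The forum title is denormalized into topic and post documents
+    # (forum_title), so renaming a forum flags them all for reindexing.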
+ if not created:
+ # It is an update of an existing object
+ Topic.objects.filter(forum=instance.pk).update(search_engine_requires_index=True)
+ Post.objects.filter(topic__forum=instance.pk).update(search_engine_requires_index=True)
+
+
+class Post(Comment, AbstractSearchIndexableModel):
"""
A forum post written by a user.
A post can be marked as useful: topic's author (or admin) can declare any topic as "useful", and this post is
displayed as is on front.
"""
- objects_per_batch = 2000
+ initial_search_index_batch_size = 512
topic = models.ForeignKey(Topic, verbose_name="Sujet", db_index=True, on_delete=models.CASCADE)
@@ -536,70 +601,106 @@ def get_notification_title(self):
return self.topic.title
@classmethod
- def get_es_mapping(cls):
- es_mapping = super().get_es_mapping()
-
- es_mapping.field("text_html", Text())
- es_mapping.field("is_useful", Boolean())
- es_mapping.field("is_visible", Boolean())
- es_mapping.field("position", Integer())
- es_mapping.field("like_dislike_ratio", Float())
- es_mapping.field("pubdate", Date())
- es_mapping.field("forum_pk", Integer())
- es_mapping.field("topic_pk", Integer())
-
- # not indexed:
- es_mapping.field("get_absolute_url", Keyword(index=False))
- es_mapping.field("topic_title", Text(index=False))
- es_mapping.field("forum_title", Text(index=False))
- es_mapping.field("forum_get_absolute_url", Keyword(index=False))
-
- return es_mapping
+ def get_search_document_schema(cls):
+ search_engine_schema = super().get_search_document_schema()
+
+ search_engine_schema["fields"] = [
+ {"name": "topic_pk", "type": "int64"}, # we filter on it when a topic is moved
+ {"name": "forum_pk", "type": "int64"}, # we can filter on it
+ {"name": "topic_title", "type": "string", "index": False},
+ {"name": "forum_title", "type": "string", "index": False},
+ {"name": "text", "type": "string"}, # we search on it
+ {"name": "pubdate", "type": "int64", "index": False},
+ {"name": "get_absolute_url", "type": "string", "index": False},
+ {"name": "forum_get_absolute_url", "type": "string", "index": False},
+ {"name": "weight", "type": "float", "facet": False}, # we sort on it
+ ]
+
+ return search_engine_schema
@classmethod
- def get_es_django_indexable(cls, force_reindexing=False):
+ def get_indexable_objects(cls, force_reindexing=False):
"""Overridden to prefetch stuffs"""
- q = super().get_es_django_indexable(force_reindexing).prefetch_related("topic").prefetch_related("topic__forum")
+ q = (
+ super()
+ .get_indexable_objects(force_reindexing)
+ .filter(is_visible=True)
+ .prefetch_related("topic")
+ .prefetch_related("topic__forum")
+ )
return q
- def get_es_document_source(self, excluded_fields=None):
+    def get_document_source(self, excluded_fields=None):
+        """Overridden to add the topic and forum information"""
- excluded_fields = excluded_fields or []
- excluded_fields.extend(
- ["like_dislike_ratio", "topic_title", "topic_pk", "forum_title", "forum_pk", "forum_get_absolute_url"]
- )
-
- data = super().get_es_document_source(excluded_fields=excluded_fields)
-
- data["like_dislike_ratio"] = (
- (self.like / self.dislike) if self.dislike != 0 else self.like if self.like != 0 else 1
- )
+        excluded_fields = (excluded_fields or []) + ["pubdate", "text"]
+ data = super().get_document_source(excluded_fields=excluded_fields)
data["topic_pk"] = self.topic.pk
data["topic_title"] = self.topic.title
-
data["forum_pk"] = self.topic.forum.pk
data["forum_title"] = self.topic.forum.title
data["forum_get_absolute_url"] = self.topic.forum.get_absolute_url()
+ data["pubdate"] = date_to_timestamp_int(self.pubdate)
+ data["text"] = clean_html(self.text_html)
+ data["weight"] = self._compute_search_weight()
return data
def hide_comment_by_user(self, user, text_hidden):
- """Overridden to directly hide the post in ES as well"""
+        """Overridden to delete the post from the search engine as well"""
super().hide_comment_by_user(user, text_hidden)
- index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
- index_manager.update_single_document(self, {"is_visible": False})
+ search_engine_manager = SearchIndexManager()
+ search_engine_manager.delete_document(self)
+ def _compute_search_weight(self):
+ """
+        This function calculates a weight for posts in order to sort them according to different boosts.
+        There is a boost according to the position, the usefulness and the like/dislike ratio.
+ """
+ weight_first = settings.ZDS_APP["search"]["boosts"]["post"]["if_first"]
+ weight_useful = settings.ZDS_APP["search"]["boosts"]["post"]["if_useful"]
+ weight_ld_ratio_above_1 = settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_above_1"]
+ weight_ld_ratio_below_1 = settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_below_1"]
+ weight_global = settings.ZDS_APP["search"]["boosts"]["post"]["global"]
+
+ like_dislike_ratio = (self.like / self.dislike) if self.dislike != 0 else self.like if self.like != 0 else 1
+ is_ratio_above_1 = 1 if like_dislike_ratio >= 1 else 0
+ is_ratio_below_1 = 1 - is_ratio_above_1
+
+ is_first = 1 if self.position == 1 else 0
+
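+        # e.g. 4 likes and 2 dislikes give like_dislike_ratio == 2.0, so the
+        # post is eligible for the ld_ratio_above_1 boost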
+ return max(
+ weight_first * is_first,
+ weight_useful * self.is_useful,
+ weight_ld_ratio_above_1 * is_ratio_above_1,
+ weight_ld_ratio_below_1 * is_ratio_below_1,
+ weight_global,
+ )
+
+ @classmethod
+ def get_search_query(cls, user):
+ filter_by = get_search_filter_authorized_forums(user)
+
+ return {
+ "query_by": "text",
+ "query_by_weights": "{}".format(
+ settings.ZDS_APP["search"]["boosts"]["post"]["text"],
+ ),
+ "filter_by": str(filter_by),
+ "sort_by": "weight:desc",
+ }
+
+@receiver(pre_delete, sender=Topic)
@receiver(pre_delete, sender=Post)
-def delete_post_in_elasticsearch(sender, instance, **kwargs):
- """catch the pre_delete signal to ensure the deletion in ES"""
- return delete_document_in_elasticsearch(instance)
+def delete_in_search(sender, instance, **kwargs):
+    """catch the pre_delete signal to ensure deletion from the search engine"""
+ SearchIndexManager().delete_document(instance)
class TopicRead(models.Model):
diff --git a/zds/forum/tests/tests.py b/zds/forum/tests/tests.py
index 22ea7bfcc7..f67d27e7db 100644
--- a/zds/forum/tests/tests.py
+++ b/zds/forum/tests/tests.py
@@ -7,7 +7,14 @@
from django.test import TestCase
from zds.forum.commons import PostEditMixin
-from zds.forum.tests.factories import ForumCategoryFactory, ForumFactory, TopicFactory, PostFactory, TagFactory
+from zds.forum.tests.factories import (
+ ForumCategoryFactory,
+ ForumFactory,
+ TopicFactory,
+ PostFactory,
+ TagFactory,
+ create_category_and_forum,
+)
from zds.forum.models import Forum, TopicRead, Post, Topic
from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
from zds.notification.models import TopicAnswerSubscription
@@ -1204,6 +1211,22 @@ def test_is_read(self):
self.assertTrue(topic.is_read_by_user(self.staff.user, check_auth=False))
self.assertFalse(topic.is_read_by_user(reader.user, check_auth=False))
+ def test_get_authorized_forums_pk(self):
+ user = ProfileFactory().user
+ hidden_forum = self.forum3
+
+ # Regular user can access only the public forum:
+ self.assertEqual(sorted(Forum.objects.get_authorized_forums_pk(user)), sorted([self.forum1.pk, self.forum2.pk]))
+
+ # Staff user can access all forums:
+ self.assertEqual(
+ sorted(Forum.objects.get_authorized_forums_pk(self.staff.user)),
+ sorted([self.forum1.pk, self.forum2.pk, hidden_forum.pk]),
+ )
+
+ # By default, only public forums are available:
+ self.assertEqual(sorted(Forum.objects.get_authorized_forums_pk(None)), sorted([self.forum1.pk, self.forum2.pk]))
+
class TopicReadAndUnreadTests(TestCase):
def setUp(self):
diff --git a/zds/forum/tests/tests_utils.py b/zds/forum/tests/tests_utils.py
deleted file mode 100644
index f0f03bf128..0000000000
--- a/zds/forum/tests/tests_utils.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from django.contrib.auth.models import Group
-from django.test import TestCase
-
-from zds.forum.tests.factories import create_category_and_forum
-from zds.forum.utils import get_authorized_forums_pk
-from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
-
-
-class GetAuthorizedForumsTests(TestCase):
- def test_get_authorized_forums_pk(self):
- user = ProfileFactory().user
- staff = StaffProfileFactory().user
-
- # 1. Create a hidden forum belonging to a hidden staff group:
- group = Group.objects.create(name="Les illuminatis anonymes de ZdS")
- _, hidden_forum = create_category_and_forum(group)
-
- staff.groups.add(group)
- staff.save()
-
- # 2. Create a public forum:
- _, public_forum = create_category_and_forum()
-
- # 3. Regular user can access only the public forum:
- self.assertEqual(get_authorized_forums_pk(user), [public_forum.pk])
-
- # 4. Staff user can access all forums:
- self.assertEqual(sorted(get_authorized_forums_pk(staff)), sorted([hidden_forum.pk, public_forum.pk]))
-
- # 5. By default, only public forums are available:
- self.assertEqual(get_authorized_forums_pk(None), [public_forum.pk])
diff --git a/zds/forum/utils.py b/zds/forum/utils.py
index 4a637d2b58..bc279f13cb 100644
--- a/zds/forum/utils.py
+++ b/zds/forum/utils.py
@@ -198,20 +198,3 @@ def post(self, request, *args, **kwargs):
def create_forum(self, form_class, **kwargs):
raise NotImplementedError("`create_forum()` must be implemented.")
-
-
-def get_authorized_forums_pk(user):
- """
- Find forums the user is allowed to visit.
-
- :param user: concerned user.
- :return: pk of authorized forums
- """
- forums_pub = Forum.objects.filter(groups__isnull=True).all()
- if user and user.is_authenticated:
- forums_private = Forum.objects.filter(groups__isnull=False, groups__in=user.groups.all()).all()
- list_forums = list(forums_pub | forums_private)
- else:
- list_forums = list(forums_pub)
-
- return [f.pk for f in list_forums]
diff --git a/zds/pages/views.py b/zds/pages/views.py
index b89f5d2eb2..89d33cc8b0 100644
--- a/zds/pages/views.py
+++ b/zds/pages/views.py
@@ -17,7 +17,7 @@
from zds.forum.models import Topic
from zds.member.decorator import can_write_and_read_now
from zds.pages.models import GroupContact
-from zds.searchv2.forms import SearchForm
+from zds.search.forms import SearchForm
from zds.tutorialv2.models.database import PublishableContent, PublishedContent
from zds.utils.context_processor import get_repository_url
from zds.utils.models import Alert, CommentEdit, Comment
diff --git a/zds/searchv2/management/__init__.py b/zds/search/__init__.py
similarity index 100%
rename from zds/searchv2/management/__init__.py
rename to zds/search/__init__.py
diff --git a/zds/searchv2/forms.py b/zds/search/forms.py
similarity index 68%
rename from zds/searchv2/forms.py
rename to zds/search/forms.py
index 5565bf1ec5..9e3e994a6e 100644
--- a/zds/searchv2/forms.py
+++ b/zds/search/forms.py
@@ -18,21 +18,16 @@ class SearchForm(forms.Form):
widget=forms.TextInput(attrs={"type": "search", "required": "required", "id": "search-home"}),
)
- choices = sorted(
+ model_choices = sorted(
((k, v[0]) for k, v in settings.ZDS_APP["search"]["search_groups"].items()), key=lambda pair: pair[1]
)
-
models = forms.MultipleChoiceField(
label="",
widget=forms.CheckboxSelectMultiple(attrs={"class": "search-filters", "form": "search-form"}),
required=False,
- choices=choices,
+ choices=model_choices,
)
- category = forms.CharField(widget=forms.HiddenInput, required=False)
- subcategory = forms.CharField(widget=forms.HiddenInput, required=False)
- from_library = forms.CharField(widget=forms.HiddenInput, required=False)
-
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -53,7 +48,24 @@ def __init__(self, *args, **kwargs):
self.helper.layout = Layout(
Field("q"),
StrictButton(_("Rechercher"), type="submit", css_class="ico-after ico-search", title=_("Rechercher")),
- Field("category"),
- Field("subcategory"),
- Field("from_library"),
)
+
+ def clean(self):
+        """Override clean() to add a field listing the collections we actually have to search in."""
+ cleaned_data = super().clean()
+
+ if len(cleaned_data["models"]) == 0:
+ # Search in all collections
+ cleaned_data["search_collections"] = [
+ c for _, v in settings.ZDS_APP["search"]["search_groups"].items() for c in v[1]
+ ]
+ else:
+ # Search in collections of selected models
+ cleaned_data["search_collections"] = [
+ c
+ for k, v in settings.ZDS_APP["search"]["search_groups"].items()
+ for c in v[1]
+ if k in cleaned_data["models"]
+ ]
+
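+        # e.g. a hypothetical group {"content": ("Contents", ["publishedcontent", "chapter"])}
+        # selected via "content" puts both collection names in search_collections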
+ return cleaned_data
diff --git a/zds/searchv2/management/commands/__init__.py b/zds/search/management/__init__.py
similarity index 100%
rename from zds/searchv2/management/commands/__init__.py
rename to zds/search/management/__init__.py
diff --git a/zds/searchv2/tests/__init__.py b/zds/search/management/commands/__init__.py
similarity index 100%
rename from zds/searchv2/tests/__init__.py
rename to zds/search/management/commands/__init__.py
diff --git a/zds/search/management/commands/search_engine_manager.py b/zds/search/management/commands/search_engine_manager.py
new file mode 100644
index 0000000000..bc412ff857
--- /dev/null
+++ b/zds/search/management/commands/search_engine_manager.py
@@ -0,0 +1,67 @@
+import time
+
+from django.core.management.base import BaseCommand, CommandError
+from django.conf import settings
+
+from zds.search.utils import SearchIndexManager, get_all_indexable_classes
+
+
+class Command(BaseCommand):
+ help = "Index data in Typesense and manage them"
+
+ search_engine_manager = None
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "action",
+ type=str,
+            help="action to perform (clear: remove everything, setup: clear + create schemas, index_all: setup + index everything, index_flagged: index only what is required)",
+ choices=["setup", "clear", "index_all", "index_flagged"],
+ )
+ parser.add_argument("-q", "--quiet", action="store_true", default=False)
+
+ def handle(self, *args, **options):
+ # Removing and indexing collections can take time, so disable timeout for management.
+ self.search_engine_manager = SearchIndexManager(disable_timeout=True)
+
+ if not self.search_engine_manager.connected:
+            raise CommandError("Unable to connect to the search engine, aborting.")
+
+ if options["action"] == "setup":
+ self.search_engine_manager.reset_index()
+ elif options["action"] == "clear":
+ self.search_engine_manager.clear_index()
+ elif options["action"] == "index_all":
+ self.index_documents(force_reindexing=True, quiet=options["quiet"])
+ elif options["action"] == "index_flagged":
+ self.index_documents(force_reindexing=False, quiet=options["quiet"])
+ else:
+ raise CommandError("unknown action {}".format(options["action"]))
+
+ def index_documents(self, force_reindexing=False, quiet=False):
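+        # force_reindexing=True rebuilds everything; with False (the
+        # "index_flagged" action) only objects whose
+        # search_engine_requires_index flag is set are processed, which keeps
+        # periodic runs cheap.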
+ verbose = not quiet
+
+ if force_reindexing:
+            self.search_engine_manager.reset_index()  # remove all previous data and create schemas
+
+ global_start_time = time.time()
+
+ for model in get_all_indexable_classes(only_models=True):
+ # Models take care of indexing classes that are not models
+
+ model_start_time = time.time()
+
+ if verbose:
+ self.stdout.write(f"- indexing {model.get_search_document_type()}s")
+
+ indexed_counter = self.search_engine_manager.indexing_of_model(
+ model, force_reindexing=force_reindexing, verbose=verbose
+ )
+
+ if verbose:
+ duration = int(time.time() - model_start_time)
+ self.stdout.write(f" {indexed_counter}\titems indexed in {duration//60}min{duration%60}s")
+
+ if verbose:
+ duration = int(time.time() - global_start_time)
+ self.stdout.write(f"All done in {duration//60}min{duration%60}s")
diff --git a/zds/search/models.py b/zds/search/models.py
new file mode 100644
index 0000000000..3ebef8d980
--- /dev/null
+++ b/zds/search/models.py
@@ -0,0 +1,170 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+
+class AbstractSearchIndexable:
+ """Mixin for indexable objects.
+
+ Define a number of different functions that can be overridden to tune the
+    behavior of indexing into the search engine.
+
+    You (may) need to override:
+
+    - ``get_indexable()``;
+    - ``get_search_document_schema()`` (not mandatory, but otherwise the
+      search engine will choose the schema by itself);
+    - ``get_document_source()`` (not mandatory, but may be useful if the data
+      differ from the schema or extra processing is needed).
+
+    You also need to maintain ``search_engine_id``, which is a string. For
+    objects that are also stored in the database, we use the database
+    primary key. We have to define it here (and not in the child class
+    ``AbstractSearchIndexableModel``) because there are objects indexed in the
+ search engine, but not stored in the database.
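+
+    A minimal sketch of a subclass (model and field names are hypothetical)::
+
+        class Recipe(AbstractSearchIndexableModel):
+            @classmethod
+            def get_search_document_schema(cls):
+                schema = super().get_search_document_schema()
+                schema["fields"] = [{"name": "title", "type": "string"}]
+                return schema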
+ """
+
+ search_engine_id = ""
+
+    initial_search_index_batch_size = 100
+
+ @classmethod
+ def get_search_document_type(cls):
+ """Name of the collection in the search engine for the class."""
+ return cls.__name__.lower()
+
+ @classmethod
+    def get_search_document_schema(cls):
+        """Set up the schema for the model (data schema).
+
+ See https://typesense.org/docs/0.23.1/api/collections.html#with-pre-defined-schema
+
+ .. attention::
+ You *may* want to override this method (otherwise the search engine
+            chooses the schema by itself).
+
+ :return: schema object. A dictionary containing the name and fields of the collection.
+ :rtype: dict
+ """
+ search_engine_schema = dict()
+        search_engine_schema["name"] = cls.get_search_document_type()
+ search_engine_schema["fields"] = [{"name": ".*", "type": "auto"}]
+ search_engine_schema["symbols_to_index"] = [
+ "+", # c++
+ "#", # c#
+ ]
+ return search_engine_schema
+
+ @classmethod
+ def get_indexable(cls, force_reindexing=False):
+ """Return objects to index.
+
+ .. attention::
+ You need to override this method (otherwise nothing will be
+ indexed).
+
+ :param force_reindexing: force to return all objects, even if they may already be indexed.
+ :type force_reindexing: bool
+ :rtype: list
+ """
+
+ return []
+
+    def get_document_source(self, excluded_fields=None):
+ """Create a document from the instance of the class, based on the schema.
+
+ .. attention::
+ You may need to override this method if the data differ from the
+ schema for some reason.
+
+ :param excluded_fields: exclude some field from the default method
+ :type excluded_fields: list
+ :return: document
+ :rtype: dict
+ """
+
+        excluded_fields = excluded_fields or []
+        cls = self.__class__
+ schema = cls.get_search_document_schema()["fields"]
+        fields = [field["name"] for field in schema]
+
+ data = {}
+
+ for field in fields:
+ if field in excluded_fields:
+ data[field] = None
+ continue
+
+ v = getattr(self, field, None)
+ if callable(v):
+ v = v()
+
+ data[field] = v
+
+ data["id"] = self.search_engine_id
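+        # e.g. with a schema declaring "title" and "pubdate", passing
+        # excluded_fields=["pubdate"] yields {"title": ..., "pubdate": None,
+        # "id": ...}; subclasses overriding this method fill "pubdate" themselves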
+
+ return data
+
+
+class AbstractSearchIndexableModel(AbstractSearchIndexable, models.Model):
+    """Version of AbstractSearchIndexable for a Django object, with some improvements:
+
+    - Includes ``pk`` in the schema;
+    - Makes the search engine ID equal to the database primary key;
+    - Defines a ``search_engine_requires_index`` database field to allow indexing only new and modified data;
+    - Overrides ``save()`` to mark the object as requiring indexing;
+    - Defines a ``get_indexable_objects()`` method that can be overridden to
+      change the queryset used to fetch the objects to index.
+ """
+
+ class Meta:
+ abstract = True
+
+ search_engine_requires_index = models.BooleanField(
+ _("Doit être (ré)indexé par le moteur de recherche"), default=True, db_index=True
+ )
+
+ def __init__(self, *args, **kwargs):
+ """Override to make the search engine document ID equal to the database primary key."""
+ super().__init__(*args, **kwargs)
+ self.search_engine_id = str(self.pk) if self.pk else None
+
+ @classmethod
+ def get_indexable_objects(cls, force_reindexing=False):
+ """Returns objects that will be indexed in the search engine.
+
+        This method can be overridden to filter Django objects from the
+        database and prevent indexing the filtered-out objects.
+
+ :param force_reindexing: force to return all indexable objects, even those already indexed.
+ :type force_reindexing: bool
+ :return: query
+ :rtype: django.db.models.query.QuerySet
+ """
+
+ query = cls.objects
+
+ if not force_reindexing:
+ query = query.filter(search_engine_requires_index=True)
+
+ return query
+
+ @classmethod
+ def get_indexable(cls, force_reindexing=False):
+ """Override ``get_indexable()`` in order to use the Django querysets and batch objects.
+
+ :return: a queryset
+ :rtype: django.db.models.query.QuerySet
+ """
+
+ return cls.get_indexable_objects(force_reindexing).order_by("pk").all()
+
+ def save(self, *args, **kwargs):
+ """Override the ``save()`` method to flag the object as requiring to be reindexed
+ (since a save assumes a modification of the object).
+
+ .. note::
+ Flagging can be prevented using ``save(search_engine_requires_index=False)``.
+ """
+
+ self.search_engine_requires_index = kwargs.pop("search_engine_requires_index", True)
+
+ return super().save(*args, **kwargs)
diff --git a/zds/search/tests/__init__.py b/zds/search/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/zds/search/tests/tests_models.py b/zds/search/tests/tests_models.py
new file mode 100644
index 0000000000..e45295ad46
--- /dev/null
+++ b/zds/search/tests/tests_models.py
@@ -0,0 +1,578 @@
+from copy import deepcopy
+
+from django.conf import settings
+from django.contrib.auth.models import Group
+from django.test import TestCase
+from django.test.utils import override_settings
+
+from zds.forum.tests.factories import TopicFactory, PostFactory, Topic, Post, TagFactory
+from zds.forum.tests.factories import create_category_and_forum, create_topic_in_forum
+from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
+from zds.search.utils import SearchIndexManager
+from zds.tutorialv2.tests.factories import (
+ PublishableContentFactory,
+ PublishedContentFactory,
+ ContainerFactory,
+ ExtractFactory,
+ publish_content,
+)
+from zds.tutorialv2.models.database import PublishedContent, FakeChapter, PublishableContent
+from zds.tutorialv2.tests import TutorialTestMixin, override_for_contents
+from zds.utils.tests.factories import CategoryFactory, SubCategoryFactory
+
+
+overridden_zds_app = deepcopy(settings.ZDS_APP)
+overridden_zds_app["content"]["extra_content_generation_policy"] = "NONE"
+overridden_zds_app["content"]["repo_private_path"] = settings.BASE_DIR / "contents-private-test"
+overridden_zds_app["content"]["repo_public_path"] = settings.BASE_DIR / "contents-public-test"
+
+
+@override_settings(ZDS_APP=overridden_zds_app)
+@override_for_contents(SEARCH_ENABLED=True)
+class SearchIndexManagerTests(TutorialTestMixin, TestCase):
+ def setUp(self):
+ settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend"
+ self.mas = ProfileFactory().user
+ settings.ZDS_APP["member"]["bot_account"] = self.mas.username
+
+ self.category, self.forum = create_category_and_forum()
+
+ self.user = ProfileFactory().user
+ self.staff = StaffProfileFactory().user
+
+ self.manager = SearchIndexManager()
+ self.indexable = [FakeChapter, PublishedContent, Topic, Post]
+
+ self.manager.reset_index()
+
+ def test_setup_functions(self):
+ """Test the behavior of the reset_index() and clear_index() functions"""
+
+ if not self.manager.connected:
+ return
+
+ # 1. Creation:
+ models = [Topic, Post]
+ self.manager.reset_index()
+
+ # test collection
+ for model in models:
+ self.assertTrue(model.get_search_document_type() in self.manager.collections)
+
+ # 2. Clearing
+ self.manager.clear_index()
+        self.assertEqual(len(self.manager.collections), 0)  # back to the void
+
+ def test_indexation(self):
+ """test the indexation and deletion of the different documents"""
+
+ if not self.manager.connected:
+ return
+
+ # create a topic with a post
+ topic = TopicFactory(forum=self.forum, author=self.user)
+ post = PostFactory(topic=topic, author=self.user, position=1)
+
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertTrue(topic.search_engine_requires_index)
+ self.assertTrue(post.search_engine_requires_index)
+
+ # create a middle-tutorial and publish it
+ tuto = PublishableContentFactory(type="TUTORIAL")
+ tuto.authors.add(self.user)
+ tuto.save()
+
+ tuto_draft = tuto.load_version()
+ chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
+ ExtractFactory(container=chapter1, db_object=tuto)
+ published = publish_content(tuto, tuto_draft, is_major_update=True)
+
+ tuto.sha_public = tuto_draft.current_version
+ tuto.sha_draft = tuto_draft.current_version
+ tuto.public_version = published
+ tuto.save()
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertTrue(published.search_engine_requires_index)
+
+ # 1. index all
+ for model in self.indexable:
+ if model is FakeChapter:
+ continue
+ self.manager.indexing_of_model(model, force_reindexing=False)
+
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertFalse(topic.search_engine_requires_index)
+ self.assertFalse(post.search_engine_requires_index)
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertFalse(published.search_engine_requires_index)
+
+ results = self.manager.search("*") # get all documents
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 4) # get 4 results, one of each type
+
+ must_contain = {"post": False, "topic": False, "publishedcontent": False, "chapter": False}
+ id_must_be = {
+ "post": str(post.pk),
+ "topic": str(topic.pk),
+ "publishedcontent": str(published.pk),
+ "chapter": tuto.slug + "__" + chapter1.slug,
+ }
+
+ for result in results:
+ doc_type = result["request_params"]["collection_name"]
+ must_contain[doc_type] = True
+ for hit in result["hits"]:
+ doc_id = hit["document"]["id"]
+ self.assertEqual(doc_id, id_must_be[doc_type])
+
+        self.assertTrue(all(must_contain.values()))
+
+ # 2. Test what reindexation will do:
+ new_topic = TopicFactory(forum=self.forum, author=self.user)
+ new_post = PostFactory(topic=new_topic, author=self.user, position=1)
+
+ pk_of_topics_to_reindex = []
+ for item in Topic.get_indexable(force_reindexing=False):
+ pk_of_topics_to_reindex.append(item.pk)
+
+ pk_of_posts_to_reindex = []
+ for item in Post.get_indexable(force_reindexing=False):
+ pk_of_posts_to_reindex.append(item.pk)
+
+ self.assertTrue(topic.pk not in pk_of_topics_to_reindex)
+ self.assertTrue(new_topic.pk in pk_of_topics_to_reindex)
+ self.assertTrue(post.pk not in pk_of_posts_to_reindex)
+ self.assertTrue(new_post.pk in pk_of_posts_to_reindex)
+
+ for model in self.indexable: # ok, so let's index that
+ if model is FakeChapter:
+ continue
+ self.manager.indexing_of_model(model, force_reindexing=False)
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 6) # good!
+
+ # 3. Test single deletion:
+ new_post = Post.objects.get(pk=new_post.pk)
+
+ self.manager.delete_document(new_post)
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 5) # one is missing
+
+ for result in results:
+ doc_type = result["request_params"]["collection_name"]
+ for hit in result["hits"]:
+ doc_id = hit["document"]["id"]
+ self.assertTrue(doc_type != Post.get_search_document_type() or doc_id != new_post.search_engine_id)
+
+        # 4. Test deletion with delete_by_query():
+ topic = Topic.objects.get(pk=topic.pk)
+ new_topic = Topic.objects.get(pk=new_topic.pk)
+
+ self.manager.delete_by_query(
+ Topic.get_search_document_type(),
+ {"filter_by": f"id:= [{topic.search_engine_id}, {new_topic.search_engine_id}]"},
+        )  # the two topics are deleted
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 3)
+
+ for result in results:
+ doc_type = result["request_params"]["collection_name"]
+ for hit in result["hits"]:
+ doc_id = hit["document"]["id"]
+ self.assertTrue(doc_type != Topic.get_search_document_type() or doc_id != new_topic.search_engine_id)
+ self.assertTrue(doc_type != Topic.get_search_document_type() or doc_id != topic.search_engine_id)
+
+ # 5. Test that the deletion of an object also triggers its deletion in Typesense
+ post = Post.objects.get(pk=post.pk)
+ post.delete()
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2)
+
+ for result in results:
+ doc_type = result["request_params"]["collection_name"]
+ for hit in result["hits"]:
+ doc_id = hit["document"]["id"]
+ self.assertTrue(doc_type != Post.get_search_document_type() or doc_id != post.search_engine_id)
+
+        # 6. Test full index reset:
+ self.manager.reset_index()
+
+ # note "topic" is gone since "post" is gone, due to relationships at the Django level
+ new_topic = Topic.objects.get(pk=new_topic.pk)
+ new_post = Post.objects.get(pk=new_post.pk)
+
+ self.assertTrue(new_topic.search_engine_requires_index)
+ self.assertTrue(new_post.search_engine_requires_index)
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertTrue(published.search_engine_requires_index)
+
+ def test_special_case_of_contents(self):
+ """test that the old publishedcontent does not stay when a new one is created"""
+
+ if not self.manager.connected:
+ return
+
+ # 1. Create a middle-tutorial, publish it, then index it
+ tuto = PublishableContentFactory(type="TUTORIAL")
+ tuto.authors.add(self.user)
+ tuto.save()
+
+ tuto_draft = tuto.load_version()
+ chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
+ ExtractFactory(container=chapter1, db_object=tuto)
+ published = publish_content(tuto, tuto_draft, is_major_update=True)
+
+ tuto.sha_public = tuto_draft.current_version
+ tuto.sha_draft = tuto_draft.current_version
+ tuto.public_version = published
+ tuto.save()
+
+ self.manager.indexing_of_model(PublishedContent, force_reindexing=True, verbose=False) # index
+
+ first_publication = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertFalse(first_publication.search_engine_requires_index)
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2) # get 2 results, one for the content and one for the chapter
+
+ self.assertEqual(PublishedContent.objects.count(), 1)
+
+        # 2. Change the title, which will trigger a change in the slug
+ tuto = PublishableContent.objects.get(pk=tuto.pk)
+ versioned = tuto.load_version(sha=tuto.sha_draft)
+
+ tuto.title = "un titre complètement différent!"
+ tuto.save(force_slug_update=True)
+
+ versioned.repo_update_top_container(tuto.title, tuto.slug, "osef", "osef")
+ second_publication = publish_content(tuto, versioned, True)
+
+ tuto.sha_public = versioned.current_version
+ tuto.sha_draft = versioned.current_version
+ tuto.public_version = second_publication
+ tuto.save()
+
+        self.assertEqual(PublishedContent.objects.count(), 2)  # now there are two objects...
+        first_publication = PublishedContent.objects.get(pk=first_publication.pk)
+        self.assertTrue(first_publication.must_redirect)  # ...including the first one, kept for redirection
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 0) # the old one is gone (and we need to reindex to get the new one)
+
+ # 3. Check if indexation brings the new one, and not the old one
+ self.manager.indexing_of_model(PublishedContent, force_reindexing=True, verbose=False) # index
+
+ first_publication = PublishedContent.objects.get(pk=first_publication.pk)
+ second_publication = PublishedContent.objects.get(pk=second_publication.pk)
+
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+        self.assertEqual(number_of_results, 2)  # still 2, not 4!
+
+ found_old = False
+ found_new = False
+
+ for result in results:
+ doc_type = result["request_params"]["collection_name"]
+ for hit in result["hits"]:
+ doc_id = hit["document"]["id"]
+ if doc_type == PublishedContent.get_search_document_type():
+ if doc_id == first_publication.search_engine_id:
+ found_old = True
+ if doc_id == second_publication.search_engine_id:
+ found_new = True
+
+ self.assertTrue(found_new)
+ self.assertFalse(found_old)
+
+ def test_update_topic(self):
+ """test that changing an attribute of a topic marks it as to index"""
+
+ if not self.manager.connected:
+ return
+
+ group = Group.objects.create(name="DummyGroup_1")
+ self.user.groups.add(group)
+ self.user.save()
+
+ _, other_forum = create_category_and_forum()
+ _, private_forum = create_category_and_forum(group)
+
+ other_topic = TopicFactory(forum=other_forum, author=self.user)
+ other_topic.save(search_engine_requires_index=False)
+
+ topic = TopicFactory(forum=self.forum, author=self.user)
+ topic.save(search_engine_requires_index=False)
+
+ private_topic = TopicFactory(forum=self.forum, author=self.user)
+ private_topic.save(search_engine_requires_index=True)
+
+ # index all topics
+ self.manager.indexing_of_model(Topic, force_reindexing=False)
+ results = self.manager.search("*") # get all documents
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 1)
+
+ # Move the private topic to a private forum
+ private_topic = Topic.objects.get(pk=private_topic.pk) # to get the search_engine_id
+ private_topic.forum = private_forum
+ private_topic.save()
+ private_topic.refresh_from_db()
+ self.assertTrue(private_topic.search_engine_requires_index)
+ # the topic was removed from the search engine:
+ results = self.manager.search("*") # get all documents
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 0)
+
+ # Rename the forum (changes forum_title)
+ topic.save(search_engine_requires_index=False)
+ self.forum.title = "Other title"
+ self.forum.save()
+ topic.refresh_from_db()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Move the topic to another forum (changes forum_pk and forum_title)
+ topic.save(search_engine_requires_index=False)
+ topic.forum = other_forum
+ topic.save()
+ topic.refresh_from_db()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Update title:
+ topic.save(search_engine_requires_index=False)
+ topic.title = "Changed title"
+ topic.save()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Update subtitle:
+ topic.save(search_engine_requires_index=False)
+ topic.subtitle = "Changed subtitle"
+ topic.save()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Add a tag:
+ topic.save(search_engine_requires_index=False)
+ tag = TagFactory()
+ tag.save()
+ topic.tags.add(tag)
+ topic.save()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Rename the tag:
+ topic.save(search_engine_requires_index=False)
+ tag.title = "New tag"
+ tag.save()
+ topic.refresh_from_db()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Change the locked status
+ topic.save(search_engine_requires_index=False)
+ topic.is_locked = not topic.is_locked
+ topic.save()
+ topic.refresh_from_db()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Change the solved status
+ topic.save(search_engine_requires_index=False)
+ topic.solved_by = self.user
+ topic.save()
+ topic.refresh_from_db()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # Change the sticky status
+ topic.save(search_engine_requires_index=False)
+ topic.is_sticky = not topic.is_sticky
+ topic.save()
+ topic.refresh_from_db()
+ self.assertTrue(topic.search_engine_requires_index)
+
+ # It did not impact other topics:
+ other_topic.refresh_from_db()
+ self.assertFalse(other_topic.search_engine_requires_index)
+
+ def test_update_post(self):
+ """test that changing an attribute of a post marks it as to index"""
+
+ if not self.manager.connected:
+ return
+
+ group = Group.objects.create(name="DummyGroup_1")
+ self.user.groups.add(group)
+ self.user.save()
+
+ _, other_forum = create_category_and_forum()
+ _, private_forum = create_category_and_forum(group)
+
+ private_topic = TopicFactory(forum=self.forum, author=self.user)
+ private_topic.save(search_engine_requires_index=True)
+ private_post = PostFactory(topic=private_topic, author=self.user, position=2)
+ private_post.save(search_engine_requires_index=True)
+
+ topic = create_topic_in_forum(self.forum, self.user.profile)
+ topic.save(search_engine_requires_index=False)
+ post = PostFactory(topic=topic, author=self.user, position=2)
+ post.save(search_engine_requires_index=False)
+
+ other_topic = create_topic_in_forum(self.forum, self.user.profile)
+ other_topic.save(search_engine_requires_index=False)
+ other_post = PostFactory(topic=other_topic, author=self.user, position=2)
+ other_post.save(search_engine_requires_index=False)
+
+ # index all posts
+ self.manager.indexing_of_model(Post, force_reindexing=False)
+ results = self.manager.search("*") # get all documents
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 3)
+
+ # Move the private topic to a private forum
+ private_topic.forum = private_forum
+ private_topic.save()
+ private_topic.refresh_from_db()
+ self.assertTrue(private_topic.search_engine_requires_index)
+ # the post was removed from the search engine:
+ results = self.manager.search("*") # get all documents
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2)
+
+ # Move the topic to another forum (changes forum_pk)
+ post.save(search_engine_requires_index=False)
+ post.topic.forum = other_forum
+ post.topic.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # Change the topic title (changes topic_title)
+ post.save(search_engine_requires_index=False)
+ post.topic.title = "Changed title"
+ post.topic.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # Change the forum title (changes forum_title)
+ post.save(search_engine_requires_index=False)
+ post.topic.forum.title = "Changed title"
+ post.topic.forum.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # Change the content
+ post.save(search_engine_requires_index=False)
+ post.text = "New text"
+ post.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # Mark it as useful
+ post.save(search_engine_requires_index=False)
+ post.is_useful = not post.is_useful
+ post.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # Like it
+ post.save(search_engine_requires_index=False)
+ post.like += 1
+ post.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # Dislike it
+ post.save(search_engine_requires_index=False)
+ post.dislike += 1
+ post.save()
+ post.refresh_from_db()
+ self.assertTrue(post.search_engine_requires_index)
+
+ # It did not impact other posts:
+ other_post.refresh_from_db()
+ self.assertFalse(other_post.search_engine_requires_index)
+
+ def test_update_published_content(self):
+ """
+ Test that changing an attribute of a published content marks it as to
+ index.
+
+ No need to test the update of FakeChapter, since the reindex of a
+ published content starts by removing all its fake chapters from the
+ search engine.
+ """
+
+ if not self.manager.connected:
+ return
+
+ published_content = PublishedContentFactory().public_version
+ published_content.save(search_engine_requires_index=False)
+
+ other_published_content = PublishedContentFactory().public_version
+ other_published_content.save(search_engine_requires_index=False)
+
+ tag = TagFactory()
+ tag.save()
+
+ category = CategoryFactory()
+ category.save()
+
+ subcategory = SubCategoryFactory(category=category)
+ subcategory.save()
+
+ # Add a tag
+ published_content.save(search_engine_requires_index=False)
+ published_content.content.tags.add(tag)
+ published_content.content.save()
+ published_content.refresh_from_db()
+ self.assertTrue(published_content.search_engine_requires_index)
+
+ # Rename the tag
+ published_content.save(search_engine_requires_index=False)
+ tag.title = "New tag"
+ tag.save()
+ published_content.refresh_from_db()
+ self.assertTrue(published_content.search_engine_requires_index)
+
+ # Add a subcategory
+ published_content.save(search_engine_requires_index=False)
+ published_content.content.subcategory.add(subcategory)
+ published_content.content.save()
+ published_content.refresh_from_db()
+ self.assertTrue(published_content.search_engine_requires_index)
+
+ # Rename the subcategory
+ published_content.save(search_engine_requires_index=False)
+ subcategory.title = "New subcategory"
+ subcategory.save()
+ published_content.refresh_from_db()
+ self.assertTrue(published_content.search_engine_requires_index)
+
+ # Rename the category
+ published_content.save(search_engine_requires_index=False)
+ category.title = "New category"
+ category.save()
+ published_content.refresh_from_db()
+ self.assertTrue(published_content.search_engine_requires_index)
+
+ # It did not impact other contents:
+ other_published_content.refresh_from_db()
+ self.assertFalse(other_published_content.search_engine_requires_index)
+
+ def tearDown(self):
+ super().tearDown()
+
+ # delete index:
+ self.manager.clear_index()
diff --git a/zds/search/tests/tests_utils.py b/zds/search/tests/tests_utils.py
new file mode 100644
index 0000000000..d7844a41fb
--- /dev/null
+++ b/zds/search/tests/tests_utils.py
@@ -0,0 +1,209 @@
+from copy import deepcopy
+import os
+
+from django.conf import settings
+from django.test import TestCase
+from django.test.utils import override_settings
+from django.core.management import call_command
+
+from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
+from zds.tutorialv2.tests.factories import PublishableContentFactory, ContainerFactory, ExtractFactory
+from zds.tutorialv2.models.database import PublishedContent
+from zds.tutorialv2.publication_utils import publish_content
+from zds.forum.tests.factories import TopicFactory, PostFactory, Topic, Post
+from zds.forum.tests.factories import create_category_and_forum
+from zds.search.utils import SearchFilter, SearchIndexManager
+from zds.tutorialv2.tests import TutorialTestMixin, override_for_contents
+
+
+overridden_zds_app = deepcopy(settings.ZDS_APP)
+overridden_zds_app["content"]["extra_content_generation_policy"] = "NONE"
+overridden_zds_app["content"]["repo_private_path"] = settings.BASE_DIR / "contents-private-test"
+overridden_zds_app["content"]["repo_public_path"] = settings.BASE_DIR / "contents-public-test"
+
+
+@override_settings(ZDS_APP=overridden_zds_app)
+@override_for_contents(SEARCH_ENABLED=True)
+class UtilsTests(TutorialTestMixin, TestCase):
+ def setUp(self):
+ settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend"
+ self.mas = ProfileFactory().user
+ settings.ZDS_APP["member"]["bot_account"] = self.mas.username
+
+ self.category, self.forum = create_category_and_forum()
+
+ self.user = ProfileFactory().user
+ self.staff = StaffProfileFactory().user
+
+ self.search_engine_manager = SearchIndexManager()
+
+ def test_manager(self):
+ """Test the behavior of the ``search_engine_manager`` command"""
+
+ if not self.search_engine_manager.connected:
+ return
+
+ def call_search_engine_manager_command(cmd: str):
+ with open(os.devnull, "w") as f:
+ call_command("search_engine_manager", "--quiet", cmd, stdout=f, stderr=f)
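+ # For reference, this is equivalent to running, e.g.:
+ # python manage.py search_engine_manager --quiet index_all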
+
+ # in the beginning: the void
+ self.assertEqual(len(self.search_engine_manager.collections), 0)
+
+ text = "Ceci est un texte de test"
+
+ # create a topic with a post
+ topic = TopicFactory(forum=self.forum, author=self.user, title=text)
+ post = PostFactory(topic=topic, author=self.user, position=1)
+ post.text = post.text_html = text
+ post.save()
+
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertTrue(topic.search_engine_requires_index)
+ self.assertTrue(post.search_engine_requires_index)
+
+ # create a middle-tutorial and publish it
+ tuto = PublishableContentFactory(type="TUTORIAL")
+ tuto.authors.add(self.user)
+ tuto.save()
+
+ tuto_draft = tuto.load_version()
+ chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
+ chapter1.repo_update(text, text, text)
+ extract1 = ExtractFactory(container=chapter1, db_object=tuto)
+ version = extract1.repo_update(text, text)
+ published = publish_content(tuto, tuto_draft, is_major_update=True)
+
+ tuto.sha_public = version
+ tuto.sha_draft = version
+ tuto.public_version = published
+ tuto.save()
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertTrue(published.search_engine_requires_index)
+
+ # 1. test "index-all"
+ call_search_engine_manager_command("index_all")
+
+ self.assertNotEqual(len(self.search_engine_manager.collections), 0)
+
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertFalse(topic.search_engine_requires_index)
+ self.assertFalse(post.search_engine_requires_index)
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertFalse(published.search_engine_requires_index)
+
+ results = self.search_engine_manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 4) # get 4 results, one of each type
+
+ must_contain = {"post": False, "topic": False, "publishedcontent": False, "chapter": False}
+ id_must_be = {
+ "post": str(post.pk),
+ "topic": str(topic.pk),
+ "publishedcontent": str(published.pk),
+ "chapter": tuto.slug + "__" + chapter1.slug,
+ }
+
+ for result in results:
+ doc_type = result["request_params"]["collection_name"]
+ must_contain[doc_type] = True
+ for hit in result["hits"]:
+ doc_id = hit["document"]["id"]
+ self.assertEqual(doc_id, id_must_be[doc_type])
+
+ self.assertTrue(all(must_contain.values()))
+
+ # 2. test "clear"
+ self.assertNotEqual(len(self.search_engine_manager.collections), 0)
+
+ call_search_engine_manager_command("clear")
+ self.assertEqual(len(self.search_engine_manager.collections), 0) # back to void
+
+ # must reset every object
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertTrue(topic.search_engine_requires_index)
+ self.assertTrue(post.search_engine_requires_index)
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertTrue(published.search_engine_requires_index)
+
+ # 3. test "setup"
+ call_search_engine_manager_command("setup")
+
+ self.assertNotEqual(len(self.search_engine_manager.collections), 0) # collections back in
+
+ results = self.search_engine_manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 0) # ... but with nothing in it
+
+ # 4. test "index_flagged"
+ call_search_engine_manager_command("index_flagged")
+
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertFalse(topic.search_engine_requires_index)
+ self.assertFalse(post.search_engine_requires_index)
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertFalse(published.search_engine_requires_index)
+
+ results = self.search_engine_manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 4) # get the 4 results back
+
+ # 5. test "index_flagged" with something to update
+ topic.search_engine_requires_index = True
+ topic.save()
+ post.search_engine_requires_index = True
+ post.save()
+ published.search_engine_requires_index = True
+ published.save()
+
+ call_search_engine_manager_command("index_flagged")
+
+ topic = Topic.objects.get(pk=topic.pk)
+ post = Post.objects.get(pk=post.pk)
+
+ self.assertFalse(topic.search_engine_requires_index)
+ self.assertFalse(post.search_engine_requires_index)
+
+ published = PublishedContent.objects.get(content_pk=tuto.pk)
+ self.assertFalse(published.search_engine_requires_index)
+
+ results = self.search_engine_manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 4) # get the 4 results back
+
+ def tearDown(self):
+ super().tearDown()
+
+ # delete index:
+ self.search_engine_manager.clear_index()
+
+
+class SearchFilterTests(TestCase):
+ def test_search_filter(self):
+ f = SearchFilter()
+
+ f.add_exact_filter("foo", [1])
+ self.assertEqual(str(f), "(foo:=[1])")
+
+ f.add_exact_filter("bar", [3, 4, "bla"])
+ self.assertEqual(str(f), "(foo:=[1]) && (bar:=[3,4,bla])")
+
+ f.add_bool_filter("z", True)
+ self.assertEqual(str(f), "(foo:=[1]) && (bar:=[3,4,bla]) && (z:true)")
+
+ f = SearchFilter()
+
+ f.add_not_numerical_filter("forum_pk", [6, 7])
+ self.assertEqual(str(f), "(forum_pk:!=[6,7])")
diff --git a/zds/searchv2/tests/tests_views.py b/zds/search/tests/tests_views.py
similarity index 50%
rename from zds/searchv2/tests/tests_views.py
rename to zds/search/tests/tests_views.py
index 8e906a9098..d53f33a6bc 100644
--- a/zds/searchv2/tests/tests_views.py
+++ b/zds/search/tests/tests_views.py
@@ -1,19 +1,18 @@
-from zds import json_handler
+from copy import deepcopy
import datetime
-
-from elasticsearch_dsl import Search
-from elasticsearch_dsl.query import MatchAll
+from math import ceil
from django.conf import settings
+from django.contrib.auth.models import Group
from django.test import TestCase
+from django.test.utils import override_settings
from django.urls import reverse
-from django.contrib.auth.models import Group
+from zds import json_handler
from zds.forum.tests.factories import TopicFactory, PostFactory, Topic, Post, TagFactory
from zds.forum.tests.factories import create_category_and_forum
-
from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
-from zds.searchv2.models import ESIndexManager
+from zds.search.utils import SearchIndexManager
from zds.tutorialv2.tests.factories import (
PublishableContentFactory,
ContainerFactory,
@@ -26,7 +25,14 @@
from zds.tutorialv2.tests import TutorialTestMixin, override_for_contents
-@override_for_contents(ES_ENABLED=True, ES_SEARCH_INDEX={"name": "zds_search_test", "shards": 1, "replicas": 0})
+overridden_zds_app = deepcopy(settings.ZDS_APP)
+overridden_zds_app["content"]["extra_content_generation_policy"] = "NONE"
+overridden_zds_app["content"]["repo_private_path"] = settings.BASE_DIR / "contents-private-test"
+overridden_zds_app["content"]["repo_public_path"] = settings.BASE_DIR / "contents-public-test"
+
+
+@override_settings(ZDS_APP=overridden_zds_app)
+@override_for_contents(SEARCH_ENABLED=True)
class ViewsTests(TutorialTestMixin, TestCase):
def setUp(self):
settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend"
@@ -38,23 +44,31 @@ def setUp(self):
self.user = ProfileFactory().user
self.staff = StaffProfileFactory().user
- self.manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
+ self.manager = SearchIndexManager()
self.indexable = [FakeChapter, PublishedContent, Topic, Post]
- self.manager.reset_es_index(self.indexable)
- self.manager.setup_custom_analyzer()
- self.manager.refresh_index()
+ self.manager.reset_index()
+
+ def _index_everything(self):
+ self.manager.reset_index()
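+ # FakeChapter has no database table of its own: chapters are indexed
+ # along with their parent PublishedContent, so the model is skipped here.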
+ for model in self.indexable:
+ if model is FakeChapter:
+ continue
+ self.manager.indexing_of_model(model, force_reindexing=True, verbose=False)
def test_basic_search(self):
"""Basic search and filtering"""
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
+ tag = TagFactory(title="Clémentine à pépins") # with accents to make a different slug
+
# 1. Index and test search:
text = "test"
topic_1 = TopicFactory(forum=self.forum, author=self.user, title=text)
+ topic_1.tags.add(tag)
post_1 = PostFactory(topic=topic_1, author=self.user, position=1)
post_1.text = post_1.text_html = text
post_1.save()
@@ -63,6 +77,7 @@ def test_basic_search(self):
tuto = PublishableContentFactory(type="TUTORIAL")
tuto_draft = tuto.load_version()
+ tuto.tags.add(tag)
tuto.title = text
tuto.authors.add(self.user)
tuto.save()
@@ -81,21 +96,28 @@ def test_basic_search(self):
tuto.save()
# nothing has been indexed yet:
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 0)
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 0) # good!
# index
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model)
- self.manager.refresh_index()
+ self._index_everything()
result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
+ response = result.context["object_list"]
+
+ self.assertEqual(len(response), 4) # get 4 results
- self.assertEqual(response.hits.total, 4) # get 4 results
+ # Ugly hack to search only within the search results. The header menu
+ # shows the latest tags, but this section is cached: depending on which
+ # tests are run and in which order, the menu may or may not contain
+ # these tags.
+ content_search_results = result.content.decode()[result.content.decode().find("search-results") :]
+ # The tag appears 2 times: in two search results
+ self.assertEqual(content_search_results.count(tag.title), 2)
+ self.assertEqual(content_search_results.count(tag.slug), 2)
# 2. Test filtering:
topic_1 = Topic.objects.get(pk=topic_1.pk)
@@ -103,29 +125,93 @@ def test_basic_search(self):
published = PublishedContent.objects.get(pk=published.pk)
ids = {
- "topic": [topic_1.es_id],
- "post": [post_1.es_id],
- "content": [published.es_id, published.content_public_slug + "__" + chapter1.slug],
+ "topic": [topic_1.search_engine_id],
+ "post": [post_1.search_engine_id],
+ "publishedcontent": [published.search_engine_id, published.content_public_slug + "__" + chapter1.slug],
}
- search_groups = [k for k, v in settings.ZDS_APP["search"]["search_groups"].items()]
- group_to_model = {k: v[1] for k, v in settings.ZDS_APP["search"]["search_groups"].items()}
-
- for doc_type in search_groups:
+ for doc_type in settings.ZDS_APP["search"]["search_groups"]:
result = self.client.get(reverse("search:query") + "?q=" + text + "&models=" + doc_type, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
-
- self.assertEqual(response.hits.total, len(ids[doc_type])) # get 1 result of each …
+ response = result.context["object_list"]
+ self.assertEqual(len(response), len(ids[doc_type])) # get 1 result of each …
for i, r in enumerate(response):
- self.assertIn(r.meta.doc_type, group_to_model[doc_type]) # … and only of the right type …
- self.assertEqual(r.meta.id, ids[doc_type][i]) # … with the right id !
+ self.assertIn(
+ r["collection"], settings.ZDS_APP["search"]["search_groups"][doc_type][1]
+ ) # … and only of the right type …
+ self.assertEqual(r["document"]["id"], ids[doc_type][i]) # … with the right id !
+
+ def test_search_many_pages(self):
+ if not self.manager.connected:
+ return
+
+ text = "foo"
+ url = reverse("search:query") + "?q=" + text
+ results_per_page = settings.ZDS_APP["search"]["results_per_page"]
+
+ # 1. There are fewer than 250 results per collection
+ nb_topics = 150
+ nb_pages = ceil(2 * nb_topics / results_per_page)
+
+ for i in range(nb_topics):
+ topic = TopicFactory(forum=self.forum, author=self.user, title=text)
+ post = PostFactory(topic=topic, author=self.user, position=1)
+ post.text = post.text_html = text
+ post.save()
+
+ self._index_everything()
+
+ result = self.client.get(url, follow=False)
+ self.assertEqual(result.status_code, 200)
+ self.assertEqual(result.context["paginator"].num_pages, nb_pages)
+ self.assertEqual(len(result.context["object_list"]), results_per_page)
+
+ result = self.client.get(f"{url}&page={nb_pages}", follow=False)
+ self.assertEqual(result.status_code, 200)
+ self.assertEqual(len(result.context["object_list"]), 2 * nb_topics - (nb_pages - 1) * results_per_page)
+ self.assertFalse(result.context["has_more_results"])
+
+ # 2. There are more than 250 results per collection
+ nb_pages = ceil(2 * min(2 * nb_topics, 250) / results_per_page)
+
+ # Append 150 new topics, making it > 250
+ for i in range(nb_topics):
+ topic = TopicFactory(forum=self.forum, author=self.user, title=text)
+ post = PostFactory(topic=topic, author=self.user, position=1)
+ post.text = post.text_html = text
+ post.save()
+
+ self._index_everything()
+
+ result = self.client.get(url, follow=False)
+ self.assertEqual(result.status_code, 200)
+ self.assertEqual(result.context["paginator"].num_pages, nb_pages)
+ self.assertEqual(len(result.context["object_list"]), results_per_page)
+
+ result = self.client.get(f"{url}&page={nb_pages}", follow=False)
+ self.assertEqual(result.status_code, 200)
+ self.assertEqual(len(result.context["object_list"]), results_per_page)
+ self.assertTrue(result.context["has_more_results"])
+
+ def test_invalid_search(self):
+ if not self.manager.connected:
+ return
+
+ # If the query is "*", no result is displayed:
+ result = self.client.get(reverse("search:query") + "?q=*", follow=False)
+ self.assertEqual(result.status_code, 200)
+ self.assertEqual(len(result.context["object_list"]), 0)
+
+ # If there is no query parameter, there is no error:
+ result = self.client.get(reverse("search:query"), follow=False)
+ self.assertEqual(result.status_code, 200)
+ self.assertEqual(len(result.context["object_list"]), 0)
def test_get_similar_topics(self):
"""Get similar topics lists"""
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
text = "Clem ne se mange pas"
@@ -142,35 +228,53 @@ def test_get_similar_topics(self):
post_2.text = post_1.text_html = text
post_2.save()
- # 1. Should not get any result
+ # Create a hidden forum with a matching topic that should not show up
+ group = Group.objects.create(name="Les illuminatis anonymes de ZdS")
+ _, hidden_forum = create_category_and_forum(group)
+
+ topic_hidden = TopicFactory(forum=hidden_forum, author=self.staff, title=text)
+ post_hidden = PostFactory(topic=topic_hidden, author=self.user, position=1)
+ post_hidden.text = post_hidden.text_html = text
+ post_hidden.save()
+
+ # Should not get any result
result = self.client.get(reverse("search:similar") + "?q=est", follow=False)
self.assertEqual(result.status_code, 200)
content = json_handler.loads(result.content.decode("utf-8"))
self.assertEqual(len(content["results"]), 0)
- # index
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model)
- self.manager.refresh_index()
+ # Should not get a 500 if collections do not exist:
+ self.manager.clear_index()
+ result = self.client.get(reverse("search:similar") + "?q=mange", follow=False)
+ self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
- # 2. Should get exactly one result
+ # create collections and index content:
+ self._index_everything()
+
+ # Should get exactly one result
result = self.client.get(reverse("search:similar") + "?q=mange", follow=False)
self.assertEqual(result.status_code, 200)
content = json_handler.loads(result.content.decode("utf-8"))
self.assertEqual(len(content["results"]), 1)
- # 2. Should get exactly two results
+ # Should get exactly two results
result = self.client.get(reverse("search:similar") + "?q=Clem", follow=False)
self.assertEqual(result.status_code, 200)
content = json_handler.loads(result.content.decode("utf-8"))
self.assertEqual(len(content["results"]), 2)
- def test_hidden_post_are_not_result(self):
+ # Should not get any result:
+ result = self.client.get(reverse("search:similar") + "?q=*", follow=False)
+ self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
+
+ def test_hidden_post_are_not_in_results(self):
"""Hidden posts should not show up in the search results"""
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
# 1. Index and test search:
@@ -181,41 +285,42 @@ def test_hidden_post_are_not_result(self):
post_1.text = post_1.text_html = text
post_1.save()
- self.manager.es_bulk_indexing_of_model(Topic)
- self.manager.es_bulk_indexing_of_model(Post)
- self.manager.refresh_index()
+ self.manager.indexing_of_model(Topic)
+ self.manager.indexing_of_model(Post)
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 2) # indexing ok
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2) # indexing ok
post_1 = Post.objects.get(pk=post_1.pk)
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
+ response = result.context["object_list"]
- self.assertEqual(response.hits.total, 1)
- self.assertEqual(response[0].meta.id, post_1.es_id)
+ self.assertEqual(len(response), 1)
+ self.assertEqual(response[0]["document"]["get_absolute_url"], post_1.get_absolute_url())
+ self.assertEqual(response[0]["document"]["topic_pk"], post_1.topic.pk)
# 2. Hide, reindex and search again:
post_1.hide_comment_by_user(self.staff, "Un abus de pouvoir comme un autre ;)")
- self.manager.refresh_index()
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 0) # nothing in the results
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 0) # nothing in the results
def test_hidden_forums_give_no_results_if_user_not_allowed(self):
"""Long name, isn't ?"""
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
# 1. Create a hidden forum belonging to a hidden staff group.
@@ -232,18 +337,19 @@ def test_hidden_forums_give_no_results_if_user_not_allowed(self):
post_1.text = post_1.text_html = text
post_1.save()
- self.manager.es_bulk_indexing_of_model(Topic)
- self.manager.es_bulk_indexing_of_model(Post)
- self.manager.refresh_index()
+ self.manager.indexing_of_model(Topic)
+ self.manager.indexing_of_model(Post)
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 2) # indexing ok
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2) # indexing ok
# 2. search without connection and get not result
result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 0)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 0)
# 3. Connect with user (not a member of the group), search, and get no result
self.client.force_login(self.user)
@@ -251,8 +357,8 @@ def test_hidden_forums_give_no_results_if_user_not_allowed(self):
result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 0)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 0)
# 4. Connect with staff, search, and get the topic and the post
self.client.logout()
@@ -261,13 +367,13 @@ def test_hidden_forums_give_no_results_if_user_not_allowed(self):
result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2) # ok !
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 2) # ok !
def test_boosts(self):
"""Check if boosts are doing their job"""
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
# 1. Create topics (with identical titles), posts (with identical texts), an article and a tuto
@@ -336,231 +442,293 @@ def test_boosts(self):
published_opinion_picked = PublishedContent.objects.get(content_pk=opinion_picked.pk)
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model)
- self.manager.refresh_index()
+ self._index_everything()
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 10)
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 10) # indexing ok
# 2. Reset all boosts to 1
for doc_type in settings.ZDS_APP["search"]["boosts"]:
for key in settings.ZDS_APP["search"]["boosts"][doc_type]:
settings.ZDS_APP["search"]["boosts"][doc_type][key] = 1.0
+ # Reindex to update the weight
+ self._index_everything()
+
# 3. Test posts
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 3)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 3)
- # score are equals without boost:
- self.assertTrue(response[0].meta.score == response[1].meta.score == response[2].meta.score)
+ # Weights are equal without boost:
+ self.assertTrue(
+ response[0]["document"]["weight"] == response[1]["document"]["weight"] == response[2]["document"]["weight"]
+ )
settings.ZDS_APP["search"]["boosts"]["post"]["if_first"] = 2.0
+ # Reindex to update the weights
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 3)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 3)
- self.assertTrue(response[0].meta.score == response[1].meta.score > response[2].meta.score)
- self.assertEqual(response[2].meta.id, str(post_2_useful.pk)) # post 2 is the only one not first
+ self.assertTrue(
+ response[0]["document"]["weight"] == response[1]["document"]["weight"] > response[2]["document"]["weight"]
+ )
+ self.assertEqual(response[2]["document"]["id"], str(post_2_useful.pk)) # post 2 is the only one not first
settings.ZDS_APP["search"]["boosts"]["post"]["if_first"] = 1.0
settings.ZDS_APP["search"]["boosts"]["post"]["if_useful"] = 2.0
+ # Reindex to update the weights
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 3)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 3)
- self.assertTrue(response[0].meta.score > response[1].meta.score == response[2].meta.score)
- self.assertEqual(response[0].meta.id, str(post_2_useful.pk)) # post 2 is useful
+ self.assertTrue(
+ response[0]["document"]["weight"] > response[1]["document"]["weight"] == response[2]["document"]["weight"]
+ )
+ self.assertEqual(response[0]["document"]["id"], str(post_2_useful.pk)) # post 2 is useful
settings.ZDS_APP["search"]["boosts"]["post"]["if_useful"] = 1.0
settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_above_1"] = 2.0
+ # Reindex to update the weight
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 3)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 3)
- self.assertTrue(response[0].meta.score > response[1].meta.score == response[2].meta.score)
- self.assertEqual(response[0].meta.id, str(post_2_useful.pk)) # post 2 have a l/d ratio of 5/2
+ self.assertTrue(
+ response[0]["document"]["weight"] == response[1]["document"]["weight"] > response[2]["document"]["weight"]
+ )
+ self.assertEqual(response[0]["document"]["id"], str(post_2_useful.pk)) # post 2 have a l/d ratio of 5/2
settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_above_1"] = 1.0
settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_below_1"] = 2.0 # no one would do that in real life
+ # Reindex to update the weight
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 3)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 3)
- self.assertTrue(response[0].meta.score > response[1].meta.score == response[2].meta.score)
- self.assertEqual(response[0].meta.id, str(post_3_ld_below_1.pk)) # post 3 have a l/d ratio of 2/5
+ self.assertTrue(
+ response[0]["document"]["weight"] > response[1]["document"]["weight"] == response[2]["document"]["weight"]
+ )
+ self.assertEqual(response[0]["document"]["id"], str(post_3_ld_below_1.pk)) # post 3 have a l/d ratio of 2/5
settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_below_1"] = 1.0
+ # Reindex to update the weight
+ self._index_everything()
+
# 4. Test topics
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Topic.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Topic.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 2)
- # score are equals without boost:
- self.assertTrue(response[0].meta.score == response[1].meta.score)
+ # Weights are equal without boost:
+ self.assertTrue(response[0]["document"]["weight"] == response[1]["document"]["weight"])
settings.ZDS_APP["search"]["boosts"]["topic"]["if_sticky"] = 2.0
+ # Reindex to update the weight
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Topic.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Topic.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 2)
- self.assertTrue(response[0].meta.score > response[1].meta.score)
- self.assertEqual(response[0].meta.id, str(topic_1_solved_sticky.pk)) # topic 1 is sticky
+ self.assertTrue(response[0]["document"]["weight"] > response[1]["document"]["weight"])
+ self.assertEqual(response[0]["document"]["id"], str(topic_1_solved_sticky.pk)) # topic 1 is sticky
settings.ZDS_APP["search"]["boosts"]["topic"]["if_sticky"] = 1.0
settings.ZDS_APP["search"]["boosts"]["topic"]["if_solved"] = 2.0
+ # Reindex to update the weight
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Topic.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Topic.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 2)
- self.assertTrue(response[0].meta.score > response[1].meta.score)
- self.assertEqual(response[0].meta.id, str(topic_1_solved_sticky.pk)) # topic 1 is solved
+ self.assertTrue(response[0]["document"]["weight"] > response[1]["document"]["weight"])
+ self.assertEqual(response[0]["document"]["id"], str(topic_1_solved_sticky.pk)) # topic 1 is solved
settings.ZDS_APP["search"]["boosts"]["topic"]["if_solved"] = 1.0
settings.ZDS_APP["search"]["boosts"]["topic"]["if_locked"] = 2.0 # no one would do that in real life
+ # Reindex to update the weight
+ self._index_everything()
+
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Topic.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Topic.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 2)
- self.assertTrue(response[0].meta.score > response[1].meta.score)
- self.assertEqual(response[0].meta.id, str(topic_2_locked.pk)) # topic 2 is locked
+ self.assertTrue(response[0]["document"]["weight"] > response[1]["document"]["weight"])
+ self.assertEqual(response[0]["document"]["id"], str(topic_2_locked.pk)) # topic 2 is locked
settings.ZDS_APP["search"]["boosts"]["topic"]["if_locked"] = 1.0 # no one would do that in real life
+ # Reindex to update the weight
+ self._index_everything()
+
# 5. Test published contents
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 5)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 5)
- # score are equals without boost:
+ # Weights are equal without boost:
self.assertTrue(
- response[0].meta.score
- == response[1].meta.score
- == response[2].meta.score
- == response[3].meta.score
- == response[4].meta.score
+ response[0]["document"]["weight"]
+ == response[1]["document"]["weight"]
+ == response[2]["document"]["weight"]
+ == response[3]["document"]["weight"]
)
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_article"] = 2.0
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ # Reindex to update the weight
+ self._index_everything()
+
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 5)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 5)
- self.assertTrue(response[0].meta.score > response[1].meta.score)
- self.assertEqual(response[0].meta.id, str(published_article.pk)) # obvious
+ self.assertTrue(response[0]["document"]["weight"] > response[1]["document"]["weight"])
+ self.assertEqual(response[0]["document"]["id"], str(published_article.pk)) # obvious
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_article"] = 1.0
- settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_tutorial"] = 2.0
+ settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_medium_or_big_tutorial"] = 2.0
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ # Reindex to update the weight
+ self._index_everything()
+
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 5)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 5)
- self.assertTrue(response[0].meta.score > response[1].meta.score)
- self.assertEqual(response[0].meta.id, str(published_tuto.pk)) # obvious
+ self.assertTrue(response[0]["document"]["weight"] > response[1]["document"]["weight"])
+ self.assertEqual(response[0]["document"]["id"], str(published_tuto.pk)) # obvious
- settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_tutorial"] = 1.0
+ settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_medium_or_big_tutorial"] = 1.0
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion"] = 2.0
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion_not_picked"] = 4.0
# Note: in "real life", unpicked opinion would get a boost < 1.
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ # Reindex to update the weight
+ self._index_everything()
+
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 5)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 5)
- self.assertTrue(response[0].meta.score > response[1].meta.score > response[2].meta.score)
- self.assertEqual(response[0].meta.id, str(published_opinion_not_picked.pk)) # unpicked opinion got first
- self.assertEqual(response[1].meta.id, str(published_opinion_picked.pk))
+ self.assertTrue(
+ response[0]["document"]["weight"] > response[1]["document"]["weight"] > response[2]["document"]["weight"]
+ )
+ self.assertEqual(
+ response[0]["document"]["id"], str(published_opinion_not_picked.pk)
+ ) # unpicked opinion got first
+ self.assertEqual(response[1]["document"]["id"], str(published_opinion_picked.pk))
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion"] = 1.0
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion_not_picked"] = 1.0
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_medium_or_big_tutorial"] = 2.0
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ # Reindex to update the weight
+ self._index_everything()
+
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 5)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 5)
- self.assertTrue(response[0].meta.score > response[1].meta.score)
- self.assertEqual(response[0].meta.id, str(published_tuto.pk)) # obvious
+ self.assertTrue(response[0]["document"]["weight"] > response[1]["document"]["weight"])
+ self.assertEqual(response[0]["document"]["id"], str(published_tuto.pk)) # obvious
settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_medium_or_big_tutorial"] = 1.0
+ # Reindex to update the weight
+ self._index_everything()
+
# 6. Test global boosts
- # NOTE: score are NOT the same for all documents, no matter how hard it tries to, small differences exists
+ # NOTE: weights are NOT the same for all documents; no matter how hard we try, small differences exist
for model in self.indexable:
# set a huge number to overcome the small differences:
- settings.ZDS_APP["search"]["boosts"][model.get_es_document_type()]["global"] = 10.0
+ collection = model.get_search_document_type()
+ for key in settings.ZDS_APP["search"]["boosts"][collection]:
+ settings.ZDS_APP["search"]["boosts"][collection][key] = 10.0
+
+ # Reindex to update the weight
+ self._index_everything()
result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 10)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 10)
- self.assertEqual(response[0].meta.doc_type, model.get_es_document_type()) # obvious
+ self.assertEqual(response[0]["collection"], collection) # obvious
- settings.ZDS_APP["search"]["boosts"][model.get_es_document_type()]["global"] = 1.0
+ for key in settings.ZDS_APP["search"]["boosts"][collection]:
+ settings.ZDS_APP["search"]["boosts"][collection][key] = 1
def test_change_topic_impacts_posts(self):
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
# 1. Create a hidden forum belonging to a hidden group and add staff in it.
@@ -578,41 +746,42 @@ def test_change_topic_impacts_posts(self):
post_1.text = post_1.text_html = text
post_1.save()
- self.manager.es_bulk_indexing_of_model(Topic)
- self.manager.es_bulk_indexing_of_model(Post)
- self.manager.refresh_index()
+ self.manager.indexing_of_model(Topic)
+ self.manager.indexing_of_model(Post)
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 2) # indexing ok
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2) # indexing ok
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 1) # ok
- self.assertEqual(response[0].meta.doc_type, Post.get_es_document_type())
- self.assertEqual(response[0].forum_pk, self.forum.pk)
- self.assertEqual(response[0].topic_pk, topic_1.pk)
- self.assertEqual(response[0].topic_title, topic_1.title)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 1) # ok
+ self.assertEqual(response[0]["collection"], Post.get_search_document_type())
+ self.assertEqual(response[0]["document"]["forum_pk"], self.forum.pk)
+ self.assertEqual(response[0]["document"]["topic_pk"], topic_1.pk)
+ self.assertEqual(response[0]["document"]["topic_title"], topic_1.title)
# 3. Change topic title and reindex
topic_1.title = "new title"
topic_1.save()
- self.manager.es_bulk_indexing_of_model(Topic)
- self.manager.es_bulk_indexing_of_model(Post)
- self.manager.refresh_index()
+ self.manager.reset_index()
+ self.manager.indexing_of_model(Topic, force_reindexing=True, verbose=False)
+ self.manager.indexing_of_model(Post, force_reindexing=True, verbose=False)
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 1) # ok
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 1) # ok
- self.assertEqual(response[0].topic_title, topic_1.title) # title was changed
+ self.assertEqual(response[0]["document"]["topic_title"], topic_1.title) # title was changed
# 4. connect with staff and move topic
self.client.force_login(self.staff)
@@ -622,33 +791,33 @@ def test_change_topic_impacts_posts(self):
self.assertEqual(302, response.status_code)
- self.manager.es_bulk_indexing_of_model(Topic)
- self.manager.es_bulk_indexing_of_model(Post)
- self.manager.refresh_index()
+ self.manager.reset_index()
+ self.manager.indexing_of_model(Topic)
+ self.manager.indexing_of_model(Post)
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 1) # Note: without staff, would not get any results (see below)
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 1) # Note: without staff, would not get any results (see below)
- self.assertEqual(response[0].forum_pk, hidden_forum.pk) # post was updated with new forum
+ self.assertEqual(response[0]["document"]["forum_pk"], hidden_forum.pk) # post was updated with new forum
# 5. Topic is now hidden
self.client.logout()
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&models=" + Post.get_es_document_type(), follow=False
+ reverse("search:query") + "?q=" + text + "&models=" + Post.get_search_document_type(), follow=False
)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 0) # ok
+ response = result.context["object_list"]
+ self.assertEqual(len(response), 0) # ok
def test_change_publishedcontents_impacts_chapter(self):
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
# 1. Create middle-size content and index it
@@ -675,21 +844,26 @@ def test_change_publishedcontents_impacts_chapter(self):
tuto.public_version = published
tuto.save()
- self.manager.es_bulk_indexing_of_model(PublishedContent)
- self.manager.refresh_index()
+ self.manager.indexing_of_model(PublishedContent)
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 2) # indexing ok
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2) # indexing ok
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
+ response = result.context["object_list"]
- self.assertEqual(response.hits.total, 2)
+ self.assertEqual(len(response), 2)
- chapters = [r for r in response if r.meta.doc_type == "chapter"]
- self.assertEqual(chapters[0].meta.doc_type, FakeChapter.get_es_document_type())
- self.assertEqual(chapters[0].meta.id, published.content_public_slug + "__" + chapter1.slug)
+ result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
+ self.assertEqual(result.status_code, 200)
+ response = result.context["object_list"]
+
+ chapters = [r for r in response if r["collection"] == "chapter"]
+ self.assertEqual(chapters[0]["collection"], FakeChapter.get_search_document_type())
+ self.assertEqual(chapters[0]["document"]["id"], published.content_public_slug + "__" + chapter1.slug)
# 2. Change tuto: delete chapter and insert new one !
tuto = PublishableContent.objects.get(pk=tuto.pk)
@@ -712,28 +886,33 @@ def test_change_publishedcontents_impacts_chapter(self):
tuto.public_version = published
tuto.save()
- self.manager.es_bulk_indexing_of_model(PublishedContent)
- self.manager.refresh_index()
+ self.manager.reset_index()
+ self.manager.indexing_of_model(PublishedContent, force_reindexing=True, verbose=False)
+ self.manager.indexing_of_model(FakeChapter)
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 2) # 2 objects, not 3 !
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 2) # 2 objects, not 3!
- result = self.client.get(reverse("search:query") + "?q=" + text + "&models=content", follow=False)
+ result = self.client.get(reverse("search:query") + "?q=" + text + "&models=publishedcontent", follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
+ response = result.context["object_list"]
- contents = [r for r in response if r.meta.doc_type != "chapter"]
- self.assertEqual(response.hits.total, len(contents)) # no chapter found anymore
+ contents = [r for r in response if r["collection"] != "chapter"]
+ self.assertEqual(len(response), len(contents)) # no chapter found anymore
- result = self.client.get(reverse("search:query") + "?q=" + another_text + "&models=content", follow=False)
+ result = self.client.get(reverse("search:query") + "?q=" + another_text, follow=False)
self.assertEqual(result.status_code, 200)
- response = result.context["object_list"].execute()
- chapters = [r for r in response if r.meta.doc_type == "chapter"]
- self.assertEqual(response.hits.total, 1)
- self.assertEqual(chapters[0].meta.doc_type, FakeChapter.get_es_document_type())
- self.assertEqual(chapters[0].meta.id, published.content_public_slug + "__" + chapter2.slug) # got new chapter
+ response = result.context["object_list"]
+ chapters = [r for r in response if r["collection"] == "chapter"]
+ self.assertEqual(len(response), 1)
+ self.assertEqual(chapters[0]["collection"], FakeChapter.get_search_document_type())
+ self.assertEqual(
+ chapters[0]["document"]["id"], published.content_public_slug + "__" + chapter2.slug
+ ) # got new chapter
def test_opensearch(self):
result = self.client.get(reverse("search:opensearch"), follow=False)
@@ -746,7 +925,7 @@ def test_opensearch(self):
def test_upercase_and_lowercase_search_give_same_results(self):
"""Pretty self-explanatory function name, isn't it ?"""
- if not self.manager.connected_to_es:
+ if not self.manager.connected:
return
# 1. Index lowercase stuffs
@@ -824,135 +1003,88 @@ def test_upercase_and_lowercase_search_give_same_results(self):
tuto_uc.save()
# 3. Index and search:
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 0)
+ results = self.manager.search("*")
+ number_of_results = sum(result["found"] for result in results)
+ self.assertEqual(number_of_results, 0) # nothing has been indexed yet
# index
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model)
- self.manager.refresh_index()
+ self._index_everything()
result = self.client.get(reverse("search:query") + "?q=" + text_lc, follow=False)
self.assertEqual(result.status_code, 200)
- response_lc = result.context["object_list"].execute()
- self.assertEqual(response_lc.hits.total, 8)
+ response_lc = result.context["object_list"]
+ self.assertEqual(len(response_lc), 8)
result = self.client.get(reverse("search:query") + "?q=" + text_uc, follow=False)
self.assertEqual(result.status_code, 200)
- response_uc = result.context["object_list"].execute()
- self.assertEqual(response_uc.hits.total, 8)
+ response_uc = result.context["object_list"]
+ self.assertEqual(len(response_uc), 8)
for responses in zip(response_lc, response_uc): # we should get results in the same order!
- self.assertEqual(responses[0].meta.id, responses[1].meta.id)
-
- def test_category_and_subcategory_impact_search(self):
- """If two contents do not belong to the same (sub)category"""
-
- if not self.manager.connected_to_es:
- return
-
- text = "Did you ever hear the tragedy of Darth Plagueis The Wise?"
-
- # 1. Create two contents with different subcategories
- category_1 = "category 1"
- subcategory_1 = SubCategoryFactory(title=category_1)
- category_2 = "category 2"
- subcategory_2 = SubCategoryFactory(title=category_2)
-
- tuto_1 = PublishableContentFactory(type="TUTORIAL")
- tuto_1_draft = tuto_1.load_version()
-
- tuto_1.title = text
- tuto_1.authors.add(self.user)
- tuto_1.subcategory.add(subcategory_1)
- tuto_1.save()
-
- tuto_1_draft.description = text
- tuto_1_draft.repo_update_top_container(text, tuto_1.slug, text, text)
-
- chapter_1 = ContainerFactory(parent=tuto_1_draft, db_object=tuto_1)
- extract_1 = ExtractFactory(container=chapter_1, db_object=tuto_1)
- extract_1.repo_update(text, text)
-
- published_1 = publish_content(tuto_1, tuto_1_draft, is_major_update=True)
-
- tuto_1.sha_public = tuto_1_draft.current_version
- tuto_1.sha_draft = tuto_1_draft.current_version
- tuto_1.public_version = published_1
- tuto_1.save()
+ self.assertEqual(responses[0]["document"]["id"], responses[1]["document"]["id"])
- tuto_2 = PublishableContentFactory(type="TUTORIAL")
- tuto_2_draft = tuto_2.load_version()
-
- tuto_2.title = text
- tuto_2.authors.add(self.user)
- tuto_2.subcategory.add(subcategory_2)
- tuto_2.save()
-
- tuto_2_draft.description = text
- tuto_2_draft.repo_update_top_container(text, tuto_2.slug, text, text)
+ def test_suggestion_content(self):
+ text = "test"
- chapter_2 = ContainerFactory(parent=tuto_2_draft, db_object=tuto_2)
- extract_2 = ExtractFactory(container=chapter_2, db_object=tuto_2)
- extract_2.repo_update(text, text)
+ publishable_article1 = PublishedContentFactory(type="ARTICLE", title=f"{text} 1")
+ published_article1 = PublishedContent.objects.get(content_pk=publishable_article1.pk)
- published_2 = publish_content(tuto_2, tuto_2_draft, is_major_update=True)
+ publishable_article2 = PublishedContentFactory(type="ARTICLE", title=f"{text} 2")
+ published_article2 = PublishedContent.objects.get(content_pk=publishable_article2.pk)
- tuto_2.sha_public = tuto_2_draft.current_version
- tuto_2.sha_draft = tuto_2_draft.current_version
- tuto_2.public_version = published_2
- tuto_2.save()
+ # Should not get a 500 if collections do not exist:
+ self.manager.clear_index()
+ result = self.client.get(reverse("search:suggestion") + "?q=foo", follow=False)
+ self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
- # 2. Index:
- self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 0)
+ self._index_everything()
- # index
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model)
- self.manager.refresh_index()
-
- result = self.client.get(reverse("search:query") + "?q=" + text, follow=False)
+ # Without search term: no result
+ result = self.client.get(reverse("search:suggestion"), follow=False)
self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 4) # Ok
-
- # 3. Test
- result = self.client.get(
- reverse("search:query") + "?q=" + text + "&model=content&subcategory=" + subcategory_1.slug, follow=False
- )
+ # With empty query: no result
+ result = self.client.get(reverse("search:suggestion") + "?q=", follow=False)
+ self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
+ # No result is returned when '*' is searched:
+ result = self.client.get(reverse("search:suggestion") + "?q=*", follow=False)
self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2)
+ # Search term with zero match:
+ result = self.client.get(reverse("search:suggestion") + "?q=foo", follow=False)
+ self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
- self.assertEqual([int(r.meta.id) for r in response if r.meta.doc_type == "publishedcontent"][0], published_1.pk)
- self.assertEqual(
- [r.meta.id for r in response if r.meta.doc_type == "chapter"][0], tuto_1.slug + "__" + chapter_1.slug
- )
+ # Two matches:
+ result = self.client.get(reverse("search:suggestion") + "?q=" + text, follow=False)
+ self.assertEqual(result.status_code, 200)
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 2)
+ # Two matches, but they are both excluded:
result = self.client.get(
- reverse("search:query") + "?q=" + text + "&model=content&subcategory=" + subcategory_2.slug, follow=False
+ # /!\ This route expects IDs of publish**able** contents:
+ reverse("search:suggestion") + f"?q={text}&excluded={publishable_article1.pk},{publishable_article2.pk}",
+ follow=False,
)
-
self.assertEqual(result.status_code, 200)
-
- response = result.context["object_list"].execute()
- self.assertEqual(response.hits.total, 2)
-
- self.assertEqual([int(r.meta.id) for r in response if r.meta.doc_type == "publishedcontent"][0], published_2.pk)
- self.assertEqual(
- [r.meta.id for r in response if r.meta.doc_type == "chapter"][0], tuto_2.slug + "__" + chapter_2.slug
- )
+ content = json_handler.loads(result.content.decode("utf-8"))
+ self.assertEqual(len(content["results"]), 0)
def tearDown(self):
super().tearDown()
# delete index:
- self.manager.clear_es_index()
+ self.manager.clear_index()
diff --git a/zds/searchv2/urls.py b/zds/search/urls.py
similarity index 77%
rename from zds/searchv2/urls.py
rename to zds/search/urls.py
index 25ce8e879e..35cd51090b 100644
--- a/zds/searchv2/urls.py
+++ b/zds/search/urls.py
@@ -1,5 +1,5 @@
from django.urls import path
-from zds.searchv2.views import SearchView, opensearch, SimilarTopicsView, SuggestionContentView
+from zds.search.views import SearchView, opensearch, SimilarTopicsView, SuggestionContentView
app_name = "search"
diff --git a/zds/search/utils.py b/zds/search/utils.py
new file mode 100644
index 0000000000..551165ff8a
--- /dev/null
+++ b/zds/search/utils.py
@@ -0,0 +1,362 @@
+from datetime import datetime
+from functools import lru_cache
+import logging
+import re
+import time
+
+from django.apps import apps
+from django.conf import settings
+from django.db import transaction
+
+from bs4 import BeautifulSoup
+from typesense import Client as TypesenseClient
+
+from zds.search.models import AbstractSearchIndexableModel
+
+
+def date_to_timestamp_int(date):
+ """Converts a given datetime object to Unix timestamp.
+ The purpose of this function is for indexing datetime objects in Typesense.
+
+ :param date: the datetime object to be converted
+ :type date: datetime.datetime
+
+ :return: the Unix timestamp corresponding to the given datetime object
+ :rtype: int
+ """
+ return int(datetime.timestamp(date))
+
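+# A minimal usage sketch (names are illustrative): Typesense stores dates as
+# int64 values, so dates are converted before indexing, e.g.
+#   document["pubdate"] = date_to_timestamp_int(obj.pubdate)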
+
+def clean_html(text):
+ """Removes all HTML tags from the given text using BeautifulSoup.
+
+ :param text: the text to be cleaned
+ :type text: str
+
+ :return: the cleaned text with all HTML tags removed
+ :rtype: str
+ """
+ result = ""
+ if text is not None:
+ soup = BeautifulSoup(text, "html.parser")
+ formatted_html = soup.prettify()
+ result = re.sub(r"<[^>]*>", "", formatted_html).strip()
+ return result
+
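+# Illustrative example: clean_html("<p>Hello <em>world</em></p>") keeps only
+# the text content ("Hello" and "world", modulo whitespace introduced by
+# prettify()), so the search engine indexes plain text rather than raw HTML.
+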
+
+def get_all_indexable_classes(only_models=False):
+ """Return all indexable classes"""
+
+ classes = [model for model in apps.get_models() if issubclass(model, AbstractSearchIndexableModel)]
+ if not only_models:
+ # Import here instead of at the top of the file to avoid circular dependencies
+ from zds.tutorialv2.models.database import FakeChapter
+
+ classes.append(FakeChapter)
+ return classes
+
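+# Note (sketch): clear_index() below only flags real models for reindexing
+# (only_models=True), while reset_index() creates a collection for every
+# indexable class, including the table-less FakeChapter.
+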
+
+@lru_cache # to mimic a singleton design pattern: we need only one SearchIndexManager instance across the application
+class SearchIndexManager:
+ """Manage interactions with the search engine"""
+
+ def __init__(self, disable_timeout=False):
+ self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+ self.engine = None
+ self.connected = False
+
+ if settings.SEARCH_ENABLED:
+ if disable_timeout:
+ settings.SEARCH_CONNECTION["connection_timeout_seconds"] = None
+
+ self.engine = TypesenseClient(settings.SEARCH_CONNECTION)
+
+ try:
+ self.engine.api_call.get("/health")
+ self.connected = True
+ except Exception:
+ self.logger.warning("failed to connect to the search engine")
+
+ @property
+ def collections(self):
+ if not self.connected:
+ return []
+
+ return [c["name"] for c in self.engine.collections.retrieve()]
+
+ def clear_index(self):
+ """Remove all data and schemes from search engine and mark all objects as to be reindexed"""
+
+ if not self.connected:
+ return
+
+ for collection in self.collections:
+ self.engine.collections[collection].delete()
+
+ for model in get_all_indexable_classes(only_models=True):
+ assert issubclass(model, AbstractSearchIndexableModel)
+ objs = model.get_indexable_objects(force_reindexing=True)
+ objs.update(search_engine_requires_index=True)
+
+ def reset_index(self):
+ """Delete old collections and create new ones.
+ Then, set schemas for the different models.
+
+ :param models: list of models
+ :type models: list
+ """
+
+ if not self.connected:
+ return
+
+ self.clear_index()
+ for model in get_all_indexable_classes():
+ self.engine.collections.create(model.get_search_document_schema())
+
+ def indexing_of_model(self, model, force_reindexing=False, verbose=True):
+ """Index documents of a given model in batch, using the ``objects_per_batch`` property.
+
+ See https://typesense.org/docs/0.23.1/api/documents.html#index-multiple-documents
+
+ .. attention::
+ + Designed to work only with ``AbstractSearchIndexableModel``.
+
+ :param model: a model
+ :type model: AbstractSearchIndexableModel
+ :param force_reindexing: force all documents to be indexed
+ :type force_reindexing: bool
+ :param verbose: whether to display the progress
+ :type verbose: bool
+ :return: the number of indexed documents
+ :rtype: int
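+
+ A typical call (sketch): ``SearchIndexManager().indexing_of_model(Topic, force_reindexing=True)``,
+ where ``Topic`` is any concrete ``AbstractSearchIndexableModel``.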
+ """
+
+ def verbose_print(*args, **kwargs):
+ if force_reindexing and verbose:
+ print(*args, **kwargs)
+
+ def import_documents(objects, doc_type):
+ """'upsert' is the action that updates an existing document or
+ creates it, based on the 'id' field. We need this action, since
+ this function is called for initial indexing, but also to update
+ indexed records. See
+ https://typesense.org/docs/26.0/api/documents.html#index-multiple-documents
+ """
+ return self.engine.collections[doc_type].documents.import_(
+ [obj.get_document_source() for obj in objects], {"action": "upsert"}
+ )
+
+ if not self.connected:
+ return
+
+ if not issubclass(model, AbstractSearchIndexableModel):
+ return
+
+ indexed_counter = 0
+ if model.__name__ == "PublishedContent":
+ generate = model.get_indexable(force_reindexing)
+ while True:
+ with transaction.atomic():
+ try:
+ # fetch a batch (batch management is done in PublishedContent.get_indexable()):
+ objects = next(generate)
+ except StopIteration:
+ break
+
+ if not objects:
+ break
+
+ if hasattr(objects[0], "parent_id"):
+ model_to_update = objects[0].parent_model
+ pks = [o.parent_id for o in objects]
+ doc_type = "chapter"
+ else:
+ model_to_update = model
+ pks = [o.pk for o in objects]
+ doc_type = model.get_search_document_type()
+
+ answer = import_documents(objects, doc_type)
+ error = None
+ for a in answer:
+ if "success" not in a or a["success"] is not True:
+ error = a
+ break
+
+ if error is not None:
+ self.logger.warn(f"Error when indexing {doc_type} objects: {error}.")
+ else:
+ # mark all these objects as indexed at once
+ model_to_update.objects.filter(pk__in=pks).update(search_engine_requires_index=False)
+ indexed_counter += len(objects)
+ verbose_print("." * len(objects), end="", flush=True)
+ verbose_print("")
+ else:
+ objects_per_batch = getattr(model, "initial_search_index_batch_size", 1)
+ prev_obj_per_sec = None
+ last_pk = 0
+ object_source = model.get_indexable(force_reindexing)
+ doc_type = model.get_search_document_type()
+
+ while True:
+ with transaction.atomic():
+ time_start = time.time()
+
+ # fetch a batch
+ objects = list(object_source.filter(pk__gt=last_pk)[:objects_per_batch])
+
+ if not objects:
+ break
+
+ answer = import_documents(objects, doc_type)
+ error = None
+ for a in answer:
+ if "success" not in a or a["success"] is not True:
+ error = a
+ break
+
+ """The error management is not done correctly here: if a
+ batch contains only one error (for a single record), the
+ whole batch isn't marked as not requiring indexation
+ anymore and we move to the next batch.
+ However, how to handle this properly isn't straightforward:
+ what to do with bugged record: retry them? How many times?
+ Should the next batch start just after the first bugged
+ record?
+ In practice, errors do not happen, and if so, missed
+ records will be indexed during the next index_flagged.
+ """
+
+ if error is not None:
+ self.logger.warn(f"Error when indexing {doc_type} objects: {error}.")
+ else:
+ # mark all these objects as indexed at once
+ model.objects.filter(pk__in=[o.pk for o in objects]).update(search_engine_requires_index=False)
+ indexed_counter += len(objects)
+
+ # basic estimation of indexed objects per second
+ time_end = time.time()
+ last_batch_duration = time_end - time_start
+ obj_per_sec = round(float(objects_per_batch) / last_batch_duration, 2)
+ verbose_print(
+ f" {indexed_counter} so far ({obj_per_sec} obj/s, batch size: {objects_per_batch})"
+ )
+
+ if prev_obj_per_sec is not None and len(objects) == objects_per_batch:
+ # we are neither in the first nor in the last batch; let's see if we should adjust the batch size:
+ ratio = obj_per_sec / prev_obj_per_sec
+ # if we processed this batch 20% slower/faster than the previous one, adjust the batch size following an exponential rule
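+ # (e.g. with a batch size of 100: running 1.3x faster doubles it to 200, 0.7x halves it to 50)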
+ if ratio > 1.2 or (ratio < 0.8 and objects_per_batch > 1):
+ if ratio > 1:
+ # Performance was better, increase batch size to see if we can do even better with larger batch size:
+ objects_per_batch *= 2
+ else:
+ objects_per_batch //= 2
+ verbose_print(f" {round(ratio, 2)}x, new batch size: {objects_per_batch}")
+ prev_obj_per_sec = obj_per_sec
+
+ last_pk = objects[-1].pk
+
+ return indexed_counter
+
+ def delete_document(self, document):
+ """Delete a given document
+
+ :param document: the document to delete
+ :type document: AbstractSearchIndexable
+ """
+
+ if not self.connected:
+ return
+
+ doc_type = document.get_search_document_type()
+ doc_id = document.search_engine_id
+
+ if doc_id is None or doc_type not in self.collections:
+ # This condition is mainly here for tests
+ return
+
+ answer = self.engine.collections[doc_type].documents[doc_id].delete()
+ if "id" not in answer or answer["id"] != doc_id:
+ self.logger.warn(f"Error when deleting: {answer}.")
+
+ def delete_by_query(self, doc_type="", query=None):
+ """Delete a bunch of documents that match a specific filter_by condition.
+
+ See https://typesense.org/docs/0.23.1/api/documents.html#delete-by-query
+
+ .. attention ::
+ Call to this function must be done with great care!
+
+ :param doc_type: the document type
+ :type doc_type: str
+ :param query: the query matching all documents to be deleted (defaults to an empty ``filter_by``)
+ :type query: search request with filter_by in the search parameters
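+
+ Illustrative call (the field name is hypothetical):
+ ``delete_by_query("chapter", {"filter_by": "content_pk:=42"})``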
+ """
+
+ if not self.connected:
+ return
+
+ if query is None:
+ # avoid a mutable default argument; an empty filter_by is the previous default
+ query = {"filter_by": ""}
+
+ if doc_type not in self.collections:
+ # This condition is mainly here for tests
+ return
+
+ self.engine.collections[doc_type].documents.delete(query)
+
+ def search(self, request):
+ """Do a search in all collections (only used in tests)
+ :param request: a string, the search request
+ :type request: string
+ :return: formated search
+ """
+ if not self.connected:
+ return
+
+ search_requests = {"searches": []}
+ for collection in self.collections:
+ search_requests["searches"].append({"collection": collection, "q": request})
+
+ return self.engine.multi_search.perform(search_requests, None)["results"]
+
+
+class SearchFilter:
+ """Class to generate filters for Typesense queries.
+
+ See https://typesense.org/docs/26.0/api/search.html#filter-parameters
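+
+ A short usage sketch (field names are illustrative):
+
+ >>> f = SearchFilter()
+ >>> f.add_exact_filter("forum_pk", [1, 3])
+ >>> f.add_bool_filter("is_locked", False)
+ >>> str(f)
+ '(forum_pk:=[1,3]) && (is_locked:false)'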
+ """
+
+ def __init__(self):
+ self.filter = ""
+
+ def __str__(self):
+ return self.filter
+
+ def _add_filter(self, f):
+ if self.filter != "":
+ self.filter += " && "
+ self.filter += f"({f})"
+
+ def add_exact_filter(self, field: str, values: list):
+ """
+ Filter documents such that the field has one of the given values.
+
+ :param field: Name of the field to apply the filter on.
+ :type field: str
+ :param values: A list of values the field can have.
+ :type values: list
+ """
+ self._add_filter(f"{field}:=[" + ",".join(map(str, values)) + "]")
+
+ def add_bool_filter(self, field: str, value: bool):
+ self._add_filter(f"{field}:{str(value).lower()}")
+
+ def add_not_numerical_filter(self, field: str, values: list[int]):
+ """
+ Filter documents such that the field has *none* of the given values.
+
+ :param field: Name of the field to filter.
+ :type field: str
+ :param values: A list of integer values the field cannot have.
+ :type values: list[int]
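+
+ For instance, ``add_not_numerical_filter("content_pk", [12, 34])`` appends
+ ``(content_pk:!=[12,34])`` to the filter.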
+ """
+ self._add_filter(f"{field}:!=[" + ",".join(map(str, values)) + "]")
diff --git a/zds/search/views.py b/zds/search/views.py
new file mode 100644
index 0000000000..fd29c1b853
--- /dev/null
+++ b/zds/search/views.py
@@ -0,0 +1,257 @@
+from datetime import datetime
+import logging
+
+from django.conf import settings
+from django.core.exceptions import PermissionDenied
+from django.contrib import messages
+from django.http import JsonResponse
+from django.utils.translation import gettext_lazy as _
+from django.shortcuts import render
+from django.urls import reverse
+from django.views.generic.base import View
+from django.views.generic.detail import SingleObjectMixin
+
+from zds.forum.models import Topic, Post
+from zds.search.forms import SearchForm
+from zds.search.utils import SearchFilter, SearchIndexManager
+from zds.tutorialv2.models.database import FakeChapter, PublishedContent
+from zds.utils.paginator import ZdSPagingListView
+
+
+logger = logging.getLogger(__name__)
+
+
+class SimilarTopicsView(View):
+ """
+ This view suggests existing topics similar to the one a user is about to
+ create on a forum, so as to avoid creating a new topic on a subject
+ already discussed on the forum.
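+
+ It returns a JSON payload of the form ``{"results": [{"id": ..., "url": ...,
+ "title": ..., "subtitle": ..., "forumTitle": ..., "forumUrl": ...,
+ "pubdate": ...}]}``, with at most ``max_similar_topics`` entries.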
+ """
+
+ def get(self, request, *args, **kwargs):
+ results = []
+
+ search_engine_manager = SearchIndexManager()
+ if search_engine_manager.connected:
+ if "topic" in search_engine_manager.collections:
+ search_query = request.GET.get("q", "")
+
+ if search_query and "*" not in search_query:
+ max_similar_topics = settings.ZDS_APP["forum"]["max_similar_topics"]
+
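+ # The "|" below is the dict union operator (Python 3.9+): it merges these base
+ # parameters with the model-specific search parameters from get_search_query().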
+ search_parameters = {
+ "q": search_query,
+ "page": 1,
+ "per_page": max_similar_topics,
+ } | Topic.get_search_query(self.request.user)
+
+ hits = search_engine_manager.engine.collections["topic"].documents.search(search_parameters)["hits"]
+ assert len(hits) <= max_similar_topics
+
+ for hit in hits:
+ document = hit["document"]
+ result = {
+ "id": document["forum_pk"],
+ "url": str(document["get_absolute_url"]),
+ "title": str(document["title"]),
+ "subtitle": str(document["subtitle"]),
+ "forumTitle": str(document["forum_title"]),
+ "forumUrl": str(document["forum_get_absolute_url"]),
+ "pubdate": str(datetime.fromtimestamp(document["pubdate"])),
+ }
+ results.append(result)
+ else:
+ logger.warning("SimilarTopicView called, but there is no 'topic' collection.")
+
+ return JsonResponse({"results": results})
+
+
+class SuggestionContentView(View):
+ """
+ At the end of a publication, staff members can suggest other contents of
+ the site. When they want to add a suggestion, they type the name of the
+ content to suggest in a text field; this view then uses the search
+ engine to propose matching contents.
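+
+ For example, a GET request on the ``search:suggestion`` route with
+ ``?q=Python&excluded=12,34`` returns a JSON payload of the form
+ ``{"results": [{"id": ..., "title": ...}]}``, leaving out contents 12 and 34.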
+ """
+
+ def get(self, request, *args, **kwargs):
+ results = []
+
+ search_engine_manager = SearchIndexManager()
+
+ if search_engine_manager.connected:
+ search_query = request.GET.get("q", "")
+
+ if "publishedcontent" in search_engine_manager.collections:
+ if search_query and "*" not in search_query:
+ max_suggestion_search_results = settings.ZDS_APP["content"]["max_suggestion_search_results"]
+
+ search_parameters = {
+ "q": search_query,
+ "page": 1,
+ "per_page": max_suggestion_search_results,
+ } | PublishedContent.get_search_query()
+
+ # We exclude contents already picked as a suggestion:
+ excluded_content_ids = request.GET.get("excluded", "")
+ if excluded_content_ids:
+ filter_by = SearchFilter()
+ filter_by.add_not_numerical_filter("content_pk", excluded_content_ids.split(","))
+ search_parameters["filter_by"] = str(filter_by)
+
+ hits = search_engine_manager.engine.collections["publishedcontent"].documents.search(
+ search_parameters
+ )["hits"]
+ assert len(hits) <= max_suggestion_search_results
+
+ for hit in hits:
+ document = hit["document"]
+ result = {
+ "id": document["content_pk"],
+ "title": str(document["title"]),
+ }
+ results.append(result)
+ else:
+ logger.warning("SuggestionContentView called, but there is no 'publishedcontent' collection.")
+
+ return JsonResponse({"results": results})
+
+
+class SearchView(ZdSPagingListView):
+ """Search view."""
+
+ template_name = "search/search.html"
+ paginate_by = settings.ZDS_APP["search"]["results_per_page"]
+
+ search_form = None
+ search_query = None
+ has_more_results = False
+
+ def get(self, request, *args, **kwargs):
+ """Overridden to catch the request and fill the form."""
+
+ if "q" in request.GET:
+ self.search_query = request.GET["q"]
+
+ self.search_form = SearchForm(data=self.request.GET)
+
+ if self.search_query and not self.search_form.is_valid():
+ raise PermissionDenied("research form is invalid")
+
+ return super().get(request, *args, **kwargs)
+
+ def get_queryset(self):
+ result = []
+
+ search_engine_manager = SearchIndexManager()
+
+ if not search_engine_manager.connected:
+ messages.warning(self.request, _("Impossible de se connecter au moteur de recherche"))
+ elif self.search_query and "*" in self.search_query:
+ # '*' is used as the search string to return all documents:
+ # https://typesense.org/docs/0.23.1/api/search.html#query-parameters
+ messages.warning(self.request, _("Les termes recherchés ne peuvent pas contenir le caractère '*'."))
+ elif self.search_query:
+ search_collections = self.search_form.cleaned_data["search_collections"]
+
+ searches = {
+ "publishedcontent": PublishedContent.get_search_query(),
+ "chapter": FakeChapter.get_search_query(),
+ "topic": Topic.get_search_query(self.request.user),
+ "post": Post.get_search_query(self.request.user),
+ }
+
+ search_requests = {"searches": []}
+ for collection in search_collections:
+ searches[collection]["collection"] = collection
+ search_requests["searches"].append(searches[collection])
+
+ """
+ Here, we reach a limitation of multicollection search of Typesense.
+ We need to search in each collection, then merge the results,
+ compute the final_score and finally sort according to the score.
+ If we want to be able to show all results, we need to loop over the
+ number of pages of search results found by Typesense, making a
+ Typesense request in each iteration. Then we need to take into
+ account which page of results the user requested. The simplest is
+ to fetch all pages from Typesense, sort all results and then send
+ the slice corresponding to the page requested by the user. However,
+ this is a considerable waste of resources: fetching a lot of data
+ while we need only a small subset of it.
+
+ For all these reasons, we choose to set a maximum of 250 results per
+ collection. We still paginate the results presented to the user. If
+ the user has to go to the last page of the 250 × #collections
+ results, it probably means they should refine their search query...
+
+ Having a 1:1 relationship between pages returned by Typesense and
+ pages sent to the user would probably be possible if all data we
+ can search in were in only one collection.
+ """
+
+ common_search_params = {
+ "q": self.search_query,
+ # Do not treat the last word of the query as a prefix, but as a whole word:
+ "prefix": "false",
+ "highlight_start_tag": '',
+ "per_page": 250, # this is the maximum
+ "page": "1",
+ }
+
+ search_results = search_engine_manager.engine.multi_search.perform(search_requests, common_search_params)[
+ "results"
+ ]
+ for i in range(len(search_results)):
+ if "error" in search_results[i]:
+ logger.warning(f"Typesearch answered with an error: {search_results[i]['error']}")
+ messages.warning(
+ self.request, _(f"Le moteur de recherche a renvoyé une erreur: {search_results[i]['error']}")
+ )
+ break
+
+ if "hits" in search_results[i]:
+ for entry in search_results[i]["hits"]:
+ if "text_match" in entry:
+ entry["collection"] = search_collections[i]
+ entry["document"]["final_score"] = entry["text_match"] * entry["document"]["weight"]
+ entry["document"]["highlights"] = entry["highlights"][0]
+
+ if "tags" in entry["document"] and "tag_slugs" in entry["document"]:
+ assert len(entry["document"]["tags"]) == len(entry["document"]["tag_slugs"])
+ entry["document"]["tags"] = [
+ {"title": entry["document"]["tags"][i], "slug": entry["document"]["tag_slugs"][i]}
+ for i in range(len(entry["document"]["tags"]))
+ ]
+
+ result.append(entry)
+
+ if not self.has_more_results and search_results[i]["found"] > common_search_params["per_page"]:
+ self.has_more_results = True
+
+ result.sort(key=lambda entry: entry["document"]["final_score"], reverse=True)
+
+ return result
+
+ def get_context_data(self, **kwargs):
+ context = super().get_context_data(**kwargs)
+ context["form"] = self.search_form
+ context["has_query"] = self.search_query is not None
+ context["has_more_results"] = self.has_more_results
+ return context
+
+
+def opensearch(request):
+ """Generate OpenSearch Description file."""
+
+ return render(
+ request,
+ "search/opensearch.xml",
+ {
+ "site_name": settings.ZDS_APP["site"]["literal_name"],
+ "site_url": settings.ZDS_APP["site"]["url"],
+ "email_contact": settings.ZDS_APP["site"]["email_contact"],
+ "language": settings.LANGUAGE_CODE,
+ "search_url": settings.ZDS_APP["site"]["url"] + reverse("search:query"),
+ },
+ content_type="application/opensearchdescription+xml",
+ )
diff --git a/zds/searchv2/__init__.py b/zds/searchv2/__init__.py
deleted file mode 100644
index fbfe3bf3cf..0000000000
--- a/zds/searchv2/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from elasticsearch import TransportError
-from elasticsearch_dsl.connections import connections
-
-from django.conf import settings
-
-DEFAULT_ES_CONNECTIONS = {
- "default": {
- "hosts": ["localhost:9200"],
- }
-}
-
-CONNECTIONS = getattr(settings, "ES_CONNECTIONS", DEFAULT_ES_CONNECTIONS)
-ENABLED = getattr(settings, "ES_ENABLED", False)
-
-
-def setup_es_connections():
- """Create connection(s) to Elasticsearch from parameters defined in the settings.
-
- CONNECTIONS is a dict, where the keys are connection aliases and the values are parameters to the
- ``elasticsearch_dsl.connections.connection.create_connection()`` function (which are directly passed to an
- Elasticsearch object, see http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch for the options).
-
- """
-
- try:
- for alias, params in list(CONNECTIONS.items()):
- connections.create_connection(alias, **params)
- except TransportError:
- pass
-
-
-if ENABLED:
- setup_es_connections()
diff --git a/zds/searchv2/management/commands/es_manager.py b/zds/searchv2/management/commands/es_manager.py
deleted file mode 100644
index 9f47634456..0000000000
--- a/zds/searchv2/management/commands/es_manager.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from django.core.management.base import BaseCommand, CommandError
-from django.conf import settings
-
-from zds.searchv2.models import ESIndexManager, get_django_indexable_objects
-from zds.tutorialv2.models.database import FakeChapter
-
-
-class Command(BaseCommand):
- help = "Index data in ES and manage them"
-
- index_manager = None
- models = get_django_indexable_objects()
-
- def __init__(self, *args, **kwargs):
- """Overridden because FakeChapter needs to be present for mapping.
- Also, its mapping needs to be defined before the one of PublishedContent for parenting reasons (!!!).
- """
-
- super().__init__(*args, **kwargs)
- self.models.insert(0, FakeChapter)
-
- self.index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- if not self.index_manager.connected_to_es:
- raise Exception("Unable to connect to Elasticsearch, aborting.")
-
- def add_arguments(self, parser):
- parser.add_argument(
- "action", type=str, help="action to perform", choices=["setup", "clear", "index_all", "index_flagged"]
- )
-
- def handle(self, *args, **options):
- if options["action"] == "setup":
- self.setup_es()
- elif options["action"] == "clear":
- self.clear_es()
- elif options["action"] == "index_all":
- self.index_documents(force_reindexing=True)
- elif options["action"] == "index_flagged":
- self.index_documents(force_reindexing=False)
- else:
- raise CommandError("unknown action {}".format(options["action"]))
-
- def setup_es(self):
- self.index_manager.reset_es_index(self.models)
- self.index_manager.setup_custom_analyzer()
-
- self.index_manager.refresh_index()
-
- def clear_es(self):
- self.index_manager.clear_es_index()
-
- for model in self.models:
- self.index_manager.clear_indexing_of_model(model)
-
- def index_documents(self, force_reindexing=False):
- if force_reindexing:
- self.setup_es() # remove all previous data
-
- for model in self.models:
- if model is FakeChapter:
- continue
-
- if force_reindexing:
- print(f"- indexing {model.get_es_document_type()}s")
-
- indexed_counter = self.index_manager.es_bulk_indexing_of_model(model, force_reindexing=force_reindexing)
- if force_reindexing:
- print(f" {indexed_counter}\titems indexed")
-
- self.index_manager.refresh_index()
diff --git a/zds/searchv2/models.py b/zds/searchv2/models.py
deleted file mode 100644
index d1e8458ab3..0000000000
--- a/zds/searchv2/models.py
+++ /dev/null
@@ -1,689 +0,0 @@
-from functools import partial
-import logging
-import time
-
-from django.apps import apps
-from django.db import models
-from django.conf import settings
-
-from elasticsearch.helpers import parallel_bulk
-from elasticsearch import ConnectionError
-from elasticsearch_dsl import Mapping
-from elasticsearch_dsl.query import MatchAll
-from elasticsearch_dsl.connections import connections
-
-from django.db import transaction
-
-
-def es_document_mapper(force_reindexing, index, obj):
- action = "update" if obj.es_already_indexed and not force_reindexing else "index"
- return obj.get_es_document_as_bulk_action(index, action)
-
-
-class AbstractESIndexable:
- """Mixin for indexable objects.
-
- Define a number of different functions that can be overridden to tune the behavior of indexing into elasticsearch.
-
- You (may) need to override :
-
- - ``get_indexable()`` ;
- - ``get_mapping()`` (not mandatory, but otherwise, ES will choose the mapping by itself) ;
- - ``get_document()`` (not mandatory, but may be useful if data differ from mapping or extra stuffs need to be done).
-
- You also need to maintain ``es_id`` and ``es_already_indexed`` for bulk indexing/updating (if any).
- """
-
- es_already_indexed = False
- es_id = ""
-
- objects_per_batch = 100
-
- @classmethod
- def get_es_document_type(cls):
- """value of the ``_type`` field in the index"""
- content_type = cls.__name__.lower()
-
- # fetch parents
- for base in cls.__bases__:
- if issubclass(base, AbstractESIndexable) and base != AbstractESDjangoIndexable:
- content_type = base.__name__.lower() + "_" + content_type
-
- return content_type
-
- @classmethod
- def get_es_mapping(self):
- """Setup mapping (data scheme).
-
- .. note::
- You will probably want to change the analyzer and boost value.
- Also consider the ``index='not_analyzed'`` option to improve performances.
-
- See https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html#mappings
-
- .. attention::
- You *may* want to override this method (otherwise ES choose the mapping by itself).
-
- :return: mapping object
- :rtype: elasticsearch_dsl.Mapping
- """
-
- es_mapping = Mapping(self.get_es_document_type())
- return es_mapping
-
- @classmethod
- def get_es_indexable(cls, force_reindexing=False):
- """Return objects to index.
-
- .. attention::
- You need to override this method (otherwise nothing will be indexed).
-
- :param force_reindexing: force to return all objects, even if they may already be indexed.
- :type force_reindexing: bool
- :rtype: list
- """
-
- return []
-
- def get_es_document_source(self, excluded_fields=None):
- """Create a document from the variable of the class, based on the mapping.
-
- .. attention::
- You may need to override this method if the data differ from the mapping for some reason.
-
- :param excluded_fields: exclude some field from the default method
- :type excluded_fields: list
- :return: document
- :rtype: dict
- """
-
- cls = self.__class__
- fields = list(cls.get_es_mapping().properties.properties.to_dict().keys())
-
- data = {}
-
- for field in fields:
- if excluded_fields and field in excluded_fields:
- data[field] = None
- continue
-
- v = getattr(self, field, None)
- if callable(v):
- v = v()
-
- data[field] = v
-
- return data
-
- def get_es_document_as_bulk_action(self, index, action="index"):
- """Create a document formatted for a ``_bulk`` operation. Formatting is done based on action.
-
- See https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html.
-
- :param index: index in witch the document will be inserted
- :type index: str
- :param action: action, either "index", "update" or "delete"
- :type action: str
- :return: the document
- :rtype: dict
- """
-
- if action not in ["index", "update", "delete"]:
- raise ValueError("action must be `index`, `update` or `delete`")
-
- document = {"_op_type": action, "_index": index, "_type": self.get_es_document_type()}
-
- if action == "index":
- if self.es_id:
- document["_id"] = self.es_id
- document["_source"] = self.get_es_document_source()
- elif action == "update":
- document["_id"] = self.es_id
- document["doc"] = self.get_es_document_source()
- elif action == "delete":
- document["_id"] = self.es_id
-
- return document
-
-
-class AbstractESDjangoIndexable(AbstractESIndexable, models.Model):
- """Version of AbstractESIndexable for a Django object, with some improvements :
-
- - Already include ``pk`` in mapping ;
- - Match ES ``_id`` field and ``pk`` ;
- - Override ``es_already_indexed`` to a database field.
- - Define a ``es_flagged`` field to restrict the number of object to be indexed ;
- - Override ``save()`` to manage the field ;
- - Define a ``get_es_django_indexable()`` method that can be overridden to change the queryset to fetch object.
- """
-
- class Meta:
- abstract = True
-
- es_flagged = models.BooleanField("Doit être (ré)indexé par ES", default=True, db_index=True)
- es_already_indexed = models.BooleanField("Déjà indexé par ES", default=False, db_index=True)
-
- def __init__(self, *args, **kwargs):
- """Override to match ES ``_id`` field and ``pk``"""
- super().__init__(*args, **kwargs)
- self.es_id = str(self.pk)
-
- @classmethod
- def get_es_mapping(cls):
- """Overridden to add pk into mapping.
-
- :return: mapping object
- :rtype: elasticsearch_dsl.Mapping
- """
-
- es_mapping = super().get_es_mapping()
- es_mapping.field("pk", "integer")
- return es_mapping
-
- @classmethod
- def get_es_django_indexable(cls, force_reindexing=False):
- """Method that can be overridden to filter django objects from database based on any criterion.
-
- :param force_reindexing: force to return all objects, even if they may be already indexed.
- :type force_reindexing: bool
- :return: query
- :rtype: django.db.models.query.QuerySet
- """
-
- query = cls.objects
-
- if not force_reindexing:
- query = query.filter(es_flagged=True)
-
- return query
-
- @classmethod
- def get_es_indexable(cls, force_reindexing=False):
- """Override ``get_es_indexable()`` in order to use the Django querysets and batch objects.
-
- :return: a queryset
- :rtype: django.db.models.query.QuerySet
- """
-
- return cls.get_es_django_indexable(force_reindexing).order_by("pk").all()
-
- def save(self, *args, **kwargs):
- """Override the ``save()`` method to flag the object if saved
- (which assumes a modification of the object, so the need to reindex).
-
- .. note::
- Flagging can be prevented using ``save(es_flagged=False)``.
- """
-
- self.es_flagged = kwargs.pop("es_flagged", True)
-
- return super().save(*args, **kwargs)
-
-
-def delete_document_in_elasticsearch(instance):
- """Delete a ESDjangoIndexable from ES database.
- Must be implemented by all classes that derive from AbstractESDjangoIndexable.
-
- :param instance: the document to delete
- :type instance: AbstractESIndexable
- """
-
- index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- if index_manager.index_exists:
- index_manager.delete_document(instance)
- index_manager.refresh_index()
-
-
-def get_django_indexable_objects():
- """Return all indexable objects registered in Django"""
- return [model for model in apps.get_models() if issubclass(model, AbstractESDjangoIndexable)]
-
-
-class NeedIndex(Exception):
- """Raised when an action requires an index, but it is not created (yet)."""
-
- pass
-
-
-class ESIndexManager:
- """Manage a given index with different taylor-made functions"""
-
- def __init__(self, name, shards=5, replicas=0, connection_alias="default"):
- """Create a manager for a given index
-
- :param name: the index name
- :type name: str
- :param shards: number of shards
- :type shards: int
- :param replicas: number of replicas
- :type replicas: int
- :param connection_alias: the alias for connection
- :type connection_alias: str
- """
-
- self.index = name
- self.index_exists = False
-
- self.number_of_shards = shards
- self.number_of_replicas = replicas
-
- self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}:{self.index}")
-
- self.es = None
- self.connected_to_es = False
-
- if settings.ES_ENABLED:
- self.es = connections.get_connection(alias=connection_alias)
- self.connected_to_es = True
-
- # test connection:
- try:
- self.es.info()
- except ConnectionError:
- self.connected_to_es = False
- self.logger.warn("failed to connect to ES cluster")
- else:
- self.logger.info("connected to ES cluster")
-
- if self.connected_to_es:
- self.index_exists = self.es.indices.exists(self.index)
-
- def clear_es_index(self):
- """Clear index"""
-
- if not self.connected_to_es:
- return
-
- if self.es.indices.exists(self.index):
- self.es.indices.delete(self.index)
- self.logger.info("index cleared")
-
- self.index_exists = False
-
- def reset_es_index(self, models):
- """Delete old index and create an new one (with the same name). Setup the number of shards and replicas.
- Then, set mappings for the different models.
-
- :param models: list of models
- :type models: list
- :param number_shards: number of shards
- :type number_shards: int
- :param number_replicas: number of replicas
- :type number_replicas: int
- """
-
- if not self.connected_to_es:
- return
-
- self.clear_es_index()
-
- mappings_def = {}
-
- for model in models:
- mapping = model.get_es_mapping()
- mappings_def.update(mapping.to_dict())
-
- self.es.indices.create(
- self.index,
- body={
- "settings": {"number_of_shards": self.number_of_shards, "number_of_replicas": self.number_of_replicas},
- "mappings": mappings_def,
- },
- )
-
- self.index_exists = True
-
- self.logger.info("index created")
-
- def setup_custom_analyzer(self):
- """Override the default analyzer.
-
- See https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis.html.
-
- Our custom analyzer is based on the "french" analyzer
- (https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#french-analyzer)
- but with some difference
-
- - "custom_tokenizer", to deal with punctuation and all kind of (non-breaking) spaces, but keep dashes and
- other stuffs intact (in order to keep "c++" or "c#", for example).
- - "protect_c_language", a pattern replace filter to prevent "c" from being wiped out by the stopper.
- - "french_keywords", a keyword stopper prevent some programming language from being stemmed.
-
- .. warning::
-
- You need to run ``manage.py es_manager index_all`` if you modified this !!
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- self.es.indices.close(self.index)
-
- document = {
- "analysis": {
- "filter": {
- "french_elision": {
- "type": "elision",
- "articles_case": True,
- "articles": [
- "l",
- "m",
- "t",
- "qu",
- "n",
- "s",
- "j",
- "d",
- "c",
- "jusqu",
- "quoiqu",
- "lorsqu",
- "puisqu",
- ],
- },
- "protect_c_language": {"type": "pattern_replace", "pattern": "^c$", "replacement": "langage_c"},
- "french_stop": {"type": "stop", "stopwords": "_french_"},
- "french_keywords": {
- "type": "keyword_marker",
- "keywords": settings.ZDS_APP["search"]["mark_keywords"],
- },
- "french_stemmer": {"type": "stemmer", "language": "light_french"},
- },
- "tokenizer": {
- "custom_tokenizer": {
- "type": "pattern",
- "pattern": "[ .,!?%\u2026\u00AB\u00A0\u00BB\u202F\uFEFF\u2013\u2014\n]",
- }
- },
- "analyzer": {
- "default": {
- "tokenizer": "custom_tokenizer",
- "filter": [
- "lowercase",
- "protect_c_language",
- "french_elision",
- "french_stop",
- "french_keywords",
- "french_stemmer",
- ],
- "char_filter": [
- "html_strip",
- ],
- }
- },
- }
- }
-
- self.es.indices.put_settings(index=self.index, body=document)
- self.es.indices.open(self.index)
-
- self.logger.info("setup analyzer")
-
- def clear_indexing_of_model(self, model):
- """Nullify the indexing of a given model by setting ``es_already_index=False`` to all objects.
-
- Use full updating for ``AbstractESDjangoIndexable``, instead of saving all of them.
-
- :param model: the model
- :type model: class
- """
-
- if issubclass(model, AbstractESDjangoIndexable): # use a global update with Django
- objs = model.get_es_django_indexable(force_reindexing=True)
- objs.update(es_flagged=True, es_already_indexed=False)
- else:
- for objects in model.get_es_indexable(force_reindexing=True):
- for obj in objects:
- obj.es_already_indexed = False
-
- self.logger.info(f"unindex {model.get_es_document_type()}")
-
- def es_bulk_indexing_of_model(self, model, force_reindexing=False):
- """Perform a bulk action on documents of a given model. Use the ``objects_per_batch`` property to index.
-
- See http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.bulk
- and http://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.parallel_bulk
-
- .. attention::
- + Currently only implemented with "index" and "update" !
- + Currently only working with ``AbstractESDjangoIndexable``.
-
- :param model: and model
- :type model: class
- :param force_reindexing: force all document to be returned
- :type force_reindexing: bool
- :return: the number of documents indexed
- :rtype: int
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- # better safe than sorry
- if model.__name__ == "FakeChapter":
- self.logger.warn("Cannot index FakeChapter model. Please index its parent model.")
- return 0
-
- documents_formatter = partial(es_document_mapper, force_reindexing, self.index)
- objects_per_batch = getattr(model, "objects_per_batch", 100)
- indexed_counter = 0
- if model.__name__ == "PublishedContent":
- generate = model.get_es_indexable(force_reindexing)
- while True:
- with transaction.atomic():
- try:
- # fetch a batch
- objects = next(generate)
- except StopIteration:
- break
- if not objects:
- break
- if hasattr(objects[0], "parent_model"):
- model_to_update = objects[0].parent_model
- pks = [o.parent_id for o in objects]
- else:
- model_to_update = model
- pks = [o.pk for o in objects]
-
- formatted_documents = list(map(documents_formatter, objects))
-
- for _, hit in parallel_bulk(
- self.es, formatted_documents, chunk_size=objects_per_batch, request_timeout=30
- ):
- action = list(hit.keys())[0]
- self.logger.info("{} {} with id {}".format(action, hit[action]["_type"], hit[action]["_id"]))
-
- # mark all these objects as indexed at once
- model_to_update.objects.filter(pk__in=pks).update(es_already_indexed=True, es_flagged=False)
- indexed_counter += len(objects)
- return indexed_counter
- else:
- then = time.time()
- prev_obj_per_sec = False
- last_pk = 0
- object_source = model.get_es_indexable(force_reindexing)
-
- while True:
- with transaction.atomic():
- # fetch a batch
- objects = list(object_source.filter(pk__gt=last_pk)[:objects_per_batch])
- if not objects:
- break
-
- formatted_documents = list(map(documents_formatter, objects))
-
- for _, hit in parallel_bulk(
- self.es, formatted_documents, chunk_size=objects_per_batch, request_timeout=30
- ):
- if self.logger.getEffectiveLevel() <= logging.INFO:
- action = list(hit.keys())[0]
- self.logger.info(
- "{} {} with id {}".format(action, hit[action]["_type"], hit[action]["_id"])
- )
-
- # mark all these objects as indexed at once
- model.objects.filter(pk__in=[o.pk for o in objects]).update(
- es_already_indexed=True, es_flagged=False
- )
- indexed_counter += len(objects)
-
- # basic estimation of indexed objects per second
- now = time.time()
- last_batch_duration = int(now - then) or 1
- then = now
- obj_per_sec = round(float(objects_per_batch) / last_batch_duration, 2)
- if force_reindexing:
- print(
- " {} so far ({} obj/s, batch size: {})".format(
- indexed_counter, obj_per_sec, objects_per_batch
- )
- )
-
- if prev_obj_per_sec is False:
- prev_obj_per_sec = obj_per_sec
- else:
- ratio = obj_per_sec / prev_obj_per_sec
- # if we processed this batch 10% slower/faster than the previous one,
- # shrink/increase batch size
- if abs(1 - ratio) > 0.1:
- objects_per_batch = int(objects_per_batch * ratio)
- if force_reindexing:
- print(f" {round(ratio, 2)}x, new batch size: {objects_per_batch}")
- prev_obj_per_sec = obj_per_sec
-
- # fetch next batch
- last_pk = objects[-1].pk
-
- return indexed_counter
-
- def refresh_index(self):
- """Force the refreshing the index. The task is normally done periodically, but may be forced with this method.
-
- See https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-refresh.html.
-
- .. note::
-
- The use of this function is mandatory if you want to use the search right after an indexing.
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- self.es.indices.refresh(self.index)
-
- def update_single_document(self, document, doc):
- """Update given fields of a single document.
-
- See https://www.elastic.co/guide/en/elasticsearch/guide/current/partial-updates.html.
-
- :param document: the document
- :type document: AbstractESIndexable
- :param doc: fields to update
- :type doc: dict
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- arguments = {"index": self.index, "doc_type": document.get_es_document_type(), "id": document.es_id}
- if self.es.exists(**arguments):
- self.es.update(body={"doc": doc}, **arguments)
- self.logger.info(f"partial_update {document.get_es_document_type()} with id {document.es_id}")
-
- def delete_document(self, document):
- """Delete a given document, based on its ``es_id``
-
- :param document: the document
- :type document: AbstractESIndexable
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- arguments = {"index": self.index, "doc_type": document.get_es_document_type(), "id": document.es_id}
- if self.es.exists(**arguments):
- self.es.delete(**arguments)
- self.logger.info(f"delete {document.get_es_document_type()} with id {document.es_id}")
-
- def delete_by_query(self, doc_type="", query=MatchAll()):
- """Perform a deletion trough the ``_delete_by_query`` API.
-
- See https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-delete-by-query.html
-
- .. attention ::
- Call to this function must be done with great care!
-
- :param doc_type: the document type
- :type doc_type: str
- :param query: the query to match all document to be deleted
- :type query: elasticsearch_dsl.query.Query
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- response = self.es.delete_by_query(index=self.index, doc_type=doc_type, body={"query": query})
-
- self.logger.info("delete_by_query {}s ({})".format(doc_type, response["deleted"]))
-
- def analyze_sentence(self, request):
- """Use the anlyzer on a given sentence. Get back the list of tokens.
-
- See http://www.elastic.co/guide/en/elasticsearch/reference/current/indices-analyze.html.
-
- This is useful to perform "terms" queries instead of full-text queries.
-
- :param request: a sentence from user input
- :type request: str
- :return: the tokens
- :rtype: list
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- document = {"text": request}
- tokens = []
- for token in self.es.indices.analyze(index=self.index, body=document)["tokens"]:
- tokens.append(token["token"])
-
- return tokens
-
- def setup_search(self, request):
- """Setup search to the good index
-
- :param request: the search request
- :type request: elasticsearch_dsl.Search
- :return: formated search
- :rtype: elasticsearch_dsl.Search
- """
-
- if not self.connected_to_es:
- return
-
- if not self.index_exists:
- raise NeedIndex()
-
- return request.index(self.index).using(self.es)
diff --git a/zds/searchv2/tests/tests_models.py b/zds/searchv2/tests/tests_models.py
deleted file mode 100644
index 94262be4eb..0000000000
--- a/zds/searchv2/tests/tests_models.py
+++ /dev/null
@@ -1,365 +0,0 @@
-from elasticsearch_dsl import Search
-from elasticsearch_dsl.query import MatchAll
-
-from django.conf import settings
-from django.test import TestCase
-
-from zds.forum.tests.factories import TopicFactory, PostFactory, Topic, Post
-from zds.forum.tests.factories import create_category_and_forum
-from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
-from zds.searchv2.models import ESIndexManager
-from zds.tutorialv2.tests.factories import PublishableContentFactory, ContainerFactory, ExtractFactory, publish_content
-from zds.tutorialv2.models.database import PublishedContent, FakeChapter, PublishableContent
-from zds.tutorialv2.tests import TutorialTestMixin, override_for_contents
-
-
-@override_for_contents(ES_ENABLED=True, ES_SEARCH_INDEX={"name": "zds_search_test", "shards": 5, "replicas": 0})
-class ESIndexManagerTests(TutorialTestMixin, TestCase):
- def setUp(self):
- settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend"
- self.mas = ProfileFactory().user
- settings.ZDS_APP["member"]["bot_account"] = self.mas.username
-
- self.category, self.forum = create_category_and_forum()
-
- self.user = ProfileFactory().user
- self.staff = StaffProfileFactory().user
-
- self.manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
- self.indexable = [FakeChapter, PublishedContent, Topic, Post]
-
- self.manager.reset_es_index(self.indexable)
- self.manager.setup_custom_analyzer()
- self.manager.refresh_index()
-
- def test_setup_functions(self):
- """Test the behavior of the reset_es_index(), setup_custom_analyzer() and clear_es_index() functions"""
-
- if not self.manager.connected_to_es:
- return
-
- custom_index = {"name": "some_random_name", "shards": 3, "replicas": 1}
- manager = ESIndexManager(**custom_index)
-
- # in the beginning: the void:
- self.assertTrue(manager.index not in self.manager.es.cat.indices())
-
- self.assertEqual(manager.index, custom_index["name"])
- self.assertEqual(manager.number_of_shards, custom_index["shards"])
- self.assertEqual(manager.number_of_replicas, custom_index["replicas"])
-
- # 1. Creation:
- models = [Topic, Post]
- manager.reset_es_index([Topic, Post])
- self.assertTrue(manager.index in manager.es.cat.indices()) # index in !
-
- index_settings = manager.es.indices.get_settings(index=manager.index)
- self.assertTrue(manager.index in index_settings)
- index_settings = index_settings[manager.index]["settings"]["index"]
-
- self.assertEqual(index_settings["provided_name"], manager.index)
- self.assertEqual(index_settings["number_of_shards"], str(manager.number_of_shards))
- self.assertEqual(index_settings["number_of_replicas"], str(manager.number_of_replicas))
-
- # test mappings
- mappings = manager.es.indices.get_mapping(index=manager.index)
- self.assertTrue(manager.index in mappings)
- mappings = mappings[manager.index]["mappings"]
-
- for model in models:
- self.assertTrue(model.get_es_document_type() in mappings)
-
- # analyzer
- self.assertTrue("analysis" not in index_settings)
- manager.setup_custom_analyzer()
-
- index_settings = manager.es.indices.get_settings(index=manager.index)
- self.assertTrue(manager.index in index_settings)
- index_settings = index_settings[manager.index]["settings"]["index"]
- self.assertTrue("analysis" in index_settings)
-
- # 3. Clearing
- manager.clear_es_index()
- self.assertTrue(manager.index not in self.manager.es.cat.indices()) # back to the void
-
- def test_custom_analyzer(self):
- """Test our custom analyzer"""
-
- if not self.manager.connected_to_es:
- return
-
- test_sentences = [
- # stemming:
- ("programmation programmer programmateur programmes", ["program", "program", "program", "program"]),
- # keep "c" intact:
- ("apprendre à programmer en C", ["aprendr", "program", "langage_c"]),
- # remove HTML and some special characters:
- ("
« test ! », en hurlant …
", ["test", "hurlant"]),
- # keep "c++" and "linux" intact:
- ("écrire un programme en C++ avec Linux", ["ecrir", "program", "c++", "linux"]),
- # elision:
- ("c'est de l'arnaque", ["arnaqu"]),
- ]
-
- for sentence in test_sentences:
- tokens = self.manager.analyze_sentence(sentence[0])
- self.assertEqual(len(tokens), len(sentence[1]))
- self.assertEqual(tokens, sentence[1])
-
- def test_indexation(self):
- """test the indexation and deletion of the different documents"""
-
- if not self.manager.connected_to_es:
- return
-
- # create a topic with a post
- topic = TopicFactory(forum=self.forum, author=self.user)
- post = PostFactory(topic=topic, author=self.user, position=1)
-
- topic = Topic.objects.get(pk=topic.pk)
- post = Post.objects.get(pk=post.pk)
-
- self.assertFalse(topic.es_already_indexed)
- self.assertTrue(topic.es_flagged)
- self.assertFalse(post.es_already_indexed)
- self.assertTrue(post.es_flagged)
-
- # create a middle-tutorial and publish it
- tuto = PublishableContentFactory(type="TUTORIAL")
- tuto.authors.add(self.user)
- tuto.save()
-
- tuto_draft = tuto.load_version()
- chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
- ExtractFactory(container=chapter1, db_object=tuto)
- published = publish_content(tuto, tuto_draft, is_major_update=True)
-
- tuto.sha_public = tuto_draft.current_version
- tuto.sha_draft = tuto_draft.current_version
- tuto.public_version = published
- tuto.save()
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertFalse(published.es_already_indexed)
- self.assertTrue(published.es_flagged)
-
- # 1. index all
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model, force_reindexing=False)
- self.manager.refresh_index()
-
- topic = Topic.objects.get(pk=topic.pk)
- post = Post.objects.get(pk=post.pk)
-
- self.assertTrue(topic.es_already_indexed)
- self.assertFalse(topic.es_flagged)
- self.assertTrue(post.es_already_indexed)
- self.assertFalse(post.es_flagged)
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertTrue(published.es_already_indexed)
- self.assertFalse(published.es_flagged)
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 4) # get 4 results, one of each type
-
- must_contain = {"post": False, "topic": False, "publishedcontent": False, "chapter": False}
- id_must_be = {
- "post": str(post.pk),
- "topic": str(topic.pk),
- "publishedcontent": str(published.pk),
- "chapter": tuto.slug + "__" + chapter1.slug,
- }
-
- for hit in results:
- doc_type = hit.meta.doc_type
- must_contain[doc_type] = True
- self.assertEqual(hit.meta.id, id_must_be[doc_type])
-
- self.assertTrue(all(must_contain))
-
- # 2. Test what reindexation will do:
- new_topic = TopicFactory(forum=self.forum, author=self.user)
- new_post = PostFactory(topic=new_topic, author=self.user, position=1)
-
- pk_of_topics_to_reindex = []
- for item in Topic.get_es_indexable(force_reindexing=False):
- pk_of_topics_to_reindex.append(item.pk)
-
- pk_of_posts_to_reindex = []
- for item in Post.get_es_indexable(force_reindexing=False):
- pk_of_posts_to_reindex.append(item.pk)
-
- self.assertTrue(topic.pk not in pk_of_topics_to_reindex)
- self.assertTrue(new_topic.pk in pk_of_topics_to_reindex)
- self.assertTrue(post.pk not in pk_of_posts_to_reindex)
- self.assertTrue(new_post.pk in pk_of_posts_to_reindex)
-
- for model in self.indexable: # ok, so let's index that
- if model is FakeChapter:
- continue
- self.manager.es_bulk_indexing_of_model(model, force_reindexing=False)
- self.manager.refresh_index()
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 6) # good!
-
- # 3. Test single deletion:
- new_post = Post.objects.get(pk=new_post.pk)
-
- self.manager.delete_document(new_post)
- self.manager.refresh_index()
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 5) # one is missing
-
- for hit in results:
- self.assertTrue(hit.meta.doc_type != Post.get_es_document_type() or hit.meta.id != new_post.es_id)
-
- # 4. Test "delete_by_query_deletion":
- topic = Topic.objects.get(pk=topic.pk)
- new_topic = Topic.objects.get(pk=new_topic.pk)
-
- self.manager.delete_by_query(Topic.get_es_document_type(), MatchAll()) # the two topic are deleted
- self.manager.refresh_index()
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 3)
-
- for hit in results:
- self.assertTrue(hit.meta.doc_type != Topic.get_es_document_type() or hit.meta.id != new_topic.es_id)
- self.assertTrue(hit.meta.doc_type != Topic.get_es_document_type() or hit.meta.id != topic.es_id)
-
- # 5. Test that the deletion of an object also triggers its deletion in ES
- post = Post.objects.get(pk=post.pk)
- post.delete()
- self.manager.refresh_index()
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 2)
-
- for hit in results:
- self.assertTrue(hit.meta.doc_type != Post.get_es_document_type() or hit.meta.id != post.es_id)
-
- # 6. Test full desindexation:
- for model in self.indexable:
- if model is FakeChapter:
- continue
- self.manager.clear_indexing_of_model(model)
-
- # note "topic" is gone since "post" is gone, due to relationships at the Django level
- new_topic = Topic.objects.get(pk=new_topic.pk)
- new_post = Post.objects.get(pk=new_post.pk)
-
- self.assertFalse(new_topic.es_already_indexed)
- self.assertTrue(new_topic.es_flagged)
- self.assertFalse(new_post.es_already_indexed)
- self.assertTrue(new_post.es_flagged)
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertFalse(published.es_already_indexed)
- self.assertTrue(published.es_flagged)
-
- def test_special_case_of_contents(self):
- """test that the old publishedcontent does not stay when a new one is created"""
-
- if not self.manager.connected_to_es:
- return
-
- # 1. Create a middle-tutorial, publish it, then index it
- tuto = PublishableContentFactory(type="TUTORIAL")
- tuto.authors.add(self.user)
- tuto.save()
-
- tuto_draft = tuto.load_version()
- chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
- ExtractFactory(container=chapter1, db_object=tuto)
- published = publish_content(tuto, tuto_draft, is_major_update=True)
-
- tuto.sha_public = tuto_draft.current_version
- tuto.sha_draft = tuto_draft.current_version
- tuto.public_version = published
- tuto.save()
-
- self.manager.es_bulk_indexing_of_model(PublishedContent, force_reindexing=True) # index
- self.manager.refresh_index()
-
- first_publication = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertTrue(first_publication.es_already_indexed)
- self.assertFalse(first_publication.es_flagged)
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 2) # get 2 results, one for the content and one for the chapter
-
- self.assertEqual(PublishedContent.objects.count(), 1)
-
- # 2. Change thet title, which will trigger a change in the slug
- tuto = PublishableContent.objects.get(pk=tuto.pk)
- versioned = tuto.load_version(sha=tuto.sha_draft)
-
- tuto.title = "un titre complètement différent!"
- tuto.save(force_slug_update=True)
-
- versioned.repo_update_top_container(tuto.title, tuto.slug, "osef", "osef")
- second_publication = publish_content(tuto, versioned, True)
-
- tuto.sha_public = versioned.current_version
- tuto.sha_draft = versioned.current_version
- tuto.public_version = second_publication
- tuto.save()
-
- self.assertEqual(PublishedContent.objects.count(), 2) # now there is two objects ...
- first_publication = PublishedContent.objects.get(pk=first_publication.pk)
- self.assertTrue(first_publication.must_redirect) # .. including the first one, for redirection
-
- self.manager.refresh_index()
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 0) # the old one is gone (and we need to reindex to get the new one)
-
- # 3. Check if indexation brings the new one, and not the old one
- self.manager.es_bulk_indexing_of_model(PublishedContent, force_reindexing=True) # index
- self.manager.refresh_index()
-
- first_publication = PublishedContent.objects.get(pk=first_publication.pk)
- second_publication = PublishedContent.objects.get(pk=second_publication.pk)
-
- s = Search()
- s.query(MatchAll())
- results = self.manager.setup_search(s).execute()
- self.assertEqual(len(results), 2) # Still 2, not 4 !
-
- found_old = False
- found_new = False
-
- for hit in results:
- if hit.meta.doc_type == PublishedContent.get_es_document_type():
- if hit.meta.id == first_publication.es_id:
- found_old = True
- if hit.meta.id == second_publication.es_id:
- found_new = True
-
- self.assertTrue(found_new)
- self.assertFalse(found_old)
-
- def tearDown(self):
- super().tearDown()
-
- # delete index:
- self.manager.clear_es_index()
diff --git a/zds/searchv2/tests/tests_utils.py b/zds/searchv2/tests/tests_utils.py
deleted file mode 100644
index 2a99f3e14a..0000000000
--- a/zds/searchv2/tests/tests_utils.py
+++ /dev/null
@@ -1,177 +0,0 @@
-from elasticsearch_dsl import Search
-from elasticsearch_dsl.query import MatchAll
-
-from django.conf import settings
-from django.test import TestCase
-from django.core.management import call_command
-
-from zds.member.tests.factories import ProfileFactory, StaffProfileFactory
-from zds.tutorialv2.tests.factories import PublishableContentFactory, ContainerFactory, ExtractFactory
-from zds.tutorialv2.models.database import PublishedContent
-from zds.tutorialv2.publication_utils import publish_content
-from zds.forum.tests.factories import TopicFactory, PostFactory, Topic, Post
-from zds.forum.tests.factories import create_category_and_forum
-from zds.searchv2.models import ESIndexManager
-from zds.tutorialv2.tests import TutorialTestMixin, override_for_contents
-
-
-@override_for_contents(ES_ENABLED=True, ES_SEARCH_INDEX={"name": "zds_search_test", "shards": 5, "replicas": 0})
-class UtilsTests(TutorialTestMixin, TestCase):
- def setUp(self):
- settings.EMAIL_BACKEND = "django.core.mail.backends.locmem.EmailBackend"
- self.mas = ProfileFactory().user
- settings.ZDS_APP["member"]["bot_account"] = self.mas.username
-
- self.category, self.forum = create_category_and_forum()
-
- self.user = ProfileFactory().user
- self.staff = StaffProfileFactory().user
-
- self.index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- def test_es_manager(self):
- """Test the behavior of the ``es_manager`` command"""
-
- if not self.index_manager.connected_to_es:
- return
-
- # in the beginning: the void
- self.assertTrue(self.index_manager.index not in self.index_manager.es.cat.indices())
-
- text = "Ceci est un texte de test"
-
- # create a topic with a post
- topic = TopicFactory(forum=self.forum, author=self.user, title=text)
- post = PostFactory(topic=topic, author=self.user, position=1)
- post.text = post.text_html = text
- post.save()
-
- topic = Topic.objects.get(pk=topic.pk)
- post = Post.objects.get(pk=post.pk)
-
- self.assertFalse(topic.es_already_indexed)
- self.assertTrue(topic.es_flagged)
- self.assertFalse(post.es_already_indexed)
- self.assertTrue(post.es_flagged)
-
- # create a middle-tutorial and publish it
- tuto = PublishableContentFactory(type="TUTORIAL")
- tuto.authors.add(self.user)
- tuto.save()
-
- tuto_draft = tuto.load_version()
- chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
- chapter1.repo_update(text, text, text)
- extract1 = ExtractFactory(container=chapter1, db_object=tuto)
- version = extract1.repo_update(text, text)
- published = publish_content(tuto, tuto_draft, is_major_update=True)
-
- tuto.sha_public = version
- tuto.sha_draft = version
- tuto.public_version = published
- tuto.save()
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertFalse(published.es_already_indexed)
- self.assertTrue(published.es_flagged)
-
- # 1. test "index-all"
- call_command("es_manager", "index_all")
- self.assertTrue(self.index_manager.es.indices.exists(self.index_manager.index))
- self.index_manager.index_exists = True
-
- topic = Topic.objects.get(pk=topic.pk)
- post = Post.objects.get(pk=post.pk)
-
- self.assertTrue(topic.es_already_indexed)
- self.assertFalse(topic.es_flagged)
- self.assertTrue(post.es_already_indexed)
- self.assertFalse(post.es_flagged)
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertTrue(published.es_already_indexed)
- self.assertFalse(published.es_flagged)
-
- s = Search()
- s.query(MatchAll())
- results = self.index_manager.setup_search(s).execute()
- self.assertEqual(len(results), 4) # get 4 results, one of each type
-
- must_contain = {"post": False, "topic": False, "publishedcontent": False, "chapter": False}
- id_must_be = {
- "post": str(post.pk),
- "topic": str(topic.pk),
- "publishedcontent": str(published.pk),
- "chapter": tuto.slug + "__" + chapter1.slug,
- }
-
- for hit in results:
- doc_type = hit.meta.doc_type
- must_contain[doc_type] = True
- self.assertEqual(hit.meta.id, id_must_be[doc_type])
-
- self.assertTrue(all(must_contain))
-
- # 2. test "clear"
- self.assertTrue(self.index_manager.index in self.index_manager.es.cat.indices()) # index in
-
- call_command("es_manager", "clear")
- self.assertFalse(self.index_manager.es.indices.exists(self.index_manager.index))
- self.index_manager.index_exists = False
-
- # must reset every object
- topic = Topic.objects.get(pk=topic.pk)
- post = Post.objects.get(pk=post.pk)
-
- self.assertFalse(topic.es_already_indexed)
- self.assertTrue(topic.es_flagged)
- self.assertFalse(post.es_already_indexed)
- self.assertTrue(post.es_flagged)
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertFalse(published.es_already_indexed)
- self.assertTrue(published.es_flagged)
-
- self.assertTrue(self.index_manager.index not in self.index_manager.es.cat.indices()) # index wiped out !
-
- # 3. test "setup"
- call_command("es_manager", "setup")
- self.assertTrue(self.index_manager.es.indices.exists(self.index_manager.index))
- self.index_manager.index_exists = True
-
- self.assertTrue(self.index_manager.index in self.index_manager.es.cat.indices()) # index back in ...
-
- s = Search()
- s.query(MatchAll())
- results = self.index_manager.setup_search(s).execute()
- self.assertEqual(len(results), 0) # ... but with nothing in it
-
- result = self.index_manager.es.indices.get_settings(index=self.index_manager.index)
- settings_index = result[self.index_manager.index]["settings"]["index"]
- self.assertTrue("analysis" in settings_index) # custom analyzer was setup
-
- # 4. test "index-flagged" once ...
- call_command("es_manager", "index_flagged")
-
- topic = Topic.objects.get(pk=topic.pk)
- post = Post.objects.get(pk=post.pk)
-
- self.assertTrue(topic.es_already_indexed)
- self.assertFalse(topic.es_flagged)
- self.assertTrue(post.es_already_indexed)
- self.assertFalse(post.es_flagged)
-
- published = PublishedContent.objects.get(content_pk=tuto.pk)
- self.assertTrue(published.es_already_indexed)
- self.assertFalse(published.es_flagged)
-
- s = Search()
- s.query(MatchAll())
- results = self.index_manager.setup_search(s).execute()
- self.assertEqual(len(results), 4) # get the 4 results back
-
- def tearDown(self):
- super().tearDown()
-
- # delete index:
- self.index_manager.clear_es_index()
diff --git a/zds/searchv2/views.py b/zds/searchv2/views.py
deleted file mode 100644
index 73beed8d19..0000000000
--- a/zds/searchv2/views.py
+++ /dev/null
@@ -1,371 +0,0 @@
-from zds import json_handler
-import operator
-
-from elasticsearch_dsl import Search
-from elasticsearch_dsl.query import Match, MultiMatch, FunctionScore, Term, Terms, Range
-
-from django.conf import settings
-from django.core.exceptions import PermissionDenied
-from django.contrib import messages
-from django.http import HttpResponse
-from django.utils.translation import gettext_lazy as _
-from django.shortcuts import render
-from django.urls import reverse
-from django.views.generic import CreateView
-from django.views.generic.detail import SingleObjectMixin
-
-from zds.searchv2.forms import SearchForm
-from zds.searchv2.models import ESIndexManager
-from zds.utils.paginator import ZdSPagingListView
-from zds.forum.utils import get_authorized_forums_pk
-from functools import reduce
-
-
-class SimilarTopicsView(CreateView, SingleObjectMixin):
- search_query = None
- authorized_forums = ""
- index_manager = None
-
- def __init__(self, **kwargs):
- """Overridden because the index manager must NOT be initialized elsewhere."""
-
- super().__init__(**kwargs)
- self.index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- def get(self, request, *args, **kwargs):
- if "q" in request.GET:
- self.search_query = "".join(request.GET["q"])
-
- results = []
- if self.index_manager.connected_to_es and self.search_query:
- self.authorized_forums = get_authorized_forums_pk(self.request.user)
-
- search_queryset = Search()
- query = (
- Match(_type="topic")
- & Terms(forum_pk=self.authorized_forums)
- & MultiMatch(query=self.search_query, fields=["title", "subtitle", "tags"])
- )
-
- functions_score = [
- {"filter": Match(is_solved=True), "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_solved"]},
- {"filter": Match(is_sticky=True), "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_sticky"]},
- {"filter": Match(is_locked=True), "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_locked"]},
- ]
-
- scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
- search_queryset = search_queryset.query(scored_query)[:10]
-
- # Build the result
- for hit in search_queryset.execute():
- result = {
- "id": hit.pk,
- "url": str(hit.get_absolute_url),
- "title": str(hit.title),
- "subtitle": str(hit.subtitle),
- "forumTitle": str(hit.forum_title),
- "forumUrl": str(hit.forum_get_absolute_url),
- "pubdate": str(hit.pubdate),
- }
- results.append(result)
-
- data = {"results": results}
- return HttpResponse(json_handler.dumps(data), content_type="application/json")
-
-
-class SuggestionContentView(CreateView, SingleObjectMixin):
- search_query = None
- authorized_forums = ""
- index_manager = None
-
- def __init__(self, **kwargs):
- """Overridden because the index manager must NOT be initialized elsewhere."""
-
- super().__init__(**kwargs)
- self.index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- def get(self, request, *args, **kwargs):
- if "q" in request.GET:
- self.search_query = "".join(request.GET["q"])
- excluded_content_ids = request.GET.get("excluded", "").split(",")
- results = []
- if self.index_manager.connected_to_es and self.search_query:
- self.authorized_forums = get_authorized_forums_pk(self.request.user)
-
- search_queryset = Search()
- if len(excluded_content_ids) > 0 and excluded_content_ids != [""]:
- search_queryset = search_queryset.exclude("terms", content_pk=excluded_content_ids)
- query = Match(_type="publishedcontent") & MultiMatch(
- query=self.search_query, fields=["title", "description"]
- )
-
- functions_score = [
- {
- "filter": Match(content_type="TUTORIAL"),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_tutorial"],
- },
- {
- "filter": Match(content_type="ARTICLE"),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_article"],
- },
- {
- "filter": Match(content_type="OPINION"),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion"],
- },
- ]
-
- scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
- search_queryset = search_queryset.query(scored_query)[:10]
-
- # Build the result
- for hit in search_queryset.execute():
- result = {
- "id": hit.content_pk,
- "pubdate": hit.publication_date,
- "title": str(hit.title),
- "description": str(hit.description),
- }
- results.append(result)
-
- data = {"results": results}
-
- return HttpResponse(json_handler.dumps(data), content_type="application/json")
-
-
-class SearchView(ZdSPagingListView):
- """Search view."""
-
- template_name = "searchv2/search.html"
- paginate_by = settings.ZDS_APP["search"]["results_per_page"]
-
- search_form_class = SearchForm
- search_form = None
- search_query = None
- content_category = None
- content_subcategory = None
-
- authorized_forums = ""
-
- index_manager = None
-
- def __init__(self, **kwargs):
- """Overridden because the index manager must NOT be initialized elsewhere."""
-
- super().__init__(**kwargs)
- self.index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- def get(self, request, *args, **kwargs):
- """Overridden to catch the request and fill the form."""
-
- if "q" in request.GET:
- self.search_query = "".join(request.GET["q"])
-
- self.search_form = self.search_form_class(data=self.request.GET)
-
- if self.search_query and not self.search_form.is_valid():
- raise PermissionDenied("research form is invalid")
-
- return super().get(request, *args, **kwargs)
-
- def get_queryset(self):
- if not self.index_manager.connected_to_es:
- messages.warning(self.request, _("Impossible de se connecter à Elasticsearch"))
- return []
-
- if self.search_query:
- # Searches forums the user is allowed to visit
- self.authorized_forums = get_authorized_forums_pk(self.request.user)
-
- search_queryset = Search()
-
- # Restrict (sub)category if any
- if self.search_form.cleaned_data["category"]:
- self.content_category = self.search_form.cleaned_data["category"]
- if self.search_form.cleaned_data["subcategory"]:
- self.content_subcategory = self.search_form.cleaned_data["subcategory"]
-
- # Mark that contents must come from library if required
- self.from_library = False
- if self.search_form.cleaned_data["from_library"] == "on":
- self.from_library = True
-
- # Setting the different querysets (according to the selected models, if any)
- part_querysets = []
- chosen_groups = self.search_form.cleaned_data["models"]
-
- if chosen_groups:
- models = []
- for group in chosen_groups:
- if group in settings.ZDS_APP["search"]["search_groups"]:
- models.append(settings.ZDS_APP["search"]["search_groups"][group][1])
- else:
- models = [v[1] for k, v in settings.ZDS_APP["search"]["search_groups"].items()]
-
- models = reduce(operator.concat, models)
-
- for model in models:
- part_querysets.append(getattr(self, f"get_queryset_{model}s")())
-
- queryset = part_querysets[0]
- for query in part_querysets[1:]:
- queryset |= query
-
- # Weighting:
- weight_functions = []
- for _type, weights in list(settings.ZDS_APP["search"]["boosts"].items()):
- if _type in models:
- weight_functions.append({"filter": Match(_type=_type), "weight": weights["global"]})
-
- scored_queryset = FunctionScore(query=queryset, boost_mode="multiply", functions=weight_functions)
- search_queryset = search_queryset.query(scored_queryset)
-
- # Highlighting:
- search_queryset = search_queryset.highlight_options(
- fragment_size=150, number_of_fragments=5, pre_tags=["[hl]"], post_tags=["[/hl]"]
- )
- search_queryset = search_queryset.highlight("text").highlight("text_html")
-
- # Executing:
- return self.index_manager.setup_search(search_queryset)
-
- return []
-
- def get_queryset_publishedcontents(self):
- """Search in PublishedContent objects."""
-
- query = Match(_type="publishedcontent") & MultiMatch(
- query=self.search_query, fields=["title", "description", "categories", "subcategories", "tags", "text"]
- )
-
- if self.from_library:
- query &= Match(content_type="TUTORIAL") | Match(content_type="ARTICLE")
-
- if self.content_category:
- query &= Match(categories=self.content_category)
-
- if self.content_subcategory:
- query &= Match(subcategories=self.content_subcategory)
-
- functions_score = [
- {
- "filter": Match(content_type="TUTORIAL"),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_tutorial"],
- },
- {
- "filter": Match(content_type="TUTORIAL") & Match(has_chapters=True),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_medium_or_big_tutorial"],
- },
- {
- "filter": Match(content_type="ARTICLE"),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_article"],
- },
- {
- "filter": Match(content_type="OPINION"),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion"],
- },
- {
- "filter": Match(content_type="OPINION") & Match(picked=False),
- "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion_not_picked"],
- },
- ]
-
- scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
-
- return scored_query
-
- def get_queryset_chapters(self):
- """Search in content chapters."""
-
- query = Match(_type="chapter") & MultiMatch(query=self.search_query, fields=["title", "text"])
-
- if self.content_category:
- query &= Match(categories=self.content_category)
-
- if self.content_subcategory:
- query &= Match(subcategories=self.content_subcategory)
-
- return query
-
- def get_queryset_topics(self):
- """Search in topics, and remove the result if the forum is not allowed for the user.
-
- Score is modified if:
-
- + topic is solved;
- + topic is sticky;
- + topic is locked.
- """
-
- query = (
- Match(_type="topic")
- & Terms(forum_pk=self.authorized_forums)
- & MultiMatch(query=self.search_query, fields=["title", "subtitle", "tags"])
- )
-
- functions_score = [
- {"filter": Match(is_solved=True), "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_solved"]},
- {"filter": Match(is_sticky=True), "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_sticky"]},
- {"filter": Match(is_locked=True), "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_locked"]},
- ]
-
- scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
-
- return scored_query
-
- def get_queryset_posts(self):
- """Search in posts, and remove result if the forum is not allowed for the user or if the message is invisible.
-
- Score is modified if:
-
- + post is the first one in a topic;
- + post is marked as "useful";
- + post has a like/dislike ratio above (has more likes than dislikes) or below (the other way around) 1.0.
- """
-
- query = (
- Match(_type="post")
- & Terms(forum_pk=self.authorized_forums)
- & Term(is_visible=True)
- & MultiMatch(query=self.search_query, fields=["text_html"])
- )
-
- functions_score = [
- {"filter": Match(position=1), "weight": settings.ZDS_APP["search"]["boosts"]["post"]["if_first"]},
- {"filter": Match(is_useful=True), "weight": settings.ZDS_APP["search"]["boosts"]["post"]["if_useful"]},
- {
- "filter": Range(like_dislike_ratio={"gt": 1}),
- "weight": settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_above_1"],
- },
- {
- "filter": Range(like_dislike_ratio={"lt": 1}),
- "weight": settings.ZDS_APP["search"]["boosts"]["post"]["ld_ratio_below_1"],
- },
- ]
-
- scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
-
- return scored_query
-
- def get_context_data(self, **kwargs):
- context = super().get_context_data(**kwargs)
- context["form"] = self.search_form
- context["query"] = self.search_query is not None
-
- return context
-
-
-def opensearch(request):
- """Generate OpenSearch Description file."""
-
- return render(
- request,
- "searchv2/opensearch.xml",
- {
- "site_name": settings.ZDS_APP["site"]["literal_name"],
- "site_url": settings.ZDS_APP["site"]["url"],
- "email_contact": settings.ZDS_APP["site"]["email_contact"],
- "language": settings.LANGUAGE_CODE,
- "search_url": settings.ZDS_APP["site"]["url"] + reverse("search:query"),
- },
- content_type="application/opensearchdescription+xml",
- )
diff --git a/zds/settings/abstract_base/django.py b/zds/settings/abstract_base/django.py
index ece097f26e..d18d6416af 100644
--- a/zds/settings/abstract_base/django.py
+++ b/zds/settings/abstract_base/django.py
@@ -178,7 +178,7 @@
"zds.tutorialv2",
"zds.member",
"zds.featured",
- "zds.searchv2",
+ "zds.search",
"zds.notification",
# Uncomment the next line to enable the admin:
"django.contrib.admin",
diff --git a/zds/settings/abstract_base/zds.py b/zds/settings/abstract_base/zds.py
index 40991b2f8f..29ad9724d9 100644
--- a/zds/settings/abstract_base/zds.py
+++ b/zds/settings/abstract_base/zds.py
@@ -12,18 +12,18 @@
GEOIP_PATH = str(BASE_DIR / "geodata")
GEOIP_CITY = "GeoLite2-City.mmdb"
-ES_ENABLED = True
+SEARCH_ENABLED = True
-ES_CONNECTIONS = {
- "default": {
- "hosts": ["localhost:9200"],
- }
-}
-
-ES_SEARCH_INDEX = {
- "name": "zds_search",
- "shards": 3,
- "replicas": 0,
+SEARCH_CONNECTION = {
+ "nodes": [
+ {
+ "host": "localhost",
+ "port": "8108",
+ "protocol": "http",
+ }
+ ],
+ "api_key": "xyz",
+ "connection_timeout_seconds": 5,
}
# Anonymous [Dis]Likes. Authors of [dis]likes before those pk will never be shown
@@ -51,6 +51,11 @@
DEFAULT_ASSO_LINK = "https://www.helloasso.com/associations/zeste-de-savoir/adhesions/zeste-de-savoir-cotisations-2018"
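+# Base search weights per indexed document type; the per-field and
+# per-criterion boosts below are derived from them by multiplication.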
+global_weight_publishedcontent = 2
+global_weight_topic = 2
+global_weight_chapter = 1.5
+global_weight_post = 0.8
+
ZDS_APP = {
"site": {
"name": "ZesteDeSavoir",
@@ -177,6 +182,7 @@
"helps_per_page": 20,
"commits_per_page": 20,
"suggestions_per_page": 2,
+ "max_suggestion_search_results": 10,
"mass_edit_goals_content_per_page": 25,
"view_contents_by_goal_content_per_page": 42,
"view_contents_by_label_content_per_page": 42,
@@ -210,6 +216,7 @@
"top_tag_exclu": ["bug", "suggestion", "tutoriel", "beta", "article"],
"greetings": ["salut", "bonjour", "yo ", "hello", "bon matin", "tout le monde se secoue"],
"description_size": 120,
+ "max_similar_topics": 10,
},
"topic": {
"home_number": 5,
@@ -227,37 +234,57 @@
},
"paginator": {"folding_limit": 4},
"search": {
- "mark_keywords": ["javafx", "haskell", "groovy", "powershell", "latex", "linux", "windows"],
"results_per_page": 20,
"search_groups": {
- "content": (_("Contenus publiés"), ["publishedcontent", "chapter"]),
+ "publishedcontent": (_("Contenus publiés"), ["publishedcontent", "chapter"]),
"topic": (_("Sujets du forum"), ["topic"]),
"post": (_("Messages du forum"), ["post"]),
},
+ "search_content_type": {
+ "tutorial": (_("Tutoriels"), ["tutorial"]),
+ "article": (_("Articles"), ["article"]),
+ "opinion": (_("Billet"), ["opinion"]),
+ },
+ "search_validated_content": {
+ "validated": (_("Contenus validés"), ["validated"]),
+ "no_validated": (_("Contenus libres"), ["no_validated"]),
+ },
"boosts": {
"publishedcontent": {
- "global": 3.0,
- "if_article": 1.0,
- "if_tutorial": 1.0,
- "if_medium_or_big_tutorial": 1.5,
- "if_opinion": 0.66,
- "if_opinion_not_picked": 0.5,
- },
- "topic": {
- "global": 2.0,
- "if_solved": 1.1,
- "if_sticky": 1.2,
- "if_locked": 0.1,
+ "global": global_weight_publishedcontent,
+ "if_article": global_weight_publishedcontent * 1.5,
+ "if_tutorial": global_weight_publishedcontent * 1.5,
+ "if_medium_or_big_tutorial": global_weight_publishedcontent * 1.7,
+ "if_opinion": global_weight_publishedcontent * 1.3,
+ "if_opinion_not_picked": global_weight_publishedcontent * 1.1,
+ "title": global_weight_publishedcontent * 4,
+ "description": global_weight_publishedcontent * 2,
+ "categories": global_weight_publishedcontent * 1,
+ "subcategories": global_weight_publishedcontent * 1,
+ "tags": global_weight_publishedcontent * 1,
+ "text": global_weight_publishedcontent * 2,
},
"chapter": {
- "global": 1.5,
+ "global": global_weight_chapter,
+ "title": global_weight_chapter * 3,
+ "text": global_weight_chapter * 2,
+ },
+ "topic": {
+ "global": global_weight_topic,
+ "if_solved": global_weight_topic * 1.1,
+ "if_sticky": global_weight_topic * 1.2,
+ "if_locked": global_weight_topic * 0.1,
+ "title": global_weight_topic * 3,
+ "subtitle": global_weight_topic * 2,
+ "tags": global_weight_topic * 1,
},
"post": {
- "global": 1.0,
- "if_first": 1.2,
- "if_useful": 1.5,
- "ld_ratio_above_1": 1.05,
- "ld_ratio_below_1": 0.95,
+ "global": global_weight_post,
+ "if_first": global_weight_post * 1.5,
+ "if_useful": global_weight_post * 1.2,
+ "ld_ratio_above_1": global_weight_post * 1.05,
+ "ld_ratio_below_1": global_weight_post * 0.95,
+ "text": global_weight_post,
},
},
},
diff --git a/zds/settings/prod.py b/zds/settings/prod.py
index a108aa5087..8453c5c287 100644
--- a/zds/settings/prod.py
+++ b/zds/settings/prod.py
@@ -124,9 +124,7 @@ def _get_version():
###############################################################################
# ZESTE DE SAVOIR SETTINGS
-
-ES_SEARCH_INDEX["shards"] = config["elasticsearch"].get("shards", 3)
-
+SEARCH_CONNECTION["api_key"] = config["typesense"].get("api_key", "xyz")
ZDS_APP["site"]["association"]["email"] = "communication@zestedesavoir.com"
diff --git a/zds/tutorialv2/migrations/0040_rename_search_fields.py b/zds/tutorialv2/migrations/0040_rename_search_fields.py
new file mode 100644
index 0000000000..f28979232f
--- /dev/null
+++ b/zds/tutorialv2/migrations/0040_rename_search_fields.py
@@ -0,0 +1,28 @@
+# Generated by Django 4.2.11 on 2024-04-15 22:53
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ("tutorialv2", "0039_publishedcontent_typo"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="publishedcontent",
+ name="es_already_indexed",
+ ),
+ migrations.RemoveField(
+ model_name="publishedcontent",
+ name="es_flagged",
+ ),
+ migrations.AddField(
+ model_name="publishedcontent",
+ name="search_engine_requires_index",
+ field=models.BooleanField(
+ db_index=True, default=True, verbose_name="Doit être (ré)indexé par le moteur de recherche"
+ ),
+ ),
+ ]
diff --git a/zds/tutorialv2/migrations/0041_remove_must_reindex.py b/zds/tutorialv2/migrations/0041_remove_must_reindex.py
new file mode 100644
index 0000000000..c71bf33eec
--- /dev/null
+++ b/zds/tutorialv2/migrations/0041_remove_must_reindex.py
@@ -0,0 +1,17 @@
+# Generated by Django 4.2.11 on 2024-06-24 22:02
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ("tutorialv2", "0040_rename_search_fields"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="publishablecontent",
+ name="must_reindex",
+ ),
+ ]
diff --git a/zds/tutorialv2/models/database.py b/zds/tutorialv2/models/database.py
index 08a483a43c..8f0a50ddb2 100644
--- a/zds/tutorialv2/models/database.py
+++ b/zds/tutorialv2/models/database.py
@@ -10,14 +10,12 @@
from django.contrib.auth.models import User
from django.db import models
from django.db.models import CASCADE
-from django.db.models.signals import pre_delete, post_delete, pre_save
+from django.db.models.signals import pre_delete, post_delete, pre_save, post_save
from django.dispatch import receiver
from django.http import Http404
from django.urls import reverse
from django.utils.http import urlencode
from django.utils.translation import gettext_lazy as _
-from elasticsearch_dsl import Mapping, Q as ES_Q
-from elasticsearch_dsl.field import Text, Keyword, Date, Boolean
from git import Repo, BadObject
from gitdb.exc import BadName
@@ -26,11 +24,15 @@
from zds.gallery.models import Image, Gallery, UserGallery, GALLERY_WRITE
from zds.member.utils import get_external_account
from zds.mp.models import PrivateTopic
-from zds.searchv2.models import (
- AbstractESDjangoIndexable,
- AbstractESIndexable,
- delete_document_in_elasticsearch,
- ESIndexManager,
+from zds.search.models import (
+ AbstractSearchIndexableModel,
+ AbstractSearchIndexable,
+)
+from zds.search.utils import (
+ SearchFilter,
+ SearchIndexManager,
+ date_to_timestamp_int,
+ clean_html,
)
from zds.tutorialv2.managers import PublishedContentManager, PublishableContentManager, ReactionManager
from zds.tutorialv2.models import (
@@ -46,7 +48,7 @@
from zds.tutorialv2.models.versioned import NotAPublicVersion, VersionedContent
from zds.tutorialv2.utils import get_content_from_json, BadManifestError, get_blob
from zds.utils import get_current_user
-from zds.utils.models import SubCategory, Licence, Comment, Tag
+from zds.utils.models import Category, SubCategory, Licence, Comment, Tag
from zds.tutorialv2.models.help_requests import HelpWriting
from zds.utils.templatetags.emarkdown import render_markdown_stats
from zds.utils.uuslug_wrapper import uuslug
@@ -148,8 +150,6 @@ class Meta:
is_locked = models.BooleanField("Est verrouillé", default=False)
js_support = models.BooleanField("Support du Javascript", default=False)
- must_reindex = models.BooleanField("Si le contenu doit-être ré-indexé", default=True)
-
is_obsolete = models.BooleanField("Est obsolète", default=False)
public_version = models.ForeignKey(
@@ -194,6 +194,13 @@ def save(self, *args, force_slug_update=False, update_date=True, **kwargs):
self.slug = uuslug(self.title, instance=self, max_length=80)
if update_date:
self.update_date = datetime.now()
+ if self.public_version:
+ # This will probably trigger more reindexing than actually
+ # required (for instance, when updating an attribute that is not
+ # indexed), but it is definitely simpler than tracking which
+ # attributes have changed.
+ self.public_version.search_engine_requires_index = True
+ self.public_version.save()
super().save(*args, **kwargs)
def get_absolute_url_beta(self):
@@ -654,7 +661,30 @@ def delete_gallery(sender, instance, **kwargs):
instance.gallery.delete()
-class PublishedContent(AbstractESDjangoIndexable, TemplatableContentModelMixin, OnlineLinkableContentMixin):
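+# When a tag or a (sub)category is renamed, the published contents using it
+# must be flagged for reindexing so the search engine picks up the new name.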
+@receiver(post_save, sender=Tag)
+def content_tags_changed(instance, created, **kwargs):
+ if not created:
+ # It is an update of an existing object
+ PublishedContent.objects.filter(content__tags=instance.pk).update(search_engine_requires_index=True)
+
+
+@receiver(post_save, sender=SubCategory)
+def content_subcategories_changed(instance, created, **kwargs):
+ if not created:
+ # It is an update of an existing object
+ PublishedContent.objects.filter(content__subcategory=instance.pk).update(search_engine_requires_index=True)
+
+
+@receiver(post_save, sender=Category)
+def content_categories_changed(instance, created, **kwargs):
+ if not created:
+ # It is an update of an existing object
+ PublishedContent.objects.filter(content__subcategory__categorysubcategory__category=instance.pk).update(
+ search_engine_requires_index=True
+ )
+
+
+class PublishedContent(AbstractSearchIndexableModel, TemplatableContentModelMixin, OnlineLinkableContentMixin):
"""A class that contains information on the published version of a content.
Used for quick url resolution, quick listing, and to know where the public version of the files are.
@@ -997,34 +1027,32 @@ def last_publication_date(self):
return max(self.publication_date, self.update_date or datetime.min)
@classmethod
- def get_es_mapping(cls):
- mapping = Mapping(cls.get_es_document_type())
-
- mapping.field("content_pk", "integer")
- mapping.field("publication_date", Date())
- mapping.field("content_type", Keyword())
-
- # not from PublishedContent directly:
- mapping.field("title", Text(boost=1.5))
- mapping.field("description", Text(boost=1.5))
- mapping.field("tags", Text(boost=2.0))
- mapping.field("categories", Keyword(boost=1.5))
- mapping.field("subcategories", Keyword(boost=1.5))
- mapping.field("text", Text()) # for article and mini-tuto, text is directly included into the main object
- mapping.field("has_chapters", Boolean()) # ... otherwise, it is written
- mapping.field("picked", Boolean())
-
- # not indexed:
- mapping.field("get_absolute_url_online", Keyword(index=False))
- mapping.field("thumbnail", Keyword(index=False))
+ def get_search_document_schema(cls):
+ search_engine_schema = super().get_search_document_schema()
+
+ search_engine_schema["fields"] = [
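+ # Typesense field attributes: `facet` enables faceting on the field,
+ # `index: False` stores it without making it searchable, and `optional`
+ # allows documents to omit it.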
+ {"name": "title", "type": "string", "facet": False}, # we search on it
+ {"name": "content_pk", "type": "int32", "facet": False}, # we filter on it
+ {"name": "content_type", "type": "string", "index": False},
+ {"name": "publication_date", "type": "int64", "index": False},
+ {"name": "tags", "type": "string[]", "facet": True, "optional": True}, # we search on it
+ {"name": "tag_slugs", "type": "string[]", "index": False, "optional": True},
+ {"name": "subcategories", "type": "string[]", "facet": True, "optional": True}, # we search on it
+ {"name": "categories", "type": "string[]", "facet": True, "optional": True}, # we search on it
+ {"name": "text", "type": "string", "facet": False, "optional": True}, # we search on it
+ {"name": "description", "type": "string", "facet": False, "optional": True}, # we search on it
+ {"name": "get_absolute_url_online", "type": "string", "index": False},
+ {"name": "thumbnail", "type": "string", "index": False, "optional": True},
+ {"name": "weight", "type": "float"}, # we sort on it
+ ]
- return mapping
+ return search_engine_schema
@classmethod
- def get_es_django_indexable(cls, force_reindexing=False):
+ def get_indexable_objects(cls, force_reindexing=False):
"""Overridden to remove must_redirect=True (and prefetch stuffs)."""
- q = super().get_es_django_indexable(force_reindexing)
+ q = super().get_indexable_objects(force_reindexing)
return (
q.prefetch_related("content")
.prefetch_related("content__tags")
@@ -1034,14 +1062,14 @@ def get_es_django_indexable(cls, force_reindexing=False):
)
@classmethod
- def get_es_indexable(cls, force_reindexing=False):
+ def get_indexable(cls, force_reindexing=False):
"""Overridden to also include chapters"""
- index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
+ search_engine_manager = SearchIndexManager()
# fetch initial batch
last_pk = 0
- objects_source = super().get_es_indexable(force_reindexing)
+ objects_source = super().get_indexable(force_reindexing)
objects = list(objects_source.filter(pk__gt=last_pk)[: PublishedContent.objects_per_batch])
while objects:
@@ -1052,15 +1080,12 @@ def get_es_indexable(cls, force_reindexing=False):
# chapters are only indexed for middle and big tuto
if versioned.has_sub_containers():
- # delete possible previous chapters
- if content.es_already_indexed:
- index_manager.delete_by_query(
- FakeChapter.get_es_document_type(), ES_Q("match", _routing=content.es_id)
- )
-
+ # if the content was already indexed, delete its previously indexed chapters
+ if not content.search_engine_requires_index:
+ FakeChapter.remove_from_search_engine(search_engine_manager, content.search_engine_id)
# (re)index the new one(s)
for chapter in versioned.get_list_of_chapters():
- chapters.append(FakeChapter(chapter, versioned, content.es_id))
+ chapters.append(FakeChapter(chapter, versioned, content.search_engine_id))
if chapters:
# since we want to return at most PublishedContent.objects_per_batch items
@@ -1075,68 +1100,101 @@ def get_es_indexable(cls, force_reindexing=False):
last_pk = objects[-1].pk
objects = list(objects_source.filter(pk__gt=last_pk)[: PublishedContent.objects_per_batch])
- def get_es_document_source(self, excluded_fields=None):
+ def get_document_source(self, excluded_fields=None):
"""Overridden to handle the fact that most information are versioned"""
- excluded_fields = excluded_fields or []
- excluded_fields.extend(["title", "description", "tags", "categories", "text", "thumbnail", "picked"])
+ excluded_fields.extend(["title", "description", "tags", "categories", "text", "thumbnail", "publication_date"])
- data = super().get_es_document_source(excluded_fields=excluded_fields)
+ data = super().get_document_source(excluded_fields=excluded_fields)
# fetch versioned information
versioned = self.load_public_version()
data["title"] = versioned.title
data["description"] = versioned.description
- data["tags"] = [tag.title for tag in versioned.tags.all()]
+ data["publication_date"] = date_to_timestamp_int(self.publication_date)
+
+ data["tags"] = []
+ data["tag_slugs"] = []
+ for tag in versioned.tags.all():
+ data["tags"].append(tag.title)
+ data["tag_slugs"].append(tag.slug) # store also slugs to have them from search results
if self.content.image:
data["thumbnail"] = self.content.image.physical["content_thumb"].url
- categories = []
- subcategories = []
+ data["categories"] = []
+ data["subcategories"] = []
for subcategory in versioned.subcategory.all():
parent_category = subcategory.get_parent_category()
- if subcategory.slug not in subcategories:
- subcategories.append(subcategory.slug)
- if parent_category and parent_category.slug not in categories:
- categories.append(parent_category.slug)
-
- data["categories"] = categories
- data["subcategories"] = subcategories
+ if subcategory.slug not in data["subcategories"]:
+ data["subcategories"].append(subcategory.slug)
+ if parent_category and parent_category.slug not in data["categories"]:
+ data["categories"].append(parent_category.slug)
if versioned.has_extracts():
- data["text"] = versioned.get_content_online()
- data["has_chapters"] = False
- else:
- data["has_chapters"] = True
+ data["text"] = clean_html(versioned.get_content_online())
- data["picked"] = False
-
- if self.content_type == "OPINION" and self.content.sha_picked is not None:
- data["picked"] = True
+ is_medium_big_tutorial = versioned.has_sub_containers()
+ data["weight"] = self._compute_search_weight(is_medium_big_tutorial)
return data
+ def _compute_search_weight(self, is_medium_big_tutorial: bool):
+ """
+ This function calculates a weight for publishedcontent in order to sort them according to different boosts.
+ There is a boost according to the type of content (article, opinion, tutorial),
+ if it is a big tutorial or if it is picked.
+ """
+ weights = settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
+
+ if self.content_type == "ARTICLE":
+ return weights["if_article"]
+ elif self.content_type == "TUTORIAL":
+ if is_medium_big_tutorial:
+ return weights["if_medium_or_big_tutorial"]
+ else:
+ return weights["if_tutorial"]
+ else:
+ assert self.content_type == "OPINION"
+ if self.content.sha_picked is not None:
+ return weights["if_opinion"]
+ else:
+ return weights["if_opinion_not_picked"]
+
+ @classmethod
+ def get_search_query(cls):
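+ # With the default weights above this evaluates to
+ # query_by_weights = "8,4,2,2,2,4" (the global weight of 2
+ # multiplied by each per-field factor).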
+ weights = settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
+ return {
+ "query_by": "title,description,categories,subcategories,tags,text",
+ "query_by_weights": "{},{},{},{},{},{}".format(
+ weights["title"],
+ weights["description"],
+ weights["categories"],
+ weights["subcategories"],
+ weights["tags"],
+ weights["text"],
+ ),
+ "sort_by": "weight:desc",
+ }
+
@receiver(pre_delete, sender=PublishedContent)
-def delete_published_content_in_elasticsearch(sender, instance, **kwargs):
- """Catch the pre_delete signal to ensure the deletion in ES. Also, handle the deletion of the corresponding
- chapters.
+def delete_published_content_in_search_engine(sender, instance, **kwargs):
+ """Catch the pre_delete signal to ensure the deletion in the search engine.
+ Also, handle the deletion of the corresponding chapters.
"""
- index_manager = ESIndexManager(**settings.ES_SEARCH_INDEX)
-
- if index_manager.index_exists:
- index_manager.delete_by_query(FakeChapter.get_es_document_type(), ES_Q("match", _routing=instance.es_id))
+ search_engine_manager = SearchIndexManager()
- return delete_document_in_elasticsearch(instance)
+ FakeChapter.remove_from_search_engine(search_engine_manager, instance.search_engine_id)
+ search_engine_manager.delete_document(instance)
@receiver(pre_save, sender=PublishedContent)
-def delete_published_content_in_elasticsearch_if_set_to_redirect(sender, instance, **kwargs):
- """If the slug of the content changes, the ``must_redirect`` field is set to ``True`` and a new
- PublishedContnent is created. To avoid duplicates, the previous ones must be removed from ES.
+def delete_published_content_in_search_engine_if_set_to_redirect(sender, instance, **kwargs):
+ """If the slug of the content changes, the ``must_redirect`` field is set
+ to ``True`` and a new PublishedContent is created. To avoid duplicates,
+ the previous ones must be removed from the search engine.
"""
try:
@@ -1145,15 +1203,15 @@ def delete_published_content_in_elasticsearch_if_set_to_redirect(sender, instanc
pass # nothing to worry about
else:
if not obj.must_redirect and instance.must_redirect:
- delete_published_content_in_elasticsearch(sender, instance, **kwargs)
+ delete_published_content_in_search_engine(sender, instance, **kwargs)
-class FakeChapter(AbstractESIndexable):
- """A simple class that is used by ES to index chapters, constructed from the containers.
+class FakeChapter(AbstractSearchIndexable):
+ """A simple class that is used by Typesense to index chapters, constructed from the containers.
- In mapping, this class defines PublishedContent as its parent. Also, indexing is done by the parent.
+ In its schema, this class defines PublishedContent as its parent. Also, indexing is done by the parent.
- Note that this class is only indexable, not updatable, since it does not maintain value of ``es_already_indexed``
+ Note that this class is only indexable, not updatable, since it cannot maintain a value of ``search_engine_requires_index``.
"""
parent_model = PublishedContent
@@ -1174,7 +1232,9 @@ def __init__(self, chapter, main_container, parent_id):
self.parent_id = parent_id
self.get_absolute_url_online = chapter.get_absolute_url_online()
- self.es_id = main_container.slug + "__" + chapter.slug # both slugs are unique by design, so id remains unique
+ self.search_engine_id = (
+ main_container.slug + "__" + chapter.slug
+ ) # both slugs are unique by design, so id remains unique
self.parent_title = main_container.title
self.parent_get_absolute_url_online = main_container.get_absolute_url_online()
@@ -1193,36 +1253,56 @@ def __init__(self, chapter, main_container, parent_id):
self.categories.append(parent_category.slug)
@classmethod
- def get_es_document_type(cls):
+ def get_search_document_type(cls):
return "chapter"
@classmethod
- def get_es_mapping(self):
- """Define mapping and parenting"""
+ def get_search_document_schema(self):
+ search_engine_schema = super().get_search_document_schema()
+
+ search_engine_schema["fields"] = [
+ {"name": "parent_id", "type": "string", "facet": False}, # we filter on it when content is removed
+ {"name": "title", "type": "string", "facet": False}, # we search on it
+ {"name": "parent_title", "type": "string", "index": False},
+ {"name": "parent_publication_date", "type": "int64", "index": False},
+ {"name": "text", "type": "string", "facet": False}, # we search on it
+ {"name": "get_absolute_url_online", "type": "string", "index": False},
+ {"name": "parent_get_absolute_url_online", "type": "string", "index": False},
+ {"name": "thumbnail", "type": "string", "index": False},
+ {"name": "weight", "type": "float", "facet": False}, # we sort on it
+ ]
- mapping = Mapping(self.get_es_document_type())
- mapping.meta("parent", type="publishedcontent")
+ return search_engine_schema
- mapping.field("title", Text(boost=1.5))
- mapping.field("text", Text())
- mapping.field("categories", Keyword(boost=1.5))
- mapping.field("subcategories", Keyword(boost=1.5))
+ def get_document_source(self, excluded_fields=None):
+ """Overridden to handle the fact that most information are versioned"""
- # not indexed:
- mapping.field("get_absolute_url_online", Keyword(index=False))
- mapping.field("parent_title", Text(index=False))
- mapping.field("parent_get_absolute_url_online", Keyword(index=False))
- mapping.field("parent_publication_date", Date(index=False))
- mapping.field("thumbnail", Keyword(index=False))
+ excluded_fields.extend(["text"])
- return mapping
+ data = super().get_document_source(excluded_fields=excluded_fields)
+ data["parent_publication_date"] = date_to_timestamp_int(self.parent_publication_date)
+ data["weight"] = settings.ZDS_APP["search"]["boosts"]["chapter"]["global"]
+ data["text"] = clean_html(self.text)
- def get_es_document_as_bulk_action(self, index, action="index"):
- """Overridden to handle parenting between chapter and PublishedContent"""
+ return data
+
+ @classmethod
+ def get_search_query(cls):
+ weights = settings.ZDS_APP["search"]["boosts"]["chapter"]
+ return {
+ "query_by": "title,text",
+ "query_by_weights": "{},{}".format(
+ weights["title"],
+ weights["text"],
+ ),
+ "sort_by": "weight:desc",
+ }
+
+ @classmethod
+ def remove_from_search_engine(cls, search_engine_manager: SearchIndexManager, parent_search_engine_id: str):
+ filter_by = SearchFilter()
+ filter_by.add_exact_filter("parent_id", parent_search_engine_id)
- document = super().get_es_document_as_bulk_action(index, action)
- document["_parent"] = self.parent_id
- return document
+ search_engine_manager.delete_by_query(cls.get_search_document_type(), {"filter_by": str(filter_by)})
class ContentReaction(Comment):
diff --git a/zds/tutorialv2/publication_utils.py b/zds/tutorialv2/publication_utils.py
index 48e5efc387..59fd7a5a47 100644
--- a/zds/tutorialv2/publication_utils.py
+++ b/zds/tutorialv2/publication_utils.py
@@ -106,7 +106,6 @@ def publish_content(db_object, versioned, is_major_update=True):
public_version.content_type = versioned.type
public_version.content_pk = db_object.pk
public_version.content = db_object
- public_version.must_reindex = True
public_version.char_count = char_count
public_version.save()
with contextlib.suppress(FileExistsError):
diff --git a/zds/tutorialv2/tests/__init__.py b/zds/tutorialv2/tests/__init__.py
index 157afab775..af18f67a13 100644
--- a/zds/tutorialv2/tests/__init__.py
+++ b/zds/tutorialv2/tests/__init__.py
@@ -21,8 +21,8 @@ class override_for_contents(override_settings):
def __init__(self, **kwargs):
kwargs.update(MEDIA_ROOT=settings.BASE_DIR / "media-test", ZDS_APP=overridden_zds_app)
- if "ES_ENABLED" not in kwargs:
- kwargs.update(ES_ENABLED=False)
+ if "SEARCH_ENABLED" not in kwargs:
+ kwargs.update(SEARCH_ENABLED=False)
super().__init__(**kwargs)
diff --git a/zds/tutorialv2/tests/tests_front.py b/zds/tutorialv2/tests/tests_front.py
index 5d8eb5931f..d3322afd39 100644
--- a/zds/tutorialv2/tests/tests_front.py
+++ b/zds/tutorialv2/tests/tests_front.py
@@ -30,7 +30,7 @@
@override_settings(MEDIA_ROOT=settings.BASE_DIR / "media-test")
@override_settings(ZDS_APP=overridden_zds_app)
-@override_settings(ES_ENABLED=False)
+@override_settings(SEARCH_ENABLED=False)
@tag("front")
class PublicationFronttest(StaticLiveServerTestCase, TutorialTestMixin, TutorialFrontMixin):
@classmethod
diff --git a/zds/tutorialv2/tests/tests_views/tests_published.py b/zds/tutorialv2/tests/tests_views/tests_published.py
index a09c5af73f..5277aa8fdd 100644
--- a/zds/tutorialv2/tests/tests_views/tests_published.py
+++ b/zds/tutorialv2/tests/tests_views/tests_published.py
@@ -46,7 +46,7 @@
@override_settings(MEDIA_ROOT=settings.BASE_DIR / "media-test")
@override_settings(ZDS_APP=overridden_zds_app)
-@override_settings(ES_ENABLED=False)
+@override_settings(SEARCH_ENABLED=False)
class PublishedContentTests(TutorialTestMixin, TestCase):
def setUp(self):
self.overridden_zds_app = overridden_zds_app
diff --git a/zds/tutorialv2/tests/tests_views/tests_stats.py b/zds/tutorialv2/tests/tests_views/tests_stats.py
index b12c613ffc..30b973ce07 100644
--- a/zds/tutorialv2/tests/tests_views/tests_stats.py
+++ b/zds/tutorialv2/tests/tests_views/tests_stats.py
@@ -30,7 +30,7 @@ def daterange(start_date, end_date):
@override_settings(MEDIA_ROOT=settings.BASE_DIR / "media-test")
@override_settings(ZDS_APP=overridden_zds_app)
-@override_settings(ES_ENABLED=False)
+@override_settings(SEARCH_ENABLED=False)
class StatTests(TestCase, TutorialTestMixin):
def setUp(self):
self.nb_part = 1
diff --git a/zds/tutorialv2/views/validations_opinions.py b/zds/tutorialv2/views/validations_opinions.py
index 3eb7409c99..6cad0b931f 100644
--- a/zds/tutorialv2/views/validations_opinions.py
+++ b/zds/tutorialv2/views/validations_opinions.py
@@ -308,7 +308,7 @@ def form_valid(self, form):
db_object.save()
# mark to reindex to boost correctly in the search
- self.public_content_object.es_flagged = True
+ self.public_content_object.search_engine_requires_index = True
self.public_content_object.save()
PickListOperation.objects.create(
content=self.object,
@@ -385,7 +385,7 @@ def form_valid(self, form):
self.request.user
)
# mark to reindex to boost correctly in the search
- self.public_content_object.es_flagged = True
+ self.public_content_object.search_engine_requires_index = True
self.public_content_object.save()
msg = render_to_string(
diff --git a/zds/urls.py b/zds/urls.py
index 4e514144ba..02c7846b26 100644
--- a/zds/urls.py
+++ b/zds/urls.py
@@ -88,7 +88,7 @@ def location(self, item):
re_path(r"^admin/", admin.site.urls),
path("pages/", include("zds.pages.urls")),
path("galerie/", include("zds.gallery.urls")),
- path("rechercher/", include("zds.searchv2.urls")),
+ path("rechercher/", include("zds.search.urls")),
path("munin/", include(("zds.munin.urls", "munin"), namespace="munin")),
path("mise-en-avant/", include("zds.featured.urls")),
path("notifications/", include("zds.notification.urls")),
diff --git a/zds/utils/templatetags/date.py b/zds/utils/templatetags/date.py
index 91759df4ce..a14b34dc9c 100644
--- a/zds/utils/templatetags/date.py
+++ b/zds/utils/templatetags/date.py
@@ -89,12 +89,3 @@ def date_from_timestamp(timestamp):
"""Convert a timestamp (number of second from epoch) to a datetime object,
another filter should then be used to format the datetime object."""
return datetime.fromtimestamp(timestamp)
-
-
-@register.filter
-def from_elasticsearch_date(value):
- try:
- date = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
- except ValueError:
- date = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S")
- return date
diff --git a/zds/utils/templatetags/elasticsearch.py b/zds/utils/templatetags/elasticsearch.py
deleted file mode 100644
index cf8da0257d..0000000000
--- a/zds/utils/templatetags/elasticsearch.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import re
-
-from django import template
-
-register = template.Library()
-
-html_tag = re.compile(r"<.*?>")
-
-
-def format_highlight(highlighted_fragments):
- """Strip HTML, then transform back into html with highlighted fragments only.
-
- :param highlighted_fragments: list of fragments from elasticsearch
- :type highlighted_fragments: list
- :return: the formatted string
- :rtype: str
- """
-
- fragments = []
- for fragment in highlighted_fragments:
- if fragment:
- fragments.append(
- html_tag.sub("", fragment).replace("[hl]", '').replace("[/hl]", "")
- )
-
- return " … ".join(fragments)
-
-
-class HighlightNode(template.Node):
- """For a elasticsearch result, looks into ``.meta.highlight`` if something has been highlighted. If so, use that
- information. Otherwise, just give back the text.
-
- See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-highlighting.html
-
- Note that the class expects ``"pre_tags" : ["[hl]"], "post_tags" : ["[/hl]"]``, since all HTML is stripped.
- """
-
- def __init__(self, search_result, field):
- self.search_result = search_result
- self.field = field
-
- def render(self, context):
- search_result = context[self.search_result]
-
- if self.field[0] in ['"', "'"]:
- field = self.field[1:-1]
- else:
- field = template.Variable(self.field).resolve(context)
-
- if field not in search_result:
- raise template.VariableDoesNotExist(f"field {field} is not a member of the search result")
-
- text = ""
-
- if search_result[field]:
- text = html_tag.sub("", search_result[field])
-
- if "highlight" in search_result.meta:
- if field in search_result.meta.highlight:
- text = format_highlight(search_result.meta.highlight[field])
-
- return text
-
-
-@register.tag
-def highlight(parser, token):
- part = token.split_contents()
-
- if len(part) != 3:
- raise template.TemplateSyntaxError(
- "'highlight' tag must be of the form: {% highlight %}"
- )
-
- return HighlightNode(part[1], part[2])