From 4281c594a721c1e92cc75abe03e417ef22fc3338 Mon Sep 17 00:00:00 2001
From: Lucaterre
Date: Wed, 5 Oct 2022 09:53:21 +0200
Subject: [PATCH] Catch Attribute exception in updated_entity + update doc

---
 CONTRIBUTE.md => CONTRIBUTING.md      |  0
 README.md                             | 21 ++----
 examples/process_long_text.py         | 97 ---------------------------
 spacyfishing/entity_fishing_linker.py | 42 ++++++------
 4 files changed, 30 insertions(+), 130 deletions(-)
 rename CONTRIBUTE.md => CONTRIBUTING.md (100%)
 delete mode 100644 examples/process_long_text.py

diff --git a/CONTRIBUTE.md b/CONTRIBUTING.md
similarity index 100%
rename from CONTRIBUTE.md
rename to CONTRIBUTING.md
diff --git a/README.md b/README.md
index e4754d1..5cb54aa 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,6 @@ This extension allows using entity-fishing tool as a spaCy pipeline component to
   - [Get extra information from Wikidata](#Get-extra-information-from-Wikidata)
   - [Use other language](#Use-other-language)
   - [Get information about entity fishing API response](#Get-information-about-entity-fishing-API-response)
-  - [How to process a long text?](#How-to-process-a-long-text?)
 * [Configuration parameters](#Configuration-parameters)
 * [Attributes](#Attributes)
 * [Recommendations](#Recommendations)
@@ -496,18 +495,6 @@ doc._.metadata
 }
 ```
 
-### How to process a long text?
-
-Process NER and disambiguate a long text can be really tricky.
-In fact, spaCy can be raised an exception due to the default limit parameter `nlp.max_length`.
-The strategy here is to pass a text as batch of sentences with [`nlp.pipe()`](https://spacy.io/api/language#pipe) method and,
-then pass entities to spacyfishing with all context (not only the sentences, to help disambiguation) and
-all entities with continuous characters offsets (start and end characters positions are re-calculated).
-You can use a provided script [`process_long_text.py`](examples/process_long_text.py) that can help to process huge text.
-For example, a text with `2 073` sentences that contains `12 901` entities to disambiguate can be processed in about a minute (with no extra information)
-and in less than 1 minute 30 (with extra information and properties filter applied).
-
-
 ## Configuration parameters
 
 ```
@@ -618,4 +605,10 @@ Entity-fishing is tool created by [Patrice Lopez](https://github.com/kermitt2) (
 
 Awesome logo designed by [Alix Chagué](https://github.com/alix-tz).
 
-Special thanks to [@HugoSchtr](https://github.com/HugoSchtr), [@gromag](https://github.com/gromag) for documentation review.
+Special thanks to
+
+- Documentation review:
+[@HugoSchtr](https://github.com/HugoSchtr), [@gromag](https://github.com/gromag)
+
+- Code contribution:
+[@davidberenstein1957](https://github.com/davidberenstein1957)
diff --git a/examples/process_long_text.py b/examples/process_long_text.py
deleted file mode 100644
index 03362e3..0000000
--- a/examples/process_long_text.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# -*- coding: UTF-8 -*-
-#!/usr/bin/env python3
-
-
-"""
-process_long_text.py
-
-This script is a customizable example that allows you to work on large texts.
-
-Process:
-
-    1. Create a custom preprocess text function
-    (sentences tokenization and clean processes)
-    2. Create a pipeline with two components:
-        - custom_ner: an intermediate pipeline apply ner with batch
-        method Language.pipe() and recompute entities offsets in context of all text;
-        - spacy_fishing: apply disambiguation and linking on all preprocess text (to keep more context as possible);
-    3. Apply complete pipeline on text and retrieve results.
-"""
-
-import time
-
-import spacy
-from spacy import Language
-from spacy.tokens import Doc
-
-
-def open_file(file_name: str) -> str:
-    with open(file_name, mode="r", encoding="utf-8") as f:
-        return f.read()
-
-# use the tokenizer and apply the cleaning functions
-# of your choice
-def text_preprocessor(text: str) -> list:
-    return [sentence.strip() for sentence in text.split("\n") if sentence != ""]
-
-
-@Language.factory("custom_ner", default_config={
-    "model_name": "",
-    "sentences_to_process": []
-})
-class CustomNer:
-    def __init__(self,
-                 nlp: Language,
-                 name: str,
-                 model_name: str,
-                 sentences_to_process: list):
-        self.nlp = nlp
-        self.pipeline_ner = spacy.load(model_name, disable=["tok2vec", "morphologizer", "parser", "senter", "attribute_ruler", "lemmatizer"])
-        self.sentences = sentences_to_process
-
-    def __call__(self, doc: Doc):
-        start_sentence = 0
-        spans = []
-        for sent in self.pipeline_ner.pipe(self.sentences):
-            # add 1 char that correspond to space added in
-            # sentences concatenation (" ".join())
-            end_sentence = start_sentence + len(sent.text) + 1
-            # recompute named entities characters offsets
-            for ent in sent.ents:
-                start = start_sentence + ent.start_char
-                end = start + len(ent.text)
-                spans.append(doc.char_span(start, end, label=ent.label_))
-            start_sentence = end_sentence
-
-        doc.set_ents(spans)
-
-        return doc
-
-
-if __name__ == '__main__':
-    start_time = time.time()
-    # Set model, language, file that contains text to analyze
-    model = "en_core_web_sm"
-    language = "en"
-    filename = "data/text_en.txt"
-
-    # Apply text preprocessing
-    sentences = text_preprocessor(open_file(filename))
-    huge_text = " ".join(sentences)
-
-    print(f"* Total characters in document : {len(huge_text)}")
-    print(f"* Total sentences in document : {len(sentences)}")
-
-    # Create pipeline
-    huge_pipeline_linking = spacy.blank(language)
-    huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
-    huge_pipeline_linking.add_pipe('entityfishing', config={"language": language})
-
-    # Apply pipeline
-    doc_linked = huge_pipeline_linking(huge_text)
-
-    # Test
-    for ent in doc_linked.ents:
-        print(ent.text, ent.label_, ent._.kb_qid)
-
-    print("--- %s seconds ---" % (time.time() - start_time))
\ No newline at end of file
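The entity_fishing_linker.py change below wraps the whole per-entity block in a
try/except AttributeError. The motivation: spaCy's `Doc.char_span()` returns `None`
when the character offsets reported by the entity-fishing service do not line up
with token boundaries, and setting an extension attribute on `None` then raises
`AttributeError`, which previously escaped `updated_entities()` and aborted the
component. The following standalone sketch (not part of the patch; the entity
payload is a made-up example) reproduces that failure mode with plain spaCy:

    # Illustrative only: a hypothetical entity-fishing payload whose offsets
    # cut through the token "Paris", so Doc.char_span() cannot build a Span.
    import spacy
    from spacy.tokens import Span

    Span.set_extension("kb_qid", default=None)

    nlp = spacy.blank("en")
    doc = nlp("Paris is the capital of France.")

    entity = {"offsetStart": 0, "offsetEnd": 3, "wikidataId": "Q90"}

    span = doc.char_span(start_idx=entity["offsetStart"],
                         end_idx=entity["offsetEnd"])
    print(span)  # None: offsets do not align with token boundaries

    try:
        span._.kb_qid = str(entity["wikidataId"])
    except AttributeError:
        # 'NoneType' object has no attribute '_': the patched updated_entities()
        # now catches this and skips the entity instead of crashing.
        pass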
""" +import requests import concurrent.futures import json import logging + from email import iterators from typing import List, Tuple -import requests from spacy import util from spacy.language import Language from spacy.tokens import Doc, Span @@ -243,24 +244,27 @@ def updated_entities(self, doc: Doc, response: list) -> None: :type response: list """ for entity in response: - span = doc.char_span(start_idx=entity['offsetStart'], - end_idx=entity['offsetEnd']) try: - span._.kb_qid = str(entity['wikidataId']) - span._.url_wikidata = self.wikidata_url_base + span._.kb_qid - except KeyError: - pass - try: - span._.wikipedia_page_ref = str(entity["wikipediaExternalRef"]) - # if flag_extra : search other info on entity - # => attach extra entity info to span - if self.flag_extra: - self.look_extra_informations_on_entity(span, entity) - except KeyError: - pass - try: - span._.nerd_score = entity['confidence_score'] - except KeyError: + span = doc.char_span(start_idx=entity['offsetStart'], + end_idx=entity['offsetEnd']) + try: + span._.kb_qid = str(entity['wikidataId']) + span._.url_wikidata = self.wikidata_url_base + span._.kb_qid + except KeyError: + pass + try: + span._.wikipedia_page_ref = str(entity["wikipediaExternalRef"]) + # if flag_extra : search other info on entity + # => attach extra entity info to span + if self.flag_extra: + self.look_extra_informations_on_entity(span, entity) + except KeyError: + pass + try: + span._.nerd_score = entity['confidence_score'] + except KeyError: + pass + except AttributeError: pass # ~ Entity-fishing call service methods ~: @@ -279,7 +283,7 @@ def concept_look_up_batch(self, wiki_id_batch: str) -> List[requests.Response]: params=self.language, verbose=self.verbose) - def disambiguate_text_batch(self, files_batch: List[dict]) -> requests.Response: + def disambiguate_text_batch(self, files_batch: List[dict]) -> List[requests.Response]: """ > The function `disambiguate_text_batch` takes a list of dictionaries as input, where each dictionary contains the text to be disambiguated and the corresponding language. The function