From 4281c594a721c1e92cc75abe03e417ef22fc3338 Mon Sep 17 00:00:00 2001
From: Lucaterre
Date: Wed, 5 Oct 2022 09:53:21 +0200
Subject: [PATCH] Catch Attribute exception in updated_entity + update doc

---
 CONTRIBUTE.md => CONTRIBUTING.md      |  0
 README.md                             | 21 ++----
 examples/process_long_text.py         | 97 ---------------------------
 spacyfishing/entity_fishing_linker.py | 42 ++++++------
 4 files changed, 30 insertions(+), 130 deletions(-)
 rename CONTRIBUTE.md => CONTRIBUTING.md (100%)
 delete mode 100644 examples/process_long_text.py

diff --git a/CONTRIBUTE.md b/CONTRIBUTING.md
similarity index 100%
rename from CONTRIBUTE.md
rename to CONTRIBUTING.md
diff --git a/README.md b/README.md
index e4754d1..5cb54aa 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,6 @@ This extension allows using entity-fishing tool as a spaCy pipeline component to
   - [Get extra information from Wikidata](#Get-extra-information-from-Wikidata)
   - [Use other language](#Use-other-language)
   - [Get information about entity fishing API response](#Get-information-about-entity-fishing-API-response)
-  - [How to process a long text?](#How-to-process-a-long-text?)
 * [Configuration parameters](#Configuration-parameters)
 * [Attributes](#Attributes)
 * [Recommendations](#Recommendations)
@@ -496,18 +495,6 @@ doc._.metadata
 }
 ```
 
-### How to process a long text?
-
-Process NER and disambiguate a long text can be really tricky.
-In fact, spaCy can be raised an exception due to the default limit parameter `nlp.max_length`.
-The strategy here is to pass a text as batch of sentences with [`nlp.pipe()`](https://spacy.io/api/language#pipe) method and,
-then pass entities to spacyfishing with all context (not only the sentences, to help disambiguation) and
-all entities with continuous characters offsets (start and end characters positions are re-calculated).
-You can use a provided script [`process_long_text.py`](examples/process_long_text.py) that can help to process huge text.
-For example, a text with `2 073` sentences that contains `12 901` entities to disambiguate can be processed in about a minute (with no extra information)
-and in less than 1 minute 30 (with extra information and properties filter applied).
-
-
 ## Configuration parameters
 
 ```
@@ -618,4 +605,10 @@ Entity-fishing is tool created by [Patrice Lopez](https://github.com/kermitt2) (
 
 Awesome logo designed by [Alix Chagué](https://github.com/alix-tz).
 
-Special thanks to [@HugoSchtr](https://github.com/HugoSchtr), [@gromag](https://github.com/gromag) for documentation review.
+Special thanks to
+
+- Documentation review:
+[@HugoSchtr](https://github.com/HugoSchtr), [@gromag](https://github.com/gromag)
+
+- Code contribution:
+[@davidberenstein1957](https://github.com/davidberenstein1957)
diff --git a/examples/process_long_text.py b/examples/process_long_text.py
deleted file mode 100644
index 03362e3..0000000
--- a/examples/process_long_text.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# -*- coding: UTF-8 -*-
-#!/usr/bin/env python3
-
-
-"""
-process_long_text.py
-
-This script is a customizable example that allows you to work on large texts.
-
-Process:
-
-    1. Create a custom preprocess text function
-    (sentences tokenization and clean processes)
-    2. Create a pipeline with two components:
-        - custom_ner: an intermediate pipeline apply ner with batch
-        method Language.pipe() and recompute entities offsets in context of all text;
-        - spacy_fishing: apply disambiguation and linking on all preprocess text (to keep more context as possible);
-    3. Apply complete pipeline on text and retrieve results.
-"""
-
-import time
-
-import spacy
-from spacy import Language
-from spacy.tokens import Doc
-
-
-def open_file(file_name: str) -> str:
-    with open(file_name, mode="r", encoding="utf-8") as f:
-        return f.read()
-
-# use the tokenizer and apply the cleaning functions
-# of your choice
-def text_preprocessor(text: str) -> list:
-    return [sentence.strip() for sentence in text.split("\n") if sentence != ""]
-
-
-@Language.factory("custom_ner", default_config={
-    "model_name": "",
-    "sentences_to_process": []
-})
-class CustomNer:
-    def __init__(self,
-                 nlp: Language,
-                 name: str,
-                 model_name: str,
-                 sentences_to_process: list):
-        self.nlp = nlp
-        self.pipeline_ner = spacy.load(model_name, disable=["tok2vec", "morphologizer", "parser", "senter", "attribute_ruler", "lemmatizer"])
-        self.sentences = sentences_to_process
-
-    def __call__(self, doc: Doc):
-        start_sentence = 0
-        spans = []
-        for sent in self.pipeline_ner.pipe(self.sentences):
-            # add 1 char that correspond to space added in
-            # sentences concatenation (" ".join())
-            end_sentence = start_sentence + len(sent.text) + 1
-            # recompute named entities characters offsets
-            for ent in sent.ents:
-                start = start_sentence + ent.start_char
-                end = start + len(ent.text)
-                spans.append(doc.char_span(start, end, label=ent.label_))
-            start_sentence = end_sentence
-
-        doc.set_ents(spans)
-
-        return doc
-
-
-if __name__ == '__main__':
-    start_time = time.time()
-    # Set model, language, file that contains text to analyze
-    model = "en_core_web_sm"
-    language = "en"
-    filename = "data/text_en.txt"
-
-    # Apply text preprocessing
-    sentences = text_preprocessor(open_file(filename))
-    huge_text = " ".join(sentences)
-
-    print(f"* Total characters in document : {len(huge_text)}")
-    print(f"* Total sentences in document : {len(sentences)}")
-
-    # Create pipeline
-    huge_pipeline_linking = spacy.blank(language)
-    huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
-    huge_pipeline_linking.add_pipe('entityfishing', config={"language": language})
-
-    # Apply pipeline
-    doc_linked = huge_pipeline_linking(huge_text)
-
-    # Test
-    for ent in doc_linked.ents:
-        print(ent.text, ent.label_, ent._.kb_qid)
-
-    print("--- %s seconds ---" % (time.time() - start_time))
\ No newline at end of file
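The entity_fishing_linker.py change below wraps the whole per-entity block in a
try/except AttributeError. The motivation: spaCy's `Doc.char_span()` returns `None`
when the character offsets reported by the entity-fishing service do not line up
with token boundaries, and setting an extension attribute on `None` then raises
`AttributeError`, which previously escaped `updated_entities()` and aborted the
component. The following standalone sketch (not part of the patch; the entity
payload is a made-up example) reproduces that failure mode with plain spaCy:

    # Illustrative only: a hypothetical entity-fishing payload whose offsets
    # cut through the token "Paris", so Doc.char_span() cannot build a Span.
    import spacy
    from spacy.tokens import Span

    Span.set_extension("kb_qid", default=None)

    nlp = spacy.blank("en")
    doc = nlp("Paris is the capital of France.")

    entity = {"offsetStart": 0, "offsetEnd": 3, "wikidataId": "Q90"}

    span = doc.char_span(start_idx=entity["offsetStart"],
                         end_idx=entity["offsetEnd"])
    print(span)  # None: offsets do not align with token boundaries

    try:
        span._.kb_qid = str(entity["wikidataId"])
    except AttributeError:
        # 'NoneType' object has no attribute '_': the patched updated_entities()
        # now catches this and skips the entity instead of crashing.
        pass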
""" +import requests import concurrent.futures import json import logging + from email import iterators from typing import List, Tuple -import requests from spacy import util from spacy.language import Language from spacy.tokens import Doc, Span @@ -243,24 +244,27 @@ def updated_entities(self, doc: Doc, response: list) -> None: :type response: list """ for entity in response: - span = doc.char_span(start_idx=entity['offsetStart'], - end_idx=entity['offsetEnd']) try: - span._.kb_qid = str(entity['wikidataId']) - span._.url_wikidata = self.wikidata_url_base + span._.kb_qid - except KeyError: - pass - try: - span._.wikipedia_page_ref = str(entity["wikipediaExternalRef"]) - # if flag_extra : search other info on entity - # => attach extra entity info to span - if self.flag_extra: - self.look_extra_informations_on_entity(span, entity) - except KeyError: - pass - try: - span._.nerd_score = entity['confidence_score'] - except KeyError: + span = doc.char_span(start_idx=entity['offsetStart'], + end_idx=entity['offsetEnd']) + try: + span._.kb_qid = str(entity['wikidataId']) + span._.url_wikidata = self.wikidata_url_base + span._.kb_qid + except KeyError: + pass + try: + span._.wikipedia_page_ref = str(entity["wikipediaExternalRef"]) + # if flag_extra : search other info on entity + # => attach extra entity info to span + if self.flag_extra: + self.look_extra_informations_on_entity(span, entity) + except KeyError: + pass + try: + span._.nerd_score = entity['confidence_score'] + except KeyError: + pass + except AttributeError: pass # ~ Entity-fishing call service methods ~: @@ -279,7 +283,7 @@ def concept_look_up_batch(self, wiki_id_batch: str) -> List[requests.Response]: params=self.language, verbose=self.verbose) - def disambiguate_text_batch(self, files_batch: List[dict]) -> requests.Response: + def disambiguate_text_batch(self, files_batch: List[dict]) -> List[requests.Response]: """ > The function `disambiguate_text_batch` takes a list of dictionaries as input, where each dictionary contains the text to be disambiguated and the corresponding language. The function