From 1c85e086966881b9cf82f3ea2093d978bd94a67b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Sat, 11 Jan 2025 15:17:48 +0100 Subject: [PATCH 1/2] Add options to load full documents as Sentence objects --- flair/datasets/sequence_labeling.py | 25 +++++++- .../trivial_bioes_with_boundaries/dev.txt | 37 ++++++++++++ .../trivial_bioes_with_boundaries/test.txt | 39 ++++++++++++ .../trivial_bioes_with_boundaries/train.txt | 59 +++++++++++++++++++ tests/test_datasets.py | 48 +++++++++++++++ 5 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt create mode 100644 tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt create mode 100644 tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index b2ab2f45dd..479e9e71e5 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -326,6 +326,8 @@ def __init__( label_name_map: Optional[dict[str, str]] = None, banned_sentences: Optional[list[str]] = None, default_whitespace_after: int = 1, + every_sentence_is_independent: bool = False, + documents_as_sentences: bool = False, **corpusargs, ) -> None: r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 
@@ -361,6 +363,8 @@ def __init__( skip_first_line=skip_first_line, label_name_map=label_name_map, default_whitespace_after=default_whitespace_after, + every_sentence_is_independent=every_sentence_is_independent, + documents_as_sentences=documents_as_sentences, ) for train_file in train_files ] @@ -385,6 +389,8 @@ def __init__( skip_first_line=skip_first_line, label_name_map=label_name_map, default_whitespace_after=default_whitespace_after, + every_sentence_is_independent=every_sentence_is_independent, + documents_as_sentences=documents_as_sentences, ) for test_file in test_files ] @@ -409,6 +415,8 @@ def __init__( skip_first_line=skip_first_line, label_name_map=label_name_map, default_whitespace_after=default_whitespace_after, + every_sentence_is_independent=every_sentence_is_independent, + documents_as_sentences=documents_as_sentences, ) for dev_file in dev_files ] @@ -481,10 +489,12 @@ def __init__( banned_sentences: Optional[list[str]] = None, in_memory: bool = True, document_separator_token: Optional[str] = None, + every_sentence_is_independent: bool = False, encoding: str = "utf-8", skip_first_line: bool = False, label_name_map: Optional[dict[str, str]] = None, default_whitespace_after: int = 1, + documents_as_sentences: bool = False, ) -> None: r"""Instantiates a column dataset. 
@@ -505,9 +515,11 @@ def __init__( self.column_delimiter = re.compile(column_delimiter) self.comment_symbol = comment_symbol self.document_separator_token = document_separator_token + self.every_sentence_is_independent = every_sentence_is_independent self.label_name_map = label_name_map self.banned_sentences = banned_sentences self.default_whitespace_after = default_whitespace_after + self.documents_as_sentences = documents_as_sentences # store either Sentence objects in memory, or only file offsets self.in_memory = in_memory @@ -702,6 +714,9 @@ def _convert_lines_to_sentence( if sentence.to_original_text() == self.document_separator_token: sentence.is_document_boundary = True + if self.every_sentence_is_independent or self.documents_as_sentences: + sentence.is_document_boundary = True + # add span labels if span_level_tag_columns: for span_column in span_level_tag_columns: @@ -818,6 +833,13 @@ def _remap_label(self, tag): return tag def __line_completes_sentence(self, line: str) -> bool: + + if self.documents_as_sentences: + if line.startswith(self.document_separator_token): + return True + else: + return False + sentence_completed = line.isspace() or line == "" return sentence_completed @@ -5035,7 +5057,8 @@ def __init__( test_file=None, column_format=columns, in_memory=in_memory, - sample_missing_splits=False, # No test data is available, so do not shrink dev data for shared task preparation! + sample_missing_splits=False, + # No test data is available, so do not shrink dev data for shared task preparation! 
**corpusargs, ) corpora.append(corpus) diff --git a/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt new file mode 100644 index 0000000000..b741ce5ab7 --- /dev/null +++ b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt @@ -0,0 +1,37 @@ +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC + +-DOCSTART- + +this O +is O +Berlin B-LOC + +here O +is O +Berlin B-LOC + +I O +like O +Berlin B-LOC + +we O +like O +Berlin B-LOC \ No newline at end of file diff --git a/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt new file mode 100644 index 0000000000..64a127bd88 --- /dev/null +++ b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt @@ -0,0 +1,39 @@ +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC + +-DOCSTART- + +this O +is O +Berlin B-LOC + +here O +is O +Berlin B-LOC + +I O +like O +Berlin B-LOC + +we O +like O +Berlin B-LOC + +-DOCSTART- \ No newline at end of file diff --git a/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt new file mode 100644 index 0000000000..4f934bcfd5 --- /dev/null +++ b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt @@ -0,0 +1,59 @@ +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC + +-DOCSTART- + +this O +is O +Berlin B-LOC + +here O +is O +Berlin B-LOC + +I O +like O +Berlin B-LOC + +we O +like O +Berlin B-LOC + +-DOCSTART- + +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like 
O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC \ No newline at end of file diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 25a99f87e0..8e6a7019b0 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -75,6 +75,54 @@ def test_load_sequence_labeling_data(tasks_base_path): assert len(corpus.test) == 1 +def test_load_sequence_labeling_data_with_boundaries(tasks_base_path): + # get training, test and dev data + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", column_format={0: "text", 1: "ner"} + ) + + assert len(corpus.train) == 14 + assert len(corpus.dev) == 9 + assert len(corpus.test) == 10 + + # now exclude -DOCSTART- sentences + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", + column_format={0: "text", 1: "ner"}, + banned_sentences=["-DOCSTART-"], + ) + + assert len(corpus.train) == 12 + assert len(corpus.dev) == 8 + assert len(corpus.test) == 8 + + assert len(corpus.train[0].right_context(5)) == 5 + + # now load whole documents as sentences + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", + column_format={0: "text", 1: "ner"}, + document_separator_token="-DOCSTART-", + documents_as_sentences=True, + ) + + assert len(corpus.train) == 3 + assert len(corpus.dev) == 2 + assert len(corpus.test) == 2 + + assert len(corpus.train[0].right_context(5)) == 0 + + # ban each boundary but set each sentence to be independent + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", + column_format={0: "text", 1: "ner"}, + banned_sentences=["-DOCSTART-"], + every_sentence_is_independent=True, + ) + + assert len(corpus.train[0].right_context(5)) == 0 + + def test_load_sequence_labeling_whitespace_after(tasks_base_path): # get training, test and dev data corpus = flair.datasets.ColumnCorpus( From 
06a5c0cad99e7576883d359dddcbc8d54bb01d9e Mon Sep 17 00:00:00 2001 From: alanakbik Date: Sat, 11 Jan 2025 15:42:13 +0100 Subject: [PATCH 2/2] Mypy fix --- flair/datasets/sequence_labeling.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 479e9e71e5..80fc6d38ba 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -521,6 +521,12 @@ def __init__( self.default_whitespace_after = default_whitespace_after self.documents_as_sentences = documents_as_sentences + if documents_as_sentences and not document_separator_token: + log.error( + "documents_as_sentences was set to True, but no document_separator_token was provided. Please set " + "a value for document_separator_token in order to enable the documents_as_sentences functionality." + ) + # store either Sentence objects in memory, or only file offsets self.in_memory = in_memory @@ -834,7 +840,7 @@ def _remap_label(self, tag): def __line_completes_sentence(self, line: str) -> bool: - if self.documents_as_sentences: + if self.documents_as_sentences and self.document_separator_token: if line.startswith(self.document_separator_token): return True else: