From 1c85e086966881b9cf82f3ea2093d978bd94a67b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Sat, 11 Jan 2025 15:17:48 +0100 Subject: [PATCH 1/2] Add options to load full documents as Sentence objects --- flair/datasets/sequence_labeling.py | 25 +++++++- .../trivial_bioes_with_boundaries/dev.txt | 37 ++++++++++++ .../trivial_bioes_with_boundaries/test.txt | 39 ++++++++++++ .../trivial_bioes_with_boundaries/train.txt | 59 +++++++++++++++++++ tests/test_datasets.py | 48 +++++++++++++++ 5 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt create mode 100644 tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt create mode 100644 tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index b2ab2f45dd..479e9e71e5 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -326,6 +326,8 @@ def __init__( label_name_map: Optional[dict[str, str]] = None, banned_sentences: Optional[list[str]] = None, default_whitespace_after: int = 1, + every_sentence_is_independent: bool = False, + documents_as_sentences: bool = False, **corpusargs, ) -> None: r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 
@@ -361,6 +363,8 @@ def __init__( skip_first_line=skip_first_line, label_name_map=label_name_map, default_whitespace_after=default_whitespace_after, + every_sentence_is_independent=every_sentence_is_independent, + documents_as_sentences=documents_as_sentences, ) for train_file in train_files ] @@ -385,6 +389,8 @@ def __init__( skip_first_line=skip_first_line, label_name_map=label_name_map, default_whitespace_after=default_whitespace_after, + every_sentence_is_independent=every_sentence_is_independent, + documents_as_sentences=documents_as_sentences, ) for test_file in test_files ] @@ -409,6 +415,8 @@ def __init__( skip_first_line=skip_first_line, label_name_map=label_name_map, default_whitespace_after=default_whitespace_after, + every_sentence_is_independent=every_sentence_is_independent, + documents_as_sentences=documents_as_sentences, ) for dev_file in dev_files ] @@ -481,10 +489,12 @@ def __init__( banned_sentences: Optional[list[str]] = None, in_memory: bool = True, document_separator_token: Optional[str] = None, + every_sentence_is_independent: bool = False, encoding: str = "utf-8", skip_first_line: bool = False, label_name_map: Optional[dict[str, str]] = None, default_whitespace_after: int = 1, + documents_as_sentences: bool = False, ) -> None: r"""Instantiates a column dataset. 
@@ -505,9 +515,11 @@ def __init__( self.column_delimiter = re.compile(column_delimiter) self.comment_symbol = comment_symbol self.document_separator_token = document_separator_token + self.every_sentence_is_independent = every_sentence_is_independent self.label_name_map = label_name_map self.banned_sentences = banned_sentences self.default_whitespace_after = default_whitespace_after + self.documents_as_sentences = documents_as_sentences # store either Sentence objects in memory, or only file offsets self.in_memory = in_memory @@ -702,6 +714,9 @@ def _convert_lines_to_sentence( if sentence.to_original_text() == self.document_separator_token: sentence.is_document_boundary = True + if self.every_sentence_is_independent or self.documents_as_sentences: + sentence.is_document_boundary = True + # add span labels if span_level_tag_columns: for span_column in span_level_tag_columns: @@ -818,6 +833,13 @@ def _remap_label(self, tag): return tag def __line_completes_sentence(self, line: str) -> bool: + + if self.documents_as_sentences: + if line.startswith(self.document_separator_token): + return True + else: + return False + sentence_completed = line.isspace() or line == "" return sentence_completed @@ -5035,7 +5057,8 @@ def __init__( test_file=None, column_format=columns, in_memory=in_memory, - sample_missing_splits=False, # No test data is available, so do not shrink dev data for shared task preparation! + sample_missing_splits=False, + # No test data is available, so do not shrink dev data for shared task preparation! 
**corpusargs, ) corpora.append(corpus) diff --git a/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt new file mode 100644 index 0000000000..b741ce5ab7 --- /dev/null +++ b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/dev.txt @@ -0,0 +1,37 @@ +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC + +-DOCSTART- + +this O +is O +Berlin B-LOC + +here O +is O +Berlin B-LOC + +I O +like O +Berlin B-LOC + +we O +like O +Berlin B-LOC \ No newline at end of file diff --git a/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt new file mode 100644 index 0000000000..64a127bd88 --- /dev/null +++ b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/test.txt @@ -0,0 +1,39 @@ +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC + +-DOCSTART- + +this O +is O +Berlin B-LOC + +here O +is O +Berlin B-LOC + +I O +like O +Berlin B-LOC + +we O +like O +Berlin B-LOC + +-DOCSTART- \ No newline at end of file diff --git a/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt new file mode 100644 index 0000000000..4f934bcfd5 --- /dev/null +++ b/tests/resources/tasks/trivial/trivial_bioes_with_boundaries/train.txt @@ -0,0 +1,59 @@ +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC + +-DOCSTART- + +this O +is O +Berlin B-LOC + +here O +is O +Berlin B-LOC + +I O +like O +Berlin B-LOC + +we O +like O +Berlin B-LOC + +-DOCSTART- + +this O +is O +New B-LOC +York I-LOC + +here O +is O +New B-LOC +York I-LOC + +I O +like 
O +New B-LOC +York I-LOC + +we O +like O +New B-LOC +York I-LOC \ No newline at end of file diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 25a99f87e0..8e6a7019b0 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -75,6 +75,54 @@ def test_load_sequence_labeling_data(tasks_base_path): assert len(corpus.test) == 1 +def test_load_sequence_labeling_data_with_boundaries(tasks_base_path): + # get training, test and dev data + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", column_format={0: "text", 1: "ner"} + ) + + assert len(corpus.train) == 14 + assert len(corpus.dev) == 9 + assert len(corpus.test) == 10 + + # now exclude -DOCSTART- sentences + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", + column_format={0: "text", 1: "ner"}, + banned_sentences=["-DOCSTART-"], + ) + + assert len(corpus.train) == 12 + assert len(corpus.dev) == 8 + assert len(corpus.test) == 8 + + assert len(corpus.train[0].right_context(5)) == 5 + + # now load whole documents as sentences + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", + column_format={0: "text", 1: "ner"}, + document_separator_token="-DOCSTART-", + documents_as_sentences=True, + ) + + assert len(corpus.train) == 3 + assert len(corpus.dev) == 2 + assert len(corpus.test) == 2 + + assert len(corpus.train[0].right_context(5)) == 0 + + # ban each boundary but set each sentence to be independent + corpus = flair.datasets.ColumnCorpus( + tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", + column_format={0: "text", 1: "ner"}, + banned_sentences=["-DOCSTART-"], + every_sentence_is_independent=True, + ) + + assert len(corpus.train[0].right_context(5)) == 0 + + def test_load_sequence_labeling_whitespace_after(tasks_base_path): # get training, test and dev data corpus = flair.datasets.ColumnCorpus( From 
06a5c0cad99e7576883d359dddcbc8d54bb01d9e Mon Sep 17 00:00:00 2001 From: alanakbik Date: Sat, 11 Jan 2025 15:42:13 +0100 Subject: [PATCH 2/2] Mypy fix --- flair/datasets/sequence_labeling.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 479e9e71e5..80fc6d38ba 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -521,6 +521,12 @@ def __init__( self.default_whitespace_after = default_whitespace_after self.documents_as_sentences = documents_as_sentences + if documents_as_sentences and not document_separator_token: + log.error( + "documents_as_sentences was set to True, but no document_separator_token was provided. Please set " + "a value for document_separator_token in order to enable the documents_as_sentences functionality." + ) + # store either Sentence objects in memory, or only file offsets self.in_memory = in_memory @@ -834,7 +840,7 @@ def _remap_label(self, tag): def __line_completes_sentence(self, line: str) -> bool: - if self.documents_as_sentences: + if self.documents_as_sentences and self.document_separator_token: if line.startswith(self.document_separator_token): return True else: