Add options to load full documents as Sentence objects #3595

Merged: 2 commits, Jan 11, 2025

31 changes: 30 additions & 1 deletion flair/datasets/sequence_labeling.py
@@ -326,6 +326,8 @@ def __init__(
label_name_map: Optional[dict[str, str]] = None,
banned_sentences: Optional[list[str]] = None,
default_whitespace_after: int = 1,
every_sentence_is_independent: bool = False,
documents_as_sentences: bool = False,
**corpusargs,
) -> None:
r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
@@ -361,6 +363,8 @@ def __init__(
skip_first_line=skip_first_line,
label_name_map=label_name_map,
default_whitespace_after=default_whitespace_after,
every_sentence_is_independent=every_sentence_is_independent,
documents_as_sentences=documents_as_sentences,
)
for train_file in train_files
]
@@ -385,6 +389,8 @@ def __init__(
skip_first_line=skip_first_line,
label_name_map=label_name_map,
default_whitespace_after=default_whitespace_after,
every_sentence_is_independent=every_sentence_is_independent,
documents_as_sentences=documents_as_sentences,
)
for test_file in test_files
]
@@ -409,6 +415,8 @@ def __init__(
skip_first_line=skip_first_line,
label_name_map=label_name_map,
default_whitespace_after=default_whitespace_after,
every_sentence_is_independent=every_sentence_is_independent,
documents_as_sentences=documents_as_sentences,
)
for dev_file in dev_files
]
@@ -481,10 +489,12 @@ def __init__(
banned_sentences: Optional[list[str]] = None,
in_memory: bool = True,
document_separator_token: Optional[str] = None,
every_sentence_is_independent: bool = False,
encoding: str = "utf-8",
skip_first_line: bool = False,
label_name_map: Optional[dict[str, str]] = None,
default_whitespace_after: int = 1,
documents_as_sentences: bool = False,
) -> None:
r"""Instantiates a column dataset.

@@ -505,9 +515,17 @@ def __init__(
self.column_delimiter = re.compile(column_delimiter)
self.comment_symbol = comment_symbol
self.document_separator_token = document_separator_token
self.every_sentence_is_independent = every_sentence_is_independent
self.label_name_map = label_name_map
self.banned_sentences = banned_sentences
self.default_whitespace_after = default_whitespace_after
self.documents_as_sentences = documents_as_sentences

if documents_as_sentences and not document_separator_token:
log.error(
"documents_as_sentences was set to True, but no document_separator_token was provided. Please set "
"a value for document_separator_token in order to enable the documents_as_sentences functionality."
)

# store either Sentence objects in memory, or only file offsets
self.in_memory = in_memory
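Note the guard above: documents_as_sentences is only meaningful together with document_separator_token. A sketch of the misconfiguration it reports (the file path is hypothetical; path_to_column_file and column_name_map follow the existing ColumnDataset constructor):

    from flair.datasets import ColumnDataset

    # logs an error: without a separator token, document boundaries cannot be found
    dataset = ColumnDataset(
        path_to_column_file="resources/tasks/my_ner/train.txt",
        column_name_map={0: "text", 1: "ner"},
        documents_as_sentences=True,  # document_separator_token is missing
    )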
@@ -702,6 +720,9 @@ def _convert_lines_to_sentence(
if sentence.to_original_text() == self.document_separator_token:
sentence.is_document_boundary = True

if self.every_sentence_is_independent or self.documents_as_sentences:
sentence.is_document_boundary = True

# add span labels
if span_level_tag_columns:
for span_column in span_level_tag_columns:
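Marking the boundary here is what stops FLERT-style context expansion at document edges, which the new test verifies through right_context. The observable effect, sketched with the corpus from the earlier example:

    doc = corpus.train[0]
    print(doc.is_document_boundary)  # True once documents_as_sentences is enabled
    print(doc.right_context(5))      # [] for an independent document-level Sentence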
@@ -818,6 +839,13 @@ def _remap_label(self, tag):
return tag

def __line_completes_sentence(self, line: str) -> bool:
# when loading whole documents, only a document separator line completes a "sentence"
if self.documents_as_sentences and self.document_separator_token:
return line.startswith(self.document_separator_token)

return line.isspace() or line == ""
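With documents_as_sentences enabled, blank lines no longer complete a sentence; only a line starting with the separator token does, so everything between two -DOCSTART- markers is accumulated into a single Sentence. A standalone sketch of the predicate above, for illustration only:

    def line_completes_sentence(line: str,
                                documents_as_sentences: bool = True,
                                document_separator_token: str = "-DOCSTART-") -> bool:
        # mirror of ColumnDataset.__line_completes_sentence
        if documents_as_sentences and document_separator_token:
            return line.startswith(document_separator_token)
        return line.isspace() or line == ""

    assert line_completes_sentence("-DOCSTART-") is True
    assert line_completes_sentence("") is False  # blank lines no longer split sentences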

@@ -5035,7 +5063,8 @@ def __init__(
test_file=None,
column_format=columns,
in_memory=in_memory,
sample_missing_splits=False, # No test data is available, so do not shrink dev data for shared task preparation!
sample_missing_splits=False,
# No test data is available, so do not shrink dev data for shared task preparation!
**corpusargs,
)
corpora.append(corpus)
New file (37 additions), a fixture in the trivial_bioes_with_boundaries task folder; by the test counts below, this is the dev split:
@@ -0,0 +1,37 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC
New file (39 additions); by the test counts, the test split of the same fixture folder:
@@ -0,0 +1,39 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC

-DOCSTART-
New file (59 additions); by the test counts, the train split:
@@ -0,0 +1,59 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC

-DOCSTART-

this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC
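A quick sanity check against the assertions below: the 59-line train fixture holds 12 regular sentences plus two -DOCSTART- markers (14 sentences when loaded plainly, 3 documents), the 37-line dev fixture 8 plus one marker (9 and 2), and the 39-line test fixture 8 plus two markers, one of them trailing (10 and 2).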
48 changes: 48 additions & 0 deletions tests/test_datasets.py
@@ -75,6 +75,54 @@ def test_load_sequence_labeling_data(tasks_base_path):
assert len(corpus.test) == 1


def test_load_sequence_labeling_data_with_boundaries(tasks_base_path):
# get training, test and dev data
corpus = flair.datasets.ColumnCorpus(
tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", column_format={0: "text", 1: "ner"}
)

assert len(corpus.train) == 14
assert len(corpus.dev) == 9
assert len(corpus.test) == 10

# now exclude -DOCSTART- sentences
corpus = flair.datasets.ColumnCorpus(
tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
column_format={0: "text", 1: "ner"},
banned_sentences=["-DOCSTART-"],
)

assert len(corpus.train) == 12
assert len(corpus.dev) == 8
assert len(corpus.test) == 8

assert len(corpus.train[0].right_context(5)) == 5

# now load whole documents as sentences
corpus = flair.datasets.ColumnCorpus(
tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
column_format={0: "text", 1: "ner"},
document_separator_token="-DOCSTART-",
documents_as_sentences=True,
)

assert len(corpus.train) == 3
assert len(corpus.dev) == 2
assert len(corpus.test) == 2

assert len(corpus.train[0].right_context(5)) == 0

# ban the boundary sentences but mark every sentence as independent
corpus = flair.datasets.ColumnCorpus(
tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
column_format={0: "text", 1: "ner"},
banned_sentences=["-DOCSTART-"],
every_sentence_is_independent=True,
)

assert len(corpus.train[0].right_context(5)) == 0


def test_load_sequence_labeling_whitespace_after(tasks_base_path):
# get training, test and dev data
corpus = flair.datasets.ColumnCorpus(
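As a closing illustration, a sketch of loading the fixture above and inspecting the resulting document-level Sentences (it mirrors the new test; tasks_base_path points at the test resource folder, as elsewhere in the suite):

    import flair.datasets

    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        document_separator_token="-DOCSTART-",
        documents_as_sentences=True,
    )
    doc = corpus.train[0]
    print(len(doc))              # token count of the entire first document
    print(doc.right_context(5))  # [] since no context crosses document boundaries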