Merge branch 'master' into filter_relations
alanakbik authored Feb 4, 2025
2 parents cbd8be3 + ae592bf commit 863d903
Showing 10 changed files with 561 additions and 41 deletions.
9 changes: 7 additions & 2 deletions flair/class_utils.py
@@ -2,12 +2,17 @@
 import inspect
 from collections.abc import Iterable
 from types import ModuleType
-from typing import Any, Optional, TypeVar, Union, overload
+from typing import Any, Iterable, List, Optional, Protocol, Type, TypeVar, Union, overload


 T = TypeVar("T")


-def get_non_abstract_subclasses(cls: type[T]) -> Iterable[type[T]]:
+class StringLike(Protocol):
+    def __str__(self) -> str: ...
+
+
+def get_non_abstract_subclasses(cls: Type[T]) -> Iterable[Type[T]]:
     for subclass in cls.__subclasses__():
         yield from get_non_abstract_subclasses(subclass)
         if inspect.isabstract(subclass):
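The new StringLike protocol enables structural typing: any object that implements __str__ satisfies it, with no explicit subclassing required. A minimal usage sketch (not part of the commit; the render helper is hypothetical):

```python
from flair.class_utils import StringLike


def render(value: StringLike) -> str:
    # str, pathlib.Path, and flair data points all provide __str__,
    # so each of them type-checks as StringLike.
    return str(value)


print(render("hello"))
```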
2 changes: 1 addition & 1 deletion flair/data.py
@@ -565,7 +565,7 @@ def __init__(
         head_id: Optional[int] = None,
         whitespace_after: int = 1,
         start_position: int = 0,
-        sentence=None,
+        sentence: Optional["Sentence"] = None,
     ) -> None:
         super().__init__(sentence=sentence)

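This change only adds a type annotation to an existing parameter, so runtime behavior is unchanged; type checkers can now verify that the argument is a Sentence (or None). A small sketch, assuming the signature above belongs to Token.__init__ (which its parameters suggest; the make_token helper is hypothetical):

```python
from typing import Optional

from flair.data import Sentence, Token


def make_token(text: str, sentence: Optional[Sentence] = None) -> Token:
    # Mirrors the newly annotated parameter: passing anything other than
    # a Sentence (or None) is now flagged by a type checker.
    return Token(text, sentence=sentence)


print(make_token("Ludwig").text)
```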
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -185,6 +185,7 @@
     NER_ARABIC_ANER,
     NER_ARABIC_AQMAR,
     NER_BASQUE,
+    NER_BAVARIAN_WIKI,
     NER_CHINESE_WEIBO,
     NER_DANISH_DANE,
     NER_ENGLISH_MOVIE_COMPLEX,
@@ -477,6 +478,7 @@
     "NER_ARABIC_ANER",
     "NER_ARABIC_AQMAR",
     "NER_BASQUE",
+    "NER_BAVARIAN_WIKI",
     "NER_CHINESE_WEIBO",
     "NER_DANISH_DANE",
     "NER_ENGLISH_MOVIE_COMPLEX",
91 changes: 91 additions & 0 deletions flair/datasets/sequence_labeling.py
@@ -5530,3 +5530,94 @@ def __init__(
             corpora,
             name="masakha-pos-" + "-".join(languages),
         )
+
+
+class NER_BAVARIAN_WIKI(ColumnCorpus):
+    def __init__(
+        self,
+        fine_grained: bool = False,
+        revision: str = "main",
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the Bavarian NER Dataset (BarNER).
+
+        The dataset was proposed in the 2024 LREC-COLING paper
+        "Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data" by Peng et al.
+
+        :param fine_grained: Defines whether the fine-grained or coarse-grained (default) label set should be used.
+        :param revision: Defines the revision/commit of the BarNER dataset; by default, the dataset from the 'main' branch is used.
+        :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
+            to point to a different folder, but typically this should not be necessary.
+        :param in_memory: If True, keeps the dataset in memory, giving speedups in training.
+        """
+        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
+        dataset_name = self.__class__.__name__.lower()
+        data_folder = base_path / dataset_name
+        data_path = flair.cache_root / "datasets" / dataset_name
+
+        document_boundary_marker = "-DOCSTART-"
+
+        for split in ["train", "dev", "test"]:
+            # Get the original version
+            original_split_filename = data_path / "original" / f"bar-wiki-{split}.tsv"
+            if not original_split_filename.is_file():
+                original_split_url = (
+                    f"https://raw.githubusercontent.com/mainlp/BarNER/{revision}/data/BarNER-final/bar-wiki-{split}.tsv"
+                )
+                cached_path(original_split_url, data_path / "original")
+
+            # Add document boundary marker
+            modified_split_filename = data_path / f"bar-wiki-{split}.tsv"
+            if not modified_split_filename.is_file():
+                f_out = open(modified_split_filename, "w", encoding="utf-8")
+
+                with open(original_split_filename, encoding="utf-8") as f_p:
+                    for line in f_p:
+                        line = line.strip()
+                        if line.startswith("# newdoc id = "):
+                            f_out.write(f"{document_boundary_marker}\tO\n\n")
+                            continue
+                        if line.startswith("# "):
+                            continue
+                        f_out.write(f"{line}\n")
+                f_out.close()
+
+        columns = {0: "text", 1: "ner"}
+
+        label_name_map = None
+
+        if not fine_grained:
+            # The only allowed classes in the coarse-grained setting are PER, LOC, ORG and MISC.
+            # All other NEs are normalized to O, except EVENT and WOA, which are normalized to MISC (cf. Table 3 of the paper).
+            label_name_map = {
+                "EVENT": "MISC",
+                "EVENTderiv": "O",
+                "EVENTpart": "O",
+                "LANG": "O",
+                "LANGderiv": "O",
+                "LANGpart": "O",
+                "LOCderiv": "O",
+                "LOCpart": "O",
+                "MISCderiv": "O",
+                "MISCpart": "O",
+                "ORGderiv": "O",
+                "ORGpart": "O",
+                "PERderiv": "O",
+                "PERpart": "O",
+                "RELIGION": "O",
+                "RELIGIONderiv": "O",
+                "WOA": "MISC",
+                "WOAderiv": "O",
+                "WOApart": "O",
+            }
+
+        super().__init__(
+            data_folder,
+            columns,
+            in_memory=in_memory,
+            comment_symbol="# ",
+            document_separator_token="-DOCSTART-",
+            label_name_map=label_name_map,
+            **corpusargs,
+        )
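A quick usage sketch for the new corpus class (not part of the commit): on first use it downloads BarNER from the mainlp/BarNER repository and caches it under flair's cache root, and with the default fine_grained=False the label_name_map above collapses the tag set to PER, LOC, ORG and MISC:

```python
from flair.datasets import NER_BAVARIAN_WIKI

# Auto-downloads and caches the dataset on first call.
corpus = NER_BAVARIAN_WIKI(fine_grained=False)
print(corpus)  # train/dev/test sizes

# Inspect the coarse-grained tag dictionary after label mapping.
tag_dictionary = corpus.make_label_dictionary(label_type="ner")
print(tag_dictionary)
```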
8 changes: 7 additions & 1 deletion flair/models/multitask_model.py
@@ -164,6 +164,7 @@ def evaluate(  # type: ignore[override]
         main_score = 0.0
         all_detailed_results = ""
         all_classification_report: dict[str, dict[str, Any]] = {}
+        scores: dict[Any, float] = {}

         for task_id, split in batch_split.items():
             result = self.tasks[task_id].evaluate(
@@ -194,7 +195,12 @@
             )
             all_classification_report[task_id] = result.classification_report

-        scores = {"loss": loss.item() / len(batch_split)}
+            # Add metrics so they will be available to _publish_eval_result.
+            for avg_type in ("micro avg", "macro avg"):
+                for metric_type in ("f1-score", "precision", "recall"):
+                    scores[(task_id, avg_type, metric_type)] = result.classification_report[avg_type][metric_type]
+
+        scores["loss"] = loss.item() / len(batch_split)

         return Result(
             main_score=main_score / len(batch_split),
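With this change, the returned scores no longer carry only the loss: per-task micro/macro precision, recall and F1 are stored under (task_id, avg_type, metric_type) tuple keys. A hedged sketch of reading them back (not from the commit; "ner" and the "Task_0" task id are placeholders for whatever the MultitaskModel was configured with):

```python
# Assuming `model` is a trained MultitaskModel and `corpus` a matching corpus.
result = model.evaluate(corpus.test, gold_label_type="ner")

# The averaged loss keeps its plain string key ...
print(result.scores["loss"])

# ... while per-task metrics use (task_id, avg_type, metric_type) tuple keys.
print(result.scores[("Task_0", "micro avg", "f1-score")])
```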