Skip to content

Commit

Permalink
#4 - Add Levenshtein string matching recommender
Browse files (browse the repository at this point in the history)
- Fix error when annotation has no label
- Use most linked label as prediction
  • Loading branch information
jcklie committed Nov 25, 2019
1 parent b4cfede commit 19f16f7
Showing 1 changed file with 21 additions and 4 deletions.
25 changes: 21 additions & 4 deletions ariadne/contrib/stringmatcher.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
import logging
from collections import defaultdict
from itertools import chain
from typing import List

Expand All @@ -24,14 +25,28 @@ def fit(self, documents: List[TrainingDocument], layer: str, feature: str, proje
mentions = []
labels = []

counts = defaultdict(lambda: defaultdict(int))

for document in documents:
cas = document.cas
for annotation in cas.select(layer):
mention = annotation.get_covered_text()
mention = annotation.get_covered_text().lower()
label = getattr(annotation, feature)

mentions.append(mention)
labels.append(label)
if not label:
label = ""

counts[mention][label] += 1

# Just use the entity that was most often linked with this mention
for mention, candidates in counts.items():
if candidates:
label = max(candidates, key=candidates.get)
else:
label = ""

mentions.append(mention)
labels.append(label)

le = LabelEncoder()
le.fit(labels)
Expand Down Expand Up @@ -72,5 +87,7 @@ def _generate_candidates(self, cas: Cas, n: int):
yield (begin, end, text)

def _get_fst_path(self, user_id: str) -> str:
p = self.model_directory / self.name / f"model_{user_id}.fst"
parent = self.model_directory / self.name
parent.mkdir(exist_ok=True, parents=True)
p = parent / f"model_{user_id}.fst"
return str(p)

0 comments on commit 19f16f7

Please sign in to comment.