Skip to content

Commit

Permalink
Fix name scoring tweaking
Browse files: browse the repository at this point in the history
  • Loading branch information
mbollmann committed Dec 23, 2024
1 parent d21ab3e commit 28651fd
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
6 changes: 3 additions & 3 deletions python/acl_anthology/people/name.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ def as_bibtex(self) -> str:
"""
return latex_encode(self.as_last_first())

def score(self) -> int:
def score(self) -> float:
"""
Returns:
A score for this name that is intended for comparing different names that generate the same ID. Names that are more likely to be the correct canonical variant should return higher scores via this function.
"""
name = self.as_first_last()
# Prefer longer variants
score = len(name)
score = float(len(name))
# Prefer variants with non-ASCII characters
score += sum((ord(c) > 127) for c in name)
# Penalize upper-case characters after word boundaries
Expand All @@ -103,7 +103,7 @@ def score(self) -> int:
# intended to make a difference when a person has both "C, A B" and "B
# C, A" as names)
if self.first is not None and len(self.first) > len(self.last):
score -= 1
score += 0.5
return score

def slugify(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion python/tests/people/name_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def test_name_scoring():
def test_name_scoring_first_vs_last_name():
n1 = Name("Chan Tai", "Man")
n2 = Name("Chan", "Tai Man")
assert n1.score() < n2.score()
assert n1.score() > n2.score()


def test_name_from_string():
Expand Down
6 changes: 6 additions & 0 deletions python/tests/people/personindex_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ def test_build_personindex_automatically(index_with_full_anthology):
assert len(persons) == 1


def test_canonical_name_is_never_a_variant(index_with_full_anthology):
    """Check that no person's canonical name has a script set.

    Walks every person in the index supplied by the
    ``index_with_full_anthology`` fixture and asserts that
    ``canonical_name.script`` is ``None``.
    """
    # NOTE(review): presumably only name variants carry a script value;
    # confirm against the Name class.
    people = index_with_full_anthology
    for entry in people.values():
        assert entry.canonical_name.script is None


def test_get_person_coauthors(index_with_full_anthology):
index = index_with_full_anthology
person = index.get_by_name(Name("Kathleen", "Dahlgren"))[0]
Expand Down

0 comments on commit 28651fd

Please sign in to comment.