Replace latexcodec with pylatexenc, using braces-all mode (#4284)

mbollmann · web-flow · commit 75a6a5ba4479 · 2025-01-01T21:56:24.000+01:00
* Replace latexcodec with pylatexenc, using braces-all mode

* Define conversion rules for i+diacritic
diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Changed
+
+- LaTeX encoding now uses [pylatexenc](https://pylatexenc.readthedocs.io/) instead of latexcodec, and wraps all macros in braces. This should address problems with BibTeX handling; see [#4280](https://github.com/acl-org/acl-anthology/issues/4280).
+
 ## [0.5.0] — 2024-12-25
 
 This release is intended to be feature-complete with regard to generating the entire ACL Anthology website.
diff --git a/python/acl_anthology/utils/latex.py b/python/acl_anthology/utils/latex.py
@@ -14,10 +14,9 @@
 
 from __future__ import annotations
 
-import codecs
 import re
 from functools import lru_cache
-from typing import Optional, TypeAlias, TYPE_CHECKING
+from typing import cast, Optional, TypeAlias, TYPE_CHECKING
 
 if TYPE_CHECKING:
     from ..people.name import NameSpecification
@@ -26,8 +25,33 @@
     SerializableAsBibTeX: TypeAlias = None | str | MarkupText | list[NameSpecification]
     """Any type that can be supplied to `make_bibtex_entry`."""
 
-import latexcodec  # noqa: F401
 
+from pylatexenc.latexencode import (
+    UnicodeToLatexEncoder,
+    UnicodeToLatexConversionRule,
+    RULE_DICT,
+)
+
+LATEXENC = UnicodeToLatexEncoder(
+    conversion_rules=[
+        UnicodeToLatexConversionRule(
+            RULE_DICT,
+            {
+                ord("’"): "'",  # defaults to \textquoteright
+                ord("–"): "--",  # defaults to \textendash
+                ord("—"): "---",  # defaults to \textemdash
+                ord("í"): "\\'i",  # defaults to using dotless \i
+                ord("ì"): "\\`i",
+                ord("î"): "\\^i",
+                ord("ï"): '\\"i',
+            },
+        ),
+        "defaults",
+    ],
+    replacement_latex_protection="braces-all",
+    unknown_char_policy="keep",
+    unknown_char_warning=False,
+)
 
 BIBTEX_FIELD_NEEDS_ENCODING = {"journal", "address", "publisher", "note"}
 """Any BibTeX field whose value should be LaTeX-encoded first."""
@@ -48,8 +72,10 @@
 }
 """A mapping of month names to BibTeX macros."""
 
-RE_OPENING_QUOTE = re.compile(r"(?<!\\)\"\b")
-RE_CLOSING_QUOTE = re.compile(r"(?<!\\)\"")
+RE_OPENING_QUOTE_DOUBLE = re.compile(r"(?<!\\)({''}|'')\b")
+RE_OPENING_QUOTE_SINGLE = re.compile(r"(?<!\\)({'}|')\b")
+RE_CLOSING_QUOTE_DOUBLE = re.compile(r"(?<!\\){''}")
+RE_CLOSING_QUOTE_SINGLE = re.compile(r"(?<!\\){'}")
 RE_HYPHENS_BETWEEN_NUMBERS = re.compile(r"(?<=[0-9])(-|–|—)(?=[0-9])")
 
 
@@ -100,7 +126,7 @@ def latex_encode(text: Optional[str]) -> str:
     """
     if text is None:
         return ""
-    text = str(codecs.encode(text, "ulatex+ascii", "keep"))
+    text = cast(str, LATEXENC.unicode_to_latex(text))
     return text
 
 
@@ -110,14 +136,16 @@ def latex_convert_quotes(text: str) -> str:
         text: An arbitrary string.
 
     Returns:
-        The input string with regular quotes converted into LaTeX quotes.
+        The input string with straight LaTeX quotes (`'` and `''`, optionally wrapped in braces) converted into proper opening and closing quotes; braces wrapped directly around a quote are removed.
 
     Examples:
-        >>> latex_convert_quotes('This "great" example')
+        >>> latex_convert_quotes("This {''}great{''} example")
         "This ``great'' example"
     """
-    text = RE_OPENING_QUOTE.sub("``", text)
-    text = RE_CLOSING_QUOTE.sub("''", text)
+    text = RE_OPENING_QUOTE_DOUBLE.sub("``", text)
+    text = RE_OPENING_QUOTE_SINGLE.sub("`", text)
+    text = RE_CLOSING_QUOTE_DOUBLE.sub("''", text)
+    text = RE_CLOSING_QUOTE_SINGLE.sub("'", text)
     return text
 
 
diff --git a/python/poetry.lock b/python/poetry.lock
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -11,7 +11,7 @@ module = 'TexSoup.*'
 ignore_missing_imports = true
 
 [[tool.mypy.overrides]]
-module = 'latexcodec.*'
+module = 'pylatexenc.*'
 ignore_missing_imports = true
 
 [[tool.mypy.overrides]]
@@ -59,7 +59,6 @@ classifiers = [
 [tool.poetry.dependencies]
 python = ">=3.10,<3.11.0 || >3.11.0,<3.13"
 docopt = "^0.6.2"
-latexcodec = "^2.0.1"
 lxml = "^4.9.2"
 PyYAML = "^6.0"
 app-paths = "^0.0.7"
@@ -77,6 +76,7 @@ gitpython = "^3.1.37"
 rnc2rng = "^2.6.6"
 citeproc-py = "^0.6.0"
 langcodes = {extras = ["data"], version = "^3.5.0"}
+pylatexenc = "^2.10"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py
@@ -187,7 +187,7 @@ def test_paper_roundtrip_xml(xml):
         True,
         """@inproceedings{alvarez-mellado-lignos-2022-detecting,
     title = "Detecting Unassimilated Borrowings in {S}panish: {A}n Annotated Corpus and Approaches to Modeling",
-    author = "\\'Alvarez-Mellado, Elena  and
+    author = "{\\'A}lvarez-Mellado, Elena  and
       Lignos, Constantine",
     editor = "Muresan, Smaranda  and
       Nakov, Preslav  and
@@ -200,15 +200,15 @@ def test_paper_roundtrip_xml(xml):
     url = "https://aclanthology.org/2022.acl-long.268/",
     doi = "10.18653/v1/2022.acl-long.268",
     pages = "3868--3888",
-    abstract = "This work presents a new resource for borrowing identification and analyzes the performance and errors of several models on this task. We introduce a new annotated corpus of Spanish newswire rich in unassimilated lexical borrowings---words from one language that are introduced into another without orthographic adaptation---and use it to evaluate how several sequence labeling models (CRF, BiLSTM-CRF, and Transformer-based models) perform. The corpus contains 370,000 tokens and is larger, more borrowing-dense, OOV-rich, and topic-varied than previous corpora available for this task. Our results show that a BiLSTM-CRF model fed with subword embeddings along with either Transformer-based embeddings pretrained on codeswitched data or a combination of contextualized word embeddings outperforms results obtained by a multilingual BERT-based model."
+    abstract = "This work presents a new resource for borrowing identification and analyzes the performance and errors of several models on this task. We introduce a new annotated corpus of Spanish newswire rich in unassimilated lexical borrowings{---}words from one language that are introduced into another without orthographic adaptation{---}and use it to evaluate how several sequence labeling models (CRF, BiLSTM-CRF, and Transformer-based models) perform. The corpus contains 370,000 tokens and is larger, more borrowing-dense, OOV-rich, and topic-varied than previous corpora available for this task. Our results show that a BiLSTM-CRF model fed with subword embeddings along with either Transformer-based embeddings pretrained on codeswitched data or a combination of contextualized word embeddings outperforms results obtained by a multilingual BERT-based model."
 }""",
     ),
     (
         "2022.acl-long.268",
         False,
         """@inproceedings{alvarez-mellado-lignos-2022-detecting,
     title = "Detecting Unassimilated Borrowings in {S}panish: {A}n Annotated Corpus and Approaches to Modeling",
-    author = "\\'Alvarez-Mellado, Elena  and
+    author = "{\\'A}lvarez-Mellado, Elena  and
       Lignos, Constantine",
     editor = "Muresan, Smaranda  and
       Nakov, Preslav  and
@@ -243,7 +243,7 @@ def test_paper_roundtrip_xml(xml):
         True,
         """@article{oshaughnessy-1989-parsing,
     title = "Parsing with a Small Dictionary for Applications such as Text to Speech",
-    author = "O'Shaughnessy, Douglas D.",
+    author = "O{'}Shaughnessy, Douglas D.",
     editor = "Allen, James F.",
     journal = "Computational Linguistics",
     volume = "15",
diff --git a/python/tests/people/name_test.py b/python/tests/people/name_test.py
@@ -228,4 +228,4 @@ def test_name_from_any():
 
 def test_name_as_bibtex():
     n1 = Name.from_string("André Rieu")
-    assert n1.as_bibtex() == "Rieu, Andr\\'e"
+    assert n1.as_bibtex() == "Rieu, Andr{\\'e}"
diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py
@@ -62,7 +62,7 @@
         {
             "text": "Workshop on Topic A & B",
             "html": "Workshop on Topic A &amp; B",
-            "latex": "Workshop on Topic A \\& B",
+            "latex": "Workshop on Topic A {\\&} B",
         },
     ),
     (
@@ -110,9 +110,31 @@
         {
             "text": "Äöøéÿőßû–",
             "html": "Äöøéÿőßû–",
-            # this is what the modified latexcodec from the acl-anthology repo produces:
-            # "latex": '{\\"A}{\\"o}{\\o}{\\\'e}{\\"y}{\\H{o}}{\\ss}{\\^u}{--}',
-            "latex": '\\"A\\"o\\o \\\'e\\"y\\H o\\ss \\^u--',
+            "latex": '{\\"A}{\\"o}{\\o}{\\\'e}{\\"y}{\\H{o}}{\\ss}{\\^u}{--}',
+        },
+    ),
+    (
+        "Hajič, Jan and Woźniak, Michał",
+        {
+            "text": "Hajič, Jan and Woźniak, Michał",
+            "html": "Hajič, Jan and Woźniak, Michał",
+            "latex": "Haji{\\v{c}}, Jan and Wo{\\'z}niak, Micha{\\l}",
+        },
+    ),
+    (
+        "Žabokrtský, Zdeněk and Ševčíková, Magda",
+        {
+            "text": "Žabokrtský, Zdeněk and Ševčíková, Magda",
+            "html": "Žabokrtský, Zdeněk and Ševčíková, Magda",
+            "latex": "{\\v{Z}}abokrtsk{\\'y}, Zden{\\v{e}}k and {\\v{S}}ev{\\v{c}}{\\'i}kov{\\'a}, Magda",
+        },
+    ),
+    (
+        "íìïîı ÍÌÏÎİ",
+        {
+            "text": "íìïîı ÍÌÏÎİ",
+            "html": "íìïîı ÍÌÏÎİ",
+            "latex": "{\\'i}{\\`i}{\\\"i}{\\^i}{\\i} {\\'I}{\\`I}{\\\"I}{\\^I}{\\.I}",
         },
     ),
     (
diff --git a/python/tests/utils/latex_test.py b/python/tests/utils/latex_test.py
@@ -18,10 +18,10 @@
 from acl_anthology.utils import latex
 
 test_cases_latex = (
-    ('"This is a quotation."', "``This is a quotation.''"),
-    ('This is a "quotation".', "This is a ``quotation''."),
-    ('Can you "please" "convert" this?', "Can you ``please'' ``convert'' this?"),
-    ('My name is "陳大文".', "My name is ``陳大文''."),
+    ("{''}This is a quotation.{''}", "``This is a quotation.''"),
+    ("This is a {''}quotation{''}.", "This is a ``quotation''."),
+    ("Can you 'please' {'}convert{'} this?", "Can you `please' `convert' this?"),
+    ("My name is ''陳大文''.", "My name is ``陳大文''."),
 )
 
 
@@ -36,7 +36,8 @@ def test_namespecs_to_bibtex():
     assert latex.namespecs_to_bibtex([]) == ""
     assert latex.namespecs_to_bibtex([ns1]) == "Chan, Tai Man"
     assert (
-        latex.namespecs_to_bibtex([ns1, ns2]) == "Chan, Tai Man  and\n      Do\\'e, John"
+        latex.namespecs_to_bibtex([ns1, ns2])
+        == "Chan, Tai Man  and\n      Do{\\'e}, John"
     )
 
 
@@ -66,18 +67,18 @@ def test_make_bibtex_entry():
         ("editor", []),
         ("title", MarkupText.from_string("Thé Papér")),
         ("booktitle", MarkupText.from_string('My "Conference"')),
-        ("address", '"Montréal"'),
+        ("address", "Montréal"),
         ("doi", "10.000.a_b_c"),
         ("publisher", ""),
         ("month", "February"),
         ("note", None),
         ("pages", "1–7"),
     ]
     expected = """@inproceedings{my-entry,
-    author = "Do\\'e, John",
-    title = "Th\\'e Pap\\'er",
+    author = "Do{\\'e}, John",
+    title = "Th{\\'e} Pap{\\'e}r",
     booktitle = "My ``Conference''",
-    address = {"Montr\\'eal"},
+    address = "Montr{\\'e}al",
     doi = "10.000.a_b_c",
     month = feb,
     pages = "1--7"