Skip to content

Commit 75a6a5b

Browse files
authored
Replace latexcodec with pylatexenc, using braces-all mode (#4284)
* Replace latexcodec with pylatexenc, using braces-all mode
* Define conversion rules for i+diacritic
1 parent 42b0eae commit 75a6a5b

File tree

8 files changed

+96
-45
lines changed

8 files changed

+96
-45
lines changed

python/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## [Unreleased]
44

5+
### Changed
6+
7+
- LaTeX encoding now uses [pylatexenc](https://pylatexenc.readthedocs.io/) instead of latexcodec and wraps all macros in braces. This should address problems with BibTeX handling; see [#4280](https://github.com/acl-org/acl-anthology/issues/4280).
8+
59
## [0.5.0] — 2024-12-25
610

711
This release is intended to be feature-complete with regard to generating the entire ACL Anthology website.

python/acl_anthology/utils/latex.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,9 @@
1414

1515
from __future__ import annotations
1616

17-
import codecs
1817
import re
1918
from functools import lru_cache
20-
from typing import Optional, TypeAlias, TYPE_CHECKING
19+
from typing import cast, Optional, TypeAlias, TYPE_CHECKING
2120

2221
if TYPE_CHECKING:
2322
from ..people.name import NameSpecification
@@ -26,8 +25,33 @@
2625
SerializableAsBibTeX: TypeAlias = None | str | MarkupText | list[NameSpecification]
2726
"""Any type that can be supplied to `make_bibtex_entry`."""
2827

29-
import latexcodec # noqa: F401
3028

29+
from pylatexenc.latexencode import (
30+
UnicodeToLatexEncoder,
31+
UnicodeToLatexConversionRule,
32+
RULE_DICT,
33+
)
34+
35+
LATEXENC = UnicodeToLatexEncoder(
36+
conversion_rules=[
37+
UnicodeToLatexConversionRule(
38+
RULE_DICT,
39+
{
40+
ord("’"): "'", # defaults to \textquoteright
41+
ord("–"): "--", # defaults to \textendash
42+
ord("—"): "---", # defaults to \textemdash
43+
ord("í"): "\\'i", # defaults to using dotless \i
44+
ord("ì"): "\\`i",
45+
ord("î"): "\\^i",
46+
ord("ï"): '\\"i',
47+
},
48+
),
49+
"defaults",
50+
],
51+
replacement_latex_protection="braces-all",
52+
unknown_char_policy="keep",
53+
unknown_char_warning=False,
54+
)
3155

3256
BIBTEX_FIELD_NEEDS_ENCODING = {"journal", "address", "publisher", "note"}
3357
"""Any BibTeX field whose value should be LaTeX-encoded first."""
@@ -48,8 +72,10 @@
4872
}
4973
"""A mapping of month names to BibTeX macros."""
5074

51-
RE_OPENING_QUOTE = re.compile(r"(?<!\\)\"\b")
52-
RE_CLOSING_QUOTE = re.compile(r"(?<!\\)\"")
75+
RE_OPENING_QUOTE_DOUBLE = re.compile(r"(?<!\\)({''}|'')\b")
76+
RE_OPENING_QUOTE_SINGLE = re.compile(r"(?<!\\)({'}|')\b")
77+
RE_CLOSING_QUOTE_DOUBLE = re.compile(r"(?<!\\){''}")
78+
RE_CLOSING_QUOTE_SINGLE = re.compile(r"(?<!\\){'}")
5379
RE_HYPHENS_BETWEEN_NUMBERS = re.compile(r"(?<=[0-9])(-|–|—)(?=[0-9])")
5480

5581

@@ -100,7 +126,7 @@ def latex_encode(text: Optional[str]) -> str:
100126
"""
101127
if text is None:
102128
return ""
103-
text = str(codecs.encode(text, "ulatex+ascii", "keep"))
129+
text = cast(str, LATEXENC.unicode_to_latex(text))
104130
return text
105131

106132

@@ -110,14 +136,16 @@ def latex_convert_quotes(text: str) -> str:
110136
text: An arbitrary string.
111137
112138
Returns:
113-
The input string with regular quotes converted into LaTeX quotes.
139+
The input string with LaTeX quotes converted into proper opening and closing quotes, removing braces around them, if necessary.
114140
115141
Examples:
116-
>>> latex_convert_quotes('This "great" example')
142+
>>> latex_convert_quotes("This {''}great{''} example")
117143
"This ``great'' example"
118144
"""
119-
text = RE_OPENING_QUOTE.sub("``", text)
120-
text = RE_CLOSING_QUOTE.sub("''", text)
145+
text = RE_OPENING_QUOTE_DOUBLE.sub("``", text)
146+
text = RE_OPENING_QUOTE_SINGLE.sub("`", text)
147+
text = RE_CLOSING_QUOTE_DOUBLE.sub("''", text)
148+
text = RE_CLOSING_QUOTE_SINGLE.sub("'", text)
121149
return text
122150

123151

python/poetry.lock

Lines changed: 11 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ module = 'TexSoup.*'
1111
ignore_missing_imports = true
1212

1313
[[tool.mypy.overrides]]
14-
module = 'latexcodec.*'
14+
module = 'pylatexenc.*'
1515
ignore_missing_imports = true
1616

1717
[[tool.mypy.overrides]]
@@ -59,7 +59,6 @@ classifiers = [
5959
[tool.poetry.dependencies]
6060
python = ">=3.10,<3.11.0 || >3.11.0,<3.13"
6161
docopt = "^0.6.2"
62-
latexcodec = "^2.0.1"
6362
lxml = "^4.9.2"
6463
PyYAML = "^6.0"
6564
app-paths = "^0.0.7"
@@ -77,6 +76,7 @@ gitpython = "^3.1.37"
7776
rnc2rng = "^2.6.6"
7877
citeproc-py = "^0.6.0"
7978
langcodes = {extras = ["data"], version = "^3.5.0"}
79+
pylatexenc = "^2.10"
8080

8181
[tool.poetry.group.dev.dependencies]
8282
black = "^24.3.0"

python/tests/collections/paper_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def test_paper_roundtrip_xml(xml):
187187
True,
188188
"""@inproceedings{alvarez-mellado-lignos-2022-detecting,
189189
title = "Detecting Unassimilated Borrowings in {S}panish: {A}n Annotated Corpus and Approaches to Modeling",
190-
author = "\\'Alvarez-Mellado, Elena and
190+
author = "{\\'A}lvarez-Mellado, Elena and
191191
Lignos, Constantine",
192192
editor = "Muresan, Smaranda and
193193
Nakov, Preslav and
@@ -200,15 +200,15 @@ def test_paper_roundtrip_xml(xml):
200200
url = "https://aclanthology.org/2022.acl-long.268/",
201201
doi = "10.18653/v1/2022.acl-long.268",
202202
pages = "3868--3888",
203-
abstract = "This work presents a new resource for borrowing identification and analyzes the performance and errors of several models on this task. We introduce a new annotated corpus of Spanish newswire rich in unassimilated lexical borrowings---words from one language that are introduced into another without orthographic adaptation---and use it to evaluate how several sequence labeling models (CRF, BiLSTM-CRF, and Transformer-based models) perform. The corpus contains 370,000 tokens and is larger, more borrowing-dense, OOV-rich, and topic-varied than previous corpora available for this task. Our results show that a BiLSTM-CRF model fed with subword embeddings along with either Transformer-based embeddings pretrained on codeswitched data or a combination of contextualized word embeddings outperforms results obtained by a multilingual BERT-based model."
203+
abstract = "This work presents a new resource for borrowing identification and analyzes the performance and errors of several models on this task. We introduce a new annotated corpus of Spanish newswire rich in unassimilated lexical borrowings{---}words from one language that are introduced into another without orthographic adaptation{---}and use it to evaluate how several sequence labeling models (CRF, BiLSTM-CRF, and Transformer-based models) perform. The corpus contains 370,000 tokens and is larger, more borrowing-dense, OOV-rich, and topic-varied than previous corpora available for this task. Our results show that a BiLSTM-CRF model fed with subword embeddings along with either Transformer-based embeddings pretrained on codeswitched data or a combination of contextualized word embeddings outperforms results obtained by a multilingual BERT-based model."
204204
}""",
205205
),
206206
(
207207
"2022.acl-long.268",
208208
False,
209209
"""@inproceedings{alvarez-mellado-lignos-2022-detecting,
210210
title = "Detecting Unassimilated Borrowings in {S}panish: {A}n Annotated Corpus and Approaches to Modeling",
211-
author = "\\'Alvarez-Mellado, Elena and
211+
author = "{\\'A}lvarez-Mellado, Elena and
212212
Lignos, Constantine",
213213
editor = "Muresan, Smaranda and
214214
Nakov, Preslav and
@@ -243,7 +243,7 @@ def test_paper_roundtrip_xml(xml):
243243
True,
244244
"""@article{oshaughnessy-1989-parsing,
245245
title = "Parsing with a Small Dictionary for Applications such as Text to Speech",
246-
author = "O'Shaughnessy, Douglas D.",
246+
author = "O{'}Shaughnessy, Douglas D.",
247247
editor = "Allen, James F.",
248248
journal = "Computational Linguistics",
249249
volume = "15",

python/tests/people/name_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,4 +228,4 @@ def test_name_from_any():
228228

229229
def test_name_as_bibtex():
230230
n1 = Name.from_string("André Rieu")
231-
assert n1.as_bibtex() == "Rieu, Andr\\'e"
231+
assert n1.as_bibtex() == "Rieu, Andr{\\'e}"

python/tests/text/markuptext_test.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
{
6363
"text": "Workshop on Topic A & B",
6464
"html": "Workshop on Topic A &amp; B",
65-
"latex": "Workshop on Topic A \\& B",
65+
"latex": "Workshop on Topic A {\\&} B",
6666
},
6767
),
6868
(
@@ -110,9 +110,31 @@
110110
{
111111
"text": "Äöøéÿőßû–",
112112
"html": "Äöøéÿőßû–",
113-
# this is what the modified latexcodec from the acl-anthology repo produces:
114-
# "latex": '{\\"A}{\\"o}{\\o}{\\\'e}{\\"y}{\\H{o}}{\\ss}{\\^u}{--}',
115-
"latex": '\\"A\\"o\\o \\\'e\\"y\\H o\\ss \\^u--',
113+
"latex": '{\\"A}{\\"o}{\\o}{\\\'e}{\\"y}{\\H{o}}{\\ss}{\\^u}{--}',
114+
},
115+
),
116+
(
117+
"Hajič, Jan and Woźniak, Michał",
118+
{
119+
"text": "Hajič, Jan and Woźniak, Michał",
120+
"html": "Hajič, Jan and Woźniak, Michał",
121+
"latex": "Haji{\\v{c}}, Jan and Wo{\\'z}niak, Micha{\\l}",
122+
},
123+
),
124+
(
125+
"Žabokrtský, Zdeněk and Ševčíková, Magda",
126+
{
127+
"text": "Žabokrtský, Zdeněk and Ševčíková, Magda",
128+
"html": "Žabokrtský, Zdeněk and Ševčíková, Magda",
129+
"latex": "{\\v{Z}}abokrtsk{\\'y}, Zden{\\v{e}}k and {\\v{S}}ev{\\v{c}}{\\'i}kov{\\'a}, Magda",
130+
},
131+
),
132+
(
133+
"íìïîı ÍÌÏÎİ",
134+
{
135+
"text": "íìïîı ÍÌÏÎİ",
136+
"html": "íìïîı ÍÌÏÎİ",
137+
"latex": "{\\'i}{\\`i}{\\\"i}{\\^i}{\\i} {\\'I}{\\`I}{\\\"I}{\\^I}{\\.I}",
116138
},
117139
),
118140
(

python/tests/utils/latex_test.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
from acl_anthology.utils import latex
1919

2020
test_cases_latex = (
21-
('"This is a quotation."', "``This is a quotation.''"),
22-
('This is a "quotation".', "This is a ``quotation''."),
23-
('Can you "please" "convert" this?', "Can you ``please'' ``convert'' this?"),
24-
('My name is "陳大文".', "My name is ``陳大文''."),
21+
("{''}This is a quotation.{''}", "``This is a quotation.''"),
22+
("This is a {''}quotation{''}.", "This is a ``quotation''."),
23+
("Can you 'please' {'}convert{'} this?", "Can you `please' `convert' this?"),
24+
("My name is ''陳大文''.", "My name is ``陳大文''."),
2525
)
2626

2727

@@ -36,7 +36,8 @@ def test_namespecs_to_bibtex():
3636
assert latex.namespecs_to_bibtex([]) == ""
3737
assert latex.namespecs_to_bibtex([ns1]) == "Chan, Tai Man"
3838
assert (
39-
latex.namespecs_to_bibtex([ns1, ns2]) == "Chan, Tai Man and\n Do\\'e, John"
39+
latex.namespecs_to_bibtex([ns1, ns2])
40+
== "Chan, Tai Man and\n Do{\\'e}, John"
4041
)
4142

4243

@@ -66,18 +67,18 @@ def test_make_bibtex_entry():
6667
("editor", []),
6768
("title", MarkupText.from_string("Thé Papér")),
6869
("booktitle", MarkupText.from_string('My "Conference"')),
69-
("address", '"Montréal"'),
70+
("address", "Montréal"),
7071
("doi", "10.000.a_b_c"),
7172
("publisher", ""),
7273
("month", "February"),
7374
("note", None),
7475
("pages", "1–7"),
7576
]
7677
expected = """@inproceedings{my-entry,
77-
author = "Do\\'e, John",
78-
title = "Th\\'e Pap\\'er",
78+
author = "Do{\\'e}, John",
79+
title = "Th{\\'e} Pap{\\'e}r",
7980
booktitle = "My ``Conference''",
80-
address = {"Montr\\'eal"},
81+
address = "Montr{\\'e}al",
8182
doi = "10.000.a_b_c",
8283
month = feb,
8384
pages = "1--7"

0 commit comments

Comments
 (0)