-
Notifications
You must be signed in to change notification settings - Fork 6
/
splitnames.py
294 lines (251 loc) · 10.6 KB
/
splitnames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import bibtexparser as bp
def split_latex_to_sections(
    latex_string: str, strict_mode=True
) -> tuple[list[list[str]], list[list[int]]]:
    """
    Split a LaTeX/BibTeX name string into comma-separated sections of words.

    Outside of braces, words are separated by whitespace (space, ``~``, CR,
    LF, tab) or by commas; a comma additionally opens a new section, up to a
    maximum of three sections. Braced groups stay inside the word that
    contains them and backslash escapes are copied verbatim.

    The case of a word is taken from the first alphabetic character that can
    be used for it: a regular character outside braces, the character
    following a backslash (unless that backslash immediately follows an
    opening brace), or a letter inside a brace group that opened with a
    backslash once the control-sequence name has ended.

    :param latex_string: the name to split
    :param strict_mode: if true, malformed input raises; otherwise it is
        repaired (see below)
    :returns: ``(sections, cases)``. ``sections`` is a list of word lists,
        one per section. ``cases`` has the same shape and holds the case of
        each word: 1 = uppercase, 0 = lowercase, -1 = caseless.
    :raises bp.customization.InvalidName: in strict mode, on an unmatched
        closing brace, more than two commas, or an unterminated opening brace.

    Non-strict repairs: an unmatched ``}`` gets a ``{`` inserted at the start
    of the current word; commas beyond the second only end the current word;
    missing closing braces are appended to the final word.
    """
    whitespace = set(" ~\r\n\t")

    # We'll iterate over the input once, dividing it into a list of words for
    # each comma-separated section. We'll also calculate the case of each word
    # as we work.
    sections = [[]]  # Sections of the name.
    cases = [[]]  # 1 = uppercase, 0 = lowercase, -1 = caseless.
    word = []  # Current word.
    case = -1  # Case of the current word.
    level = 0  # Current brace level.
    bracestart = False  # Will the next character be the first within a brace?
    controlseq = True  # Are we currently processing a control sequence?
    specialchar = None  # Are we currently processing a special character?

    # Using an iterator allows us to deal with escapes in a simple manner.
    nameiter = iter(latex_string)
    for char in nameiter:
        # An escape.
        if char == "\\":
            escaped = next(nameiter, None)

            # A backslash at the very end of the input has nothing to escape.
            # Keep it as a literal character instead of letting StopIteration
            # leak out of the function.
            if escaped is None:
                word.append(char)
                break

            # BibTeX doesn't allow whitespace escaping. Copy the slash and fall
            # through to the normal case to handle the whitespace.
            if escaped in whitespace:
                word.append(char)
                char = escaped
            else:
                # Is this the first character in a brace?
                if bracestart:
                    bracestart = False
                    controlseq = escaped.isalpha()
                    specialchar = True
                # Can we use it to determine the case?
                elif (case == -1) and escaped.isalpha():
                    case = 1 if escaped.isupper() else 0

                # Copy the escape to the current word and go to the next
                # character in the input.
                word.append(char)
                word.append(escaped)
                continue

        # Start of a braced expression.
        if char == "{":
            level += 1
            word.append(char)
            bracestart = True
            controlseq = False
            specialchar = False
            continue

        # All the below cases imply this (and don't test its previous value).
        bracestart = False

        # End of a braced expression.
        if char == "}":
            # Check and reduce the level.
            if level:
                level -= 1
            else:
                if strict_mode:
                    raise bp.customization.InvalidName(
                        "Unmatched closing brace in name {{{0}}}.".format(
                            latex_string
                        )
                    )
                # Non-strict repair: balance the brace at the word start.
                word.insert(0, "{")

            # Update the state, append the character, and move on.
            controlseq = False
            specialchar = False
            word.append(char)
            continue

        # Inside a braced expression.
        if level:
            # Is this the end of a control sequence?
            if controlseq:
                if not char.isalpha():
                    controlseq = False
            # If it's a special character, can we use it for a case?
            elif specialchar:
                if (case == -1) and char.isalpha():
                    case = 1 if char.isupper() else 0

            # Append the character and move on.
            word.append(char)
            continue

        # End of a word.
        # NB. we know we're not in a brace here due to the previous case.
        if char == "," or char in whitespace:
            # Don't add empty words due to repeated whitespace.
            if word:
                sections[-1].append("".join(word))
                word = []
                cases[-1].append(case)
                case = -1
                controlseq = False
                specialchar = False

            # End of a section.
            if char == ",":
                if len(sections) < 3:
                    sections.append([])
                    cases.append([])
                elif strict_mode:
                    raise bp.customization.InvalidName(
                        "Too many commas in the name {{{0}}}.".format(
                            latex_string
                        )
                    )
            continue

        # Regular character.
        word.append(char)
        if (case == -1) and char.isalpha():
            case = 1 if char.isupper() else 0

    # Unterminated brace?
    if level:
        if strict_mode:
            raise bp.customization.InvalidName(
                "Unterminated opening brace in the name {{{0}}}.".format(
                    latex_string
                )
            )
        # Non-strict repair: close every brace that is still open.
        while level:
            word.append("}")
            level -= 1

    # Handle the final word.
    if word:
        sections[-1].append("".join(word))
        cases[-1].append(case)

    return sections, cases
def splitname(name, strict_mode=True):
    """
    Break a name into its constituent parts: First, von, Last, and Jr.

    :param string name: a string containing a single name
    :param Boolean strict_mode: whether to use strict mode
    :returns: dictionary of constituent parts
    :raises `customization.InvalidName`: If an invalid name is given and
        ``strict_mode = True``.

    In BibTeX, a name can be represented in any of three forms:

    * First von Last
    * von Last, First
    * von Last, Jr, First

    This function attempts to split a given name into its four parts. The
    returned dictionary has keys of ``first``, ``last``, ``von`` and ``jr``.
    Each value is a list of the words making up that part; this may be an empty
    list. If the input has no non-whitespace characters, a blank dictionary is
    returned.

    For the comma-less form with three or more words, the split is heuristic:

    * if the second character of the second word is ``.``, the first two
      words become First and the remainder Last;
    * otherwise, if the word cases sum to more than two, Last starts at the
      second-to-last capitalized position, First runs through the
      third-to-last one, and anything in between is von;
    * otherwise the first word is First and the remainder is Last.

    If ``strict_mode`` is ``True``, which is the default, a trailing comma
    raises :class:`customization.InvalidName`; errors detected while
    tokenizing the name are raised by ``split_latex_to_sections``. If it is
    ``False``, the trailing empty section is dropped and processing continues.
    """
    # Modified from the bibtexparser.customization.splitname function to merge
    # into Last instead of First. The ``von`` part is ignored unless commas are
    # used as separators. Note that the ``von`` part collides with
    # uncapitalized parts of the last name.
    # Useful references:
    # http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html#names
    # http://tug.ctan.org/info/bibtex/tamethebeast/ttb_en.pdf
    sections, cases = split_latex_to_sections(name, strict_mode)

    # Get rid of trailing sections.
    if not sections[-1]:
        # Trailing comma?
        if (len(sections) > 1) and strict_mode:
            raise bp.customization.InvalidName(
                "Trailing comma at end of name {{{0}}}.".format(name)
            )
        sections.pop(-1)
        cases.pop(-1)

    # No non-whitespace input.
    if not sections or not any(bool(section) for section in sections):
        return {}

    # Initialise the output dictionary.
    parts = {"first": [], "last": [], "von": [], "jr": []}

    # Form 1: "First von Last"
    if len(sections) == 1:
        p0 = sections[0]
        word_cases = cases[0]

        # One word only: last cannot be empty.
        if len(p0) == 1:
            parts["last"] = p0

        # Two words: must be first and last.
        elif len(p0) == 2:
            parts["first"] = p0[:1]
            parts["last"] = p0[1:]

        # Second word looks like an initial ("J."): keep it with the first
        # name. The slice avoids an IndexError when the second word is a
        # single character (e.g. "John X Smith").
        elif len(p0) > 2 and p0[1][1:2] == ".":
            parts["first"] = p0[:2]
            parts["last"] = p0[2:]

        # Need to use the cases to figure it out.
        else:
            # NOTE(review): caseless words (-1) lower this sum, yet the
            # truthiness test below counts them as capitalized positions.
            # Behaviour is kept as-is; confirm whether that is intended.
            num_capitals = sum(word_cases)
            if num_capitals > 2:
                capital_position = [i for i, e in enumerate(word_cases) if e]
                third_to_last_capitalized = capital_position[-3]
                second_to_last_capitalized = capital_position[-2]
                parts["first"] = p0[: third_to_last_capitalized + 1]
                parts["von"] = p0[
                    third_to_last_capitalized + 1 : second_to_last_capitalized
                ]
                parts["last"] = p0[second_to_last_capitalized:]
            else:
                parts["first"] = p0[:1]
                parts["last"] = p0[1:]

    # Form 2 ("von Last, First") or 3 ("von Last, jr, First")
    else:
        # As long as there is content in the first name partition, use it as-is.
        first = sections[-1]
        if first and first[0]:
            parts["first"] = first

        # And again with the jr part.
        if len(sections) == 3:
            jr = sections[-2]
            if jr and jr[0]:
                parts["jr"] = jr

        # Last name cannot be empty; if there is only one word in the first
        # partition, we have to use it for the last name.
        last = sections[0]
        if len(last) == 1:
            parts["last"] = last

        # Have to look at the cases to figure it out.
        else:
            lcases = cases[0]

            # At least one lowercase: von is the longest sequence of whitespace
            # separated words whose last word does not start with an uppercase
            # word, and last is the rest.
            if 0 in lcases:
                split = len(lcases) - lcases[::-1].index(0)
                if split == len(lcases):
                    split = 0  # Last cannot be empty.
                parts["von"] = sections[0][:split]
                parts["last"] = sections[0][split:]

            # All uppercase => all last.
            else:
                parts["last"] = sections[0]

    # Done.
    return parts