-
Notifications
You must be signed in to change notification settings - Fork 0
/
bibtex.py
90 lines (78 loc) · 3.17 KB
/
bibtex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""This module is used for parsing BibTeX entries.
The goal of this code is to parse BibTeX entries from a number of known sites
with known BibTeX formats; therefore many other BibTeX entries may be parsed
incorrectly or incompletely, as the TeX system is very complex and this module
is not intended to parse TeX.
Some of the known issues:
* Currently it does not detect special symbols and many TeX escape
sequences (more information: http://www.bibtex.org/SpecialSymbols/)
* String concatenations are not recognized (e.g. "str1" # "str2").
* Abbreviations are not supported (e.g. @string { foo = "Mrs. Foo" })
"""
import re
import commons
def parse(bibtex):
    """Parse a BibTeX entry string and return a dictionary of its fields.

    The text is first passed through ``replace_specials`` and the fields are
    collected with ``search_for_tag``.  The resulting dictionary is then
    post-processed:

    * ``type`` is set to the lower-cased entry type (the text between ``@``
      and the following ``{``), when one is found.
    * ``author`` is split on ``' and '`` and replaced by ``authors``, a list
      of ``commons.Name`` objects.
    * ``number`` is renamed to ``issue``.
    * Hyphens in ``pages`` are replaced by an en dash; when the value
      contains a dash, ``startpage`` and ``endpage`` are added.
    """
    bibtex = replace_specials(bibtex)
    d = search_for_tag(bibtex)

    # Entry type (book, journal, ...).  The pattern is a raw string because
    # "\s" and "\{" are regex escapes, not valid Python string escapes.
    m = re.search(r'@(.*?)\s*\{', bibtex, re.I)
    if m:
        d['type'] = m.group(1).strip().lower()

    if 'author' in d:
        d['authors'] = [
            commons.Name(author) for author in d.pop('author').split(' and ')
        ]

    if 'number' in d:
        d['issue'] = d.pop('number')

    if 'pages' in d:
        pages = d['pages'].replace('--', '–').replace('-', '–')
        d['pages'] = pages
        if '–' in pages:
            # Use the text before the first dash and after the last dash so
            # that values with several dashes (e.g. "1--5, 9--12") do not
            # raise ValueError, as a two-name unpacking of split() would.
            d['startpage'] = pages.partition('–')[0]
            d['endpage'] = pages.rpartition('–')[2]
    return d
def search_for_tag(bibtex):
    """Find all ``name = value`` fields in bibtex and return them as a dict.

    A value is either enclosed in ``{``/``"`` ... ``"``/``}`` (whitespace
    just inside the delimiters is dropped) or is a bare run of digits.
    Field names are lower-cased; if a name occurs more than once, the last
    occurrence wins.

    Known limitation: an enclosed value ends at the first ``"`` or ``}``,
    so values that themselves contain braces or quotes are truncated.
    """
    # Raw string: "\w", "\s" and "\d" are regex escapes, not Python string
    # escapes; in a plain literal they trigger an invalid-escape warning.
    fields = re.findall(
        r'(\w+)\s*=\s*(?:[{"]\s*(.*?)\s*["}]|(\d+))', bibtex)
    # The enclosed-value group is empty when the bare-number branch matched,
    # so fall back to the number group in that case.
    return {
        name.lower(): enclosed or number
        for name, enclosed, number in fields
    }
def replace_specials(bibtex):
    """Replace common TeX special-symbol commands with Unicode characters.

    The substitutions are plain substring replacements applied one after
    another in the order listed below; the converted string is returned.
    """
    substitutions = (
        # Symbols and escaped punctuation.
        (r'{\textregistered}', '®'),
        (r'\%', '%'),
        (r'\$', '$'),
        (r'\{', '{'),
        (r'\}', '}'),
        (r'\#', '#'),
        (r'\&', '&'),
        # Lower-case accented letters.
        (r'{\={a}}', 'ā'),
        (r'{\v{c}}', 'č'),
        (r'{\={e}}', 'ē'),
        (r'{\v{g}}', 'ģ'),
        (r'{\={\i}}', 'ī'),
        (r'{\c{k}}', 'ķ'),
        (r'{\c{l}}', 'ļ'),
        (r'{\c{n}}', 'ņ'),
        (r'{\v{s}}', 'š'),
        (r'{\={u}}', 'ū'),
        (r'{\v{z}}', 'ž'),
        # Upper-case accented letters.
        (r'{\={A}}', 'Ā'),
        (r'{\v{C}}', 'Č'),
        (r'{\={E}}', 'Ē'),
        (r'{\c{G}}', 'Ģ'),
        (r'{\={I}}', 'Ī'),
        (r'{\c{K}}', 'Ķ'),
        (r'{\c{L}}', 'Ļ'),
        (r'{\c{N}}', 'Ņ'),
        (r'{\v{S}}', 'Š'),
        (r'{\={U}}', 'Ū'),
        (r'{\v{Z}}', 'Ž'),
    )
    for tex_command, symbol in substitutions:
        bibtex = bibtex.replace(tex_command, symbol)
    return bibtex