forked from humenda/isolang-rs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild.rs
188 lines (160 loc) · 6.83 KB
/
build.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
extern crate phf_codegen;
use std::env;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
use std::collections::HashMap;
// Path of the ISO 639-3 code table (tab-separated), relative to the directory
// the build script is run from.
// Taken from http://www-01.sil.org/iso639-3/download.asp
static ISO_TABLE_PATH: &'static str = "iso-639-3.tab";
// Path of the tab-separated table with the local names (autonyms) of the
// languages, relative to the directory the build script is run from.
// Taken from https://github.com/bbqsrc/iso639-autonyms
static AUTONYMS_TABLE_PATH: &'static str = "iso639-autonyms.tsv";
/// Names of a single language as collected from the input tables.
///
/// `Debug`, `Clone` and equality are derived so that rows can be printed in
/// diagnostics and compared while working on the build script.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Language {
    /// English name, read from the ISO 639-3 table.
    english: String,
    /// Name of the language in the language itself (autonym), if the autonyms
    /// table provides one.
    local: Option<String>,
}
/// All languages read from the ISO 639-3 table, one tuple per table row:
/// `(639-3 code, optional 639-1 code, names, optional comment)`.
///
/// The comment is the part that `read_iso_table` splits off the English name
/// at `" ("`, if there is one.
type LangCodes = Vec<(String, Option<String>, Language, Option<String>)>;
/// Convert the first character of `s` to upper case and leave the rest as is.
///
/// An empty input yields an empty string instead of panicking. If the
/// upper-case form of the first character consists of several characters
/// (e.g. `ß` becomes `SS`), the complete expansion is kept.
fn title(s: &str) -> String {
    let mut chars = s.chars();
    match chars.next() {
        // `to_uppercase` is itself an iterator over the upper-case expansion,
        // so it can simply be chained with the untouched remainder.
        Some(first) => first.to_uppercase().chain(chars).collect(),
        None => String::new(),
    }
}
/// Parse the autonyms table into a map from ISO 639-3 code to autonym.
///
/// The first line of the file is skipped. For every following line, column 0
/// (tab-separated) becomes the key and column 3 the autonym; an empty or
/// missing column 3 is stored as `None`.
///
/// # Panics
///
/// Panics if the file at `AUTONYMS_TABLE_PATH` cannot be opened or a line
/// cannot be read from it.
fn read_autonyms_table() -> HashMap<String, Option<String>> {
    // NOTE: this must be a normal (non-raw) string literal; in a raw string
    // the trailing backslashes would end up verbatim in the message.
    let file = File::open(AUTONYMS_TABLE_PATH).expect(
        "Couldn't read autonyms table tsv. Make sure that this operation is run from \
         the crate source root and that this file actually exists.",
    );
    BufReader::new(file)
        .lines()
        .skip(1)
        .map(|line| {
            let line = line.expect(
                "Couldn't read from autonyms table, please \
                 check that the file exists and is readable",
            );
            let cols: Vec<&str> = line.split('\t').collect();
            // A row that is too short to have an autonym column is treated
            // like a row with an empty one instead of aborting the build.
            let autonym = match cols.get(3) {
                Some(&name) if !name.is_empty() => Some(name.to_string()),
                _ => None,
            };
            // `split` always yields at least one item, so column 0 exists.
            (cols[0].to_string(), autonym)
        })
        .collect()
}
/// parse ISO 6639-(3,1) table
fn read_iso_table() -> LangCodes {
let autonyms_table = read_autonyms_table();
let r = BufReader::new(File::open(ISO_TABLE_PATH).expect(r"\
Couldn't read iso-639-3.tab. Make sure that this operation is run from \
the crate source root and that this file actually exists.",
));
r.lines()
.skip(1)
.map(|line| {
let line = line.expect("Couldn't read from ISO 639 table, please \
check that the file exists and is readable");
let cols = line.split("\t").collect::<Vec<&str>>();
let three_letter: String = cols[0].into();
let two_letter: Option<String> = match cols[3].len() {
2 => Some(cols[3].into()),
_ => None,
};
let autonym = &autonyms_table[&three_letter];
// split language string into name and comment, if required
match cols[6].contains("(") {
false => (three_letter, two_letter, Language {
english: cols[6].into(),
local: autonym.to_owned(),
}, None),
true => match cols[6].split(" (").collect::<Vec<&str>>() {
ref m if m.len() != 2 => (three_letter, two_letter, Language {
english: cols[6].into(),
local: autonym.to_owned(),
}, None),
m => (three_letter, two_letter, Language {
english: m[0].into(),
local: autonym.to_owned(),
}, Some(m[1].into())),
},
}
})
.collect()
}
/// write static array with (639-3, 639-1, english name, comment) entries
fn write_overview_table(file: &mut BufWriter<File>, codes: &LangCodes) {
if cfg!(feature = "local_names") {
writeln!(file, "static OVERVIEW: [([u8; 3], Option<&'static [u8; 2]>, \
&'static [u8], Option<&'static [u8]>, Option<&'static [u8]>); {}] = [", codes.len())
.unwrap();
} else {
writeln!(file, "static OVERVIEW: [([u8; 3], Option<&'static [u8; 2]>, \
&'static [u8], Option<&'static [u8]>); {}] = [", codes.len())
.unwrap();
}
for ref language in codes.iter() {
write!(file, " ({:?}, ", language.0.as_bytes()).unwrap();
match language.1 {
Some(ref val) => write!(file, "Some(&{:?}), ", val.as_bytes()).unwrap(),
None => write!(file, "None, ").unwrap(),
}
write!(file, "&{:?}, ", language.2.english.as_bytes()).unwrap();
if cfg!(feature = "local_names") {
match language.2.local {
Some(ref val) => write!(file, "Some(&{:?}), ", val.as_bytes()).unwrap(),
None => write!(file, "None, ").unwrap(),
}
}
match language.3 {
Some(ref comment) => writeln!(file, "Some(&{:?})),", comment.as_bytes()).unwrap(),
None => writeln!(file, "None),").unwrap(),
};
}
write!(file, "];\n").unwrap();
}
/// Emit the `TWO_TO_THREE` map from 639-1 codes to `Language` variants.
///
/// Only languages that have a 639-1 code get an entry; the variant name is
/// the 639-3 code with its first character upper-cased.
///
/// Panics if writing to `file` fails.
fn write_two_letter_to_enum(file: &mut BufWriter<File>, codes: &LangCodes) {
    write!(
        file,
        "static TWO_TO_THREE: phf::Map<&'static str, Language> = "
    ).unwrap();

    let mut lookup = phf_codegen::Map::new();
    for entry in codes {
        if let Some(ref short_code) = entry.1 {
            let variant = format!("Language::{}", title(&entry.0));
            lookup.entry(short_code.clone(), &variant);
        }
    }
    lookup.build(file).unwrap();

    writeln!(file, ";").unwrap();
}
/// Write a mapping of codes from 639-3 -> Language::`639-3`
fn write_three_letter_to_enum(file: &mut BufWriter<File>, codes: &LangCodes) {
write!(file, "static THREE_TO_THREE: phf::Map<&'static str, Language> = ")
.unwrap();
let mut map = phf_codegen::Map::new();
for &(ref id, _, _, _) in codes.iter() {
map.entry(id.clone(), &format!("Language::{}", title(id)));
}
map.build(file).unwrap();
writeln!(file, ";").unwrap();
}
/// Build-script entry point: generate `isotable.rs` in `OUT_DIR`.
///
/// The generated file contains, in this order, the `OVERVIEW` table, the
/// `Language` enum (one variant per 639-3 code, whose discriminant is the
/// index into `OVERVIEW`) and the two lookup maps `TWO_TO_THREE` and
/// `THREE_TO_THREE`.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set, if the input tables cannot be read, or if
/// the output file cannot be created, written or flushed.
fn main() {
    let out_dir = env::var("OUT_DIR")
        .expect("OUT_DIR is not set; this build script has to be run by cargo");
    let path = Path::new(&out_dir).join("isotable.rs");
    let codes = read_iso_table();

    // NOTE: this must be a normal (non-raw) string literal; in a raw string
    // the trailing backslash would end up verbatim in the message.
    let mut file = BufWriter::new(File::create(&path).expect(
        "Couldn't write to output directory, compilation impossible",
    ));

    // write overview table with all data
    write_overview_table(&mut file, &codes);

    // write enum with 639-3 codes (num is the index into the overview table)
    writeln!(&mut file, "#[derive(Clone, Copy, Hash, Eq, PartialEq)]").unwrap();
    writeln!(&mut file, "pub enum Language {{").unwrap();
    for (num, &(ref id, _, _, _)) in codes.iter().enumerate() {
        writeln!(&mut file, " #[doc(hidden)]").unwrap();
        writeln!(&mut file, " {} = {},", title(id), num).unwrap();
    }
    writeln!(&mut file, "}}\n\n").unwrap();

    // write map 639-1 -> enum mapping
    write_two_letter_to_enum(&mut file, &codes);

    // write map 639-3 -> enum mapping
    write_three_letter_to_enum(&mut file, &codes);

    // Dropping a `BufWriter` ignores errors from the final flush, which could
    // leave a truncated file behind unnoticed; flush explicitly instead.
    file.flush()
        .expect("Couldn't flush generated code to the output file");
}