-
Notifications
You must be signed in to change notification settings - Fork 3
/
bin_to_tsv.py
61 lines (49 loc) · 1.97 KB
/
bin_to_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
Copyright (c) 2019 Emil Lynegaard
Distributed under the MIT software license, see the
accompanying LICENSE.md or https://opensource.org/licenses/MIT
Converts the TensorFlow-specific binary input files from See et al. (2017)
to a TSV representation, so that they can be used with PyTorch.
The binary files can be downloaded from:
https://github.com/JafferWilson/Process-Data-of-CNN-DailyMail
Examples:
python -m tools.bin_to_tsv val.bin val.tsv
"""
import sys
import re
import struct
# pylint: disable=no-name-in-module
from tensorflow.core.example import example_pb2
def bin_generator(file):
    """
    Lazily yield (article, summary) string pairs from a binary data file.

    The file is read as a sequence of records: an 8-byte length (struct
    format "q") followed by that many bytes, which are parsed as a
    serialized tensorflow Example. For each record, the first value of the
    "article" and "abstract" features is decoded to text, and the "<s>" and
    "</s>" markers are removed from the summary.

    Files in this format are generated by the scripts from See et al. 2017:
    https://github.com/abisee/cnn-dailymail
    """
    with open(file, "rb") as stream:
        # read(8) returns b"" once the file is exhausted, ending the loop.
        for header in iter(lambda: stream.read(8), b""):
            (payload_size,) = struct.unpack("q", header)
            payload_format = "%ds" % payload_size
            # Unpacking with an exact-size format raises struct.error when
            # fewer bytes than announced could be read.
            (payload,) = struct.unpack(payload_format, stream.read(payload_size))
            features = example_pb2.Example.FromString(payload).features.feature
            article = features["article"].bytes_list.value[0].decode()
            raw_summary = features["abstract"].bytes_list.value[0].decode()
            summary = raw_summary.replace("<s>", "").replace("</s>", "")
            yield (article, summary)
def _squeeze_whitespace(text):
    """Collapse runs of spaces, tabs and line breaks to one space; strip the ends."""
    return re.sub(r"[ \t\r\n]+", " ", text).strip()


def main():
    """
    Convert binary data used by See et al. 2017, to tsv-file.

    Reads the input path from sys.argv[1] and the output path from
    sys.argv[2], then writes one "article<TAB>summary" line for every
    example whose article and summary are both non-empty after whitespace
    normalisation. Any further command line arguments are ignored.

    Raises:
        SystemExit: with a usage message, when either path is missing or
            the input path does not end in ".bin".
    """
    # Validate explicitly instead of with `assert`, which is skipped when
    # Python runs with -O; this also covers missing arguments.
    if len(sys.argv) < 3 or not sys.argv[1].endswith(".bin"):
        sys.exit("usage: python -m tools.bin_to_tsv <input>.bin <output>.tsv")
    file, out = sys.argv[1], sys.argv[2]

    # The text was decoded as UTF-8, so write it back as UTF-8 rather than
    # with the platform's locale-dependent default encoding.
    with open(out, "w", encoding="utf-8") as f:
        for article, summary in bin_generator(file):
            # A tab or line break inside a field would break the
            # one-row-per-example, two-column layout, so those are folded
            # into single spaces together with repeated spaces.
            article = _squeeze_whitespace(article)
            summary = _squeeze_whitespace(summary)
            if article and summary:
                print("%s\t%s" % (article, summary), file=f)
# Run the conversion only when this file is executed as a script
# (e.g. `python -m tools.bin_to_tsv val.bin val.tsv`), not when imported.
if __name__ == "__main__":
    main()