-
Notifications
You must be signed in to change notification settings - Fork 2k
/
data_process.py
109 lines (87 loc) · 4.06 KB
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import os
import shutil
import subprocess
import tempfile
from zipfile import ZipFile

import gdown
# Download location of each benchmark's raw test data, keyed by dataset name.
urls = {
    # Google Drive file (fetched with gdown in main()).
    "funcqa": "https://drive.google.com/uc?id=13Sj7uIsyqWXoTh1ejWUviTzeQSES2Omd",
    # Plain JSON file hosted on GitHub.
    "restbench": "https://raw.githubusercontent.com/Yifan-Song793/RestGPT/main/datasets/tmdb.json",
    # Google Drive file (fetched with gdown in main()).
    "toolbench": "https://drive.google.com/uc?id=1XFjDxVZdUY7TXYF2yvzx3pJlS2fy78jk",
}
def read_jsonline(address):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the UTF-8 file at *address* is parsed as one JSON
    document. Blank or whitespace-only lines (for example a trailing empty
    line at the end of the file) are skipped instead of raising.

    Args:
        address: Path of the JSON Lines file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(address, 'r', encoding="utf-8") as f:
        # Iterate the file object directly so lines are streamed rather than
        # all loaded up front by readlines().
        return [json.loads(line) for line in f if line.strip()]
def read_json(address):
    """Parse the UTF-8 JSON file at *address* and return the decoded value."""
    with open(address, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def toolbench_process(data_file, dataset):
    """Attach the matching tool entries to every query in a ToolBench file.

    Loads the list of queries stored in *data_file* and the tool catalogue
    stored in ``<dataset>/tool_instruction/toolbench_tool_instruction.json``.
    For each query, every ``tool_name`` in its ``api_list`` that appears in
    the catalogue contributes one ``{"ID", "Description"}`` record (at most
    one record per catalogue ID, in first-seen order). The records are stored
    under the query's ``"Tool_dic"`` key and *data_file* is rewritten in
    place.

    Args:
        data_file: Path of a JSON file containing a list of query objects,
            each with an ``"api_list"`` whose items have a ``"tool_name"``.
        dataset: Directory that contains
            ``tool_instruction/toolbench_tool_instruction.json``.

    Raises:
        KeyError: If a query lacks ``"api_list"``, an API lacks
            ``"tool_name"``, or a catalogue entry lacks ``"tool_name"``,
            ``"ID"`` or ``"tool_description"``.
    """
    queries = read_json(data_file)
    catalogue = read_json(
        os.path.join(dataset, "tool_instruction", "toolbench_tool_instruction.json")
    )

    # Index the catalogue by tool name; when two entries share a name the
    # later one wins, exactly as with sequential assignment.
    tools_by_name = {entry["tool_name"]: entry for entry in catalogue.values()}

    for query in queries:
        matched_tools = []
        # IDs come from JSON scalars, so a set gives O(1) de-duplication.
        seen_ids = set()
        for api in query['api_list']:
            entry = tools_by_name.get(api['tool_name'])
            if entry is None:
                # Tool not present in the catalogue: nothing to attach.
                continue
            tool_id = entry["ID"]
            if tool_id in seen_ids:
                continue
            seen_ids.add(tool_id)
            matched_tools.append({
                "ID": tool_id,
                "Description": entry["tool_description"],
            })
        query["Tool_dic"] = matched_tools

    with open(data_file, 'w', encoding='utf-8') as json_file:
        json.dump(queries, json_file, ensure_ascii=False, indent=4)
def _download_zip_members(url, archive_path, members, dest_dir):
    """Download a zip archive and place selected members directly in *dest_dir*.

    Args:
        url: Address passed to ``gdown.download``.
        archive_path: Where the downloaded archive is stored temporarily; the
            file is removed once the members have been extracted.
        members: Archive member names to extract. Each one ends up in
            *dest_dir* under its base name, replacing any existing file.
        dest_dir: Existing directory that receives the extracted files.
    """
    gdown.download(url, archive_path, quiet=False)
    # Extract into a scratch directory inside dest_dir: nothing in the
    # caller's working directory is created or deleted, and os.replace stays
    # on a single filesystem. The archive is closed before it is removed.
    with tempfile.TemporaryDirectory(dir=dest_dir) as scratch, \
            ZipFile(archive_path, 'r') as archive:
        for member in members:
            extracted = archive.extract(member, scratch)
            os.replace(extracted, os.path.join(dest_dir, os.path.basename(member)))
    os.remove(archive_path)


def main():
    """Download the FuncQA, RestBench and ToolBench test data.

    For each dataset the files are stored in ``data_<dataset>/test_data``
    next to this script (the directory is created when missing):

    * funcqa: ``funcqa_oh.json`` and ``funcqa_mh.json`` from a zip archive.
    * restbench: the file behind ``urls["restbench"]``, fetched with ``wget``.
    * toolbench: ``G2_category.json`` and ``G3_instruction.json`` from a zip
      archive, each then post-processed by ``toolbench_process``.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with a non-zero
            status.
        FileNotFoundError: If ``wget`` is not installed.
    """
    # Anchor every path to the script location so the result does not depend
    # on the directory the script is launched from.
    curr_dir = os.path.dirname(os.path.abspath(__file__))

    for dataset in ["funcqa", "restbench", "toolbench"]:
        dataset_root = os.path.join(curr_dir, "data_{}".format(dataset))
        dataset_path = os.path.join(dataset_root, "test_data")
        os.makedirs(dataset_path, exist_ok=True)

        if dataset == "funcqa":
            print("Processing FuncQA dataset ...\n")
            _download_zip_members(
                urls[dataset],
                os.path.join(dataset_path, "data_toolkengpt_0918.zip"),
                ["data/funcqa/funcqa_oh.json", "data/funcqa/funcqa_mh.json"],
                dataset_path,
            )
            print("FuncQA dataset Done!\n")

        elif dataset == "restbench":
            print("Processing RestBench dataset ... \n")
            # Argument list instead of a shell string: paths with spaces are
            # safe, and check=True surfaces a failed download instead of
            # printing "Done!". -P sets the target directory, -c resumes a
            # partial download.
            subprocess.run(
                ["wget", "-P", dataset_path, "-c", urls[dataset]],
                check=True,
            )
            print("RestBench dataset Done!\n")

        elif dataset == "toolbench":
            print("Processing ToolBench dataset ... \n")
            _download_zip_members(
                urls[dataset],
                os.path.join(dataset_path, "data.zip"),
                [
                    "data/test_instruction/G2_category.json",
                    "data/test_instruction/G3_instruction.json",
                ],
                dataset_path,
            )
            # toolbench_process reads
            # <dataset_root>/tool_instruction/toolbench_tool_instruction.json;
            # that file is not downloaded here, so it must already exist.
            for file_name in ("G2_category.json", "G3_instruction.json"):
                toolbench_process(os.path.join(dataset_path, file_name), dataset_root)
            print("Toolbench dataset Done!\n")
# Run the download pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()