# dataset.py
#
# Downloads the OpenViVQA image archives and annotation files from the
# Hugging Face Hub ('uitnlp/OpenViVQA-dataset'), extracts the images, and
# converts the annotations into LLaVA-style conversation JSON files
# (train_dataset.json and eval_dataset.json).
from huggingface_hub import hf_hub_download
import zipfile
import json
from tqdm import tqdm
import os
def convert_json(qa_dict, img_dict):
    """Turn QA annotations into a list of LLaVA-style conversation records.

    Args:
        qa_dict: Mapping whose values are annotation dicts carrying the keys
            'image_id', 'question' and 'answer'. The mapping's own keys are
            not used.
        img_dict: Mapping keyed by the string form of an image id. The value
            is copied verbatim into the record's "image" field (presumably
            the image file name -- confirm against the dataset).

    Returns:
        A list with one dict per annotation, in iteration order of
        ``qa_dict``. Each dict has "id" (the stringified image id), "image",
        and "conversations": a two-turn list with the question attributed to
        "human" and the answer attributed to "gpt".

    Raises:
        KeyError: If an annotation lacks one of the expected keys or its
            image id is not present in ``img_dict``.
    """

    def _as_record(annotation):
        # The image id is stringified because img_dict is indexed by str keys.
        image_key = str(annotation['image_id'])
        return {
            "id": image_key,
            "image": img_dict[image_key],
            "conversations": [
                {"from": "human", "value": annotation['question']},
                {"from": "gpt", "value": annotation['answer']},
            ],
        }

    # tqdm wraps the iteration only to display progress.
    return [_as_record(entry) for _key, entry in tqdm(qa_dict.items())]
def load_image_zip(file_name, zip_file_path, extract_folder_path):
    """Download an image archive from the dataset repo and unpack it.

    Args:
        file_name: Name of the file to fetch from the
            'uitnlp/OpenViVQA-dataset' dataset repository
            (e.g. 'train-images.zip').
        zip_file_path: Path of the zip archive to open after the download.
            Presumably the location the download lands in (local_dir is
            '.') -- the caller is responsible for keeping them in sync.
        extract_folder_path: Directory into which the archive's contents
            are extracted.
    """
    download_options = {
        "repo_id": 'uitnlp/OpenViVQA-dataset',
        "repo_type": 'dataset',
        "filename": file_name,
        "local_dir": '.',
    }
    hf_hub_download(**download_options)

    archive = zipfile.ZipFile(zip_file_path, 'r')
    with archive:
        archive.extractall(extract_folder_path)
def load_json(file_name, file_path, file_name_json):
    """Download an annotation file and write it out in LLaVA format.

    Fetches ``file_name`` from the 'uitnlp/OpenViVQA-dataset' dataset
    repository into the current directory, reads the JSON document at
    ``file_path``, converts its 'annotations' (using its 'images' table)
    with ``convert_json``, and writes the resulting list to
    ``./<file_name_json>``.

    Args:
        file_name: Name of the annotation file in the dataset repository
            (e.g. 'vlsp2023_train_data.json').
        file_path: Path of the JSON file to read after the download.
            Presumably the location the download lands in (local_dir is
            '.') -- the caller is responsible for keeping them in sync.
        file_name_json: File name of the converted output, created in the
            current directory.

    Raises:
        KeyError: If the document lacks the 'images' or 'annotations' key.
    """
    hf_hub_download(
        repo_id='uitnlp/OpenViVQA-dataset',
        repo_type='dataset',
        filename=file_name,  # e.g. 'vlsp2023_train_data.json'
        local_dir='.'
    )

    # Read explicitly as UTF-8: the annotations contain non-ASCII text and
    # the platform's default encoding is not guaranteed to be UTF-8 (the
    # output below is already written as UTF-8).
    with open(file_path, 'r', encoding="utf-8") as file:
        data = json.load(file)

    img_dict = data['images']
    qa_dict = data['annotations']
    llava_data_list = convert_json(qa_dict, img_dict)

    json_output_path = os.path.join('./', file_name_json)
    with open(json_output_path, 'w', encoding="utf8") as json_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the
        # output instead of emitting \uXXXX escapes.
        json.dump(llava_data_list, json_file, indent=4, ensure_ascii=False)
# Image archives: train split first, then dev split.
for file_name, zip_file_path, extract_folder_path in (
    ('train-images.zip', './train-images.zip', './train-images'),
    ('dev-images.zip', './dev-images.zip', './dev-images'),
):
    load_image_zip(file_name, zip_file_path, extract_folder_path)

# Annotation files: train split -> train_dataset.json,
# dev split -> eval_dataset.json.
for file_name, file_path, file_name_json in (
    ('vlsp2023_train_data.json', './vlsp2023_train_data.json', 'train_dataset.json'),
    ('vlsp2023_dev_data.json', './vlsp2023_dev_data.json', 'eval_dataset.json'),
):
    load_json(file_name, file_path, file_name_json)