data_utils.py
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np
import os
import json
import pickle
from tqdm import tqdm
def dowload_MS_COCO_raw_data(download_folder=None):
    """
    Download the MS-COCO data from http://cocodataset.org/#home
    :param download_folder: existing local copy of the dataset; if None, download it
    :return: annotation_file (json file path) ...captions_train2014.json
             img_file_dir (raw image file folder) ... train2014
    """
    if download_folder is not None and os.path.exists(download_folder):
        annotation_file = download_folder + '/annotations/captions_train2014.json'
        img_file_dir = download_folder + '/train2014'
        return annotation_file, img_file_dir
    annotation_zip = tf.keras.utils.get_file('captions.zip',
                                             cache_subdir=os.path.abspath('.'),
                                             origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
                                             extract=True)
    annotation_file = os.path.dirname(annotation_zip) + '/annotations/captions_train2014.json'
    name_of_zip = 'train2014.zip'
    if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
        image_zip = tf.keras.utils.get_file(name_of_zip,
                                            cache_subdir=os.path.abspath('.'),
                                            origin='http://images.cocodataset.org/zips/train2014.zip',
                                            extract=True)
        img_file_dir = os.path.dirname(image_zip) + '/train2014/'
    else:
        img_file_dir = os.path.abspath('.') + '/train2014/'
    return annotation_file, img_file_dir
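
# A minimal usage sketch (the local folder path below is hypothetical; pass None
# to download instead, noting that the train2014 archive is several GB):
#     annotation_file, img_file_dir = dowload_MS_COCO_raw_data(download_folder="/data/MS-COCO")
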
def read_raw_image_and_caption_file(annotation_file, img_file_dir, num_examples=None):
    # Read the json file
    with open(annotation_file, 'r') as f:
        annotations = json.load(f)
    # Store captions and image names in vectors
    all_captions = []
    all_img_name_vector = []
    for annot in annotations['annotations']:
        caption = '<start> ' + annot['caption'] + ' <end>'
        image_id = annot['image_id']
        full_coco_image_path = os.path.join(img_file_dir, 'COCO_train2014_' + '%012d.jpg' % image_id)
        all_img_name_vector.append(full_coco_image_path)
        all_captions.append(caption)
    # Shuffle captions and image names together, with a fixed random state
    train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=1)
    # Select the first num_examples captions from the shuffled set; None keeps all data
    train_captions = train_captions[:num_examples]
    img_name_vector = img_name_vector[:num_examples]
    print(f"train_captions count: {len(train_captions)}\timg_name_vector count: {len(img_name_vector)}")
    return train_captions, img_name_vector
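
# Usage sketch (assumes the files returned by dowload_MS_COCO_raw_data above):
#     train_captions, img_name_vector = read_raw_image_and_caption_file(
#         annotation_file, img_file_dir, num_examples=30000)
#     # train_captions[i] looks like '<start> ... <end>'; img_name_vector[i] is the
#     # matching image path (one entry per annotation, so paths can repeat)
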
def initialize_InceptionV3_image_features_extract_model():
    """
    :return: InceptionV3 model instance
    """
    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output  # shape [batch_size, 8, 8, 2048]
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)
    return image_features_extract_model
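
# Usage sketch: with include_top=False the extractor maps a batch of preprocessed
# 299x299 images to an [batch_size, 8, 8, 2048] feature map:
#     model = initialize_InceptionV3_image_features_extract_model()
#     features = model(tf.zeros([1, 299, 299, 3]))  # -> shape (1, 8, 8, 2048)
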
def load_image(image_path):
    # Read, decode and resize the image to InceptionV3's expected 299x299 input,
    # then scale pixel values to the [-1, 1] range the network was trained on
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path
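
# Usage sketch: load_image is meant to be mapped over a tf.data pipeline of paths:
#     dataset = tf.data.Dataset.from_tensor_slices(img_name_vector).map(load_image)
#     # each element is a (preprocessed_image, original_path) pair
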
def caching_image_features_extracted_from_InceptionV3(img_name_vector, cache_image_dir="cache_image"):
    """
    :param img_name_vector: image file path list
    :param cache_image_dir: folder in which to store the image features extracted from InceptionV3
    :return:
    """
    if not os.path.exists(cache_image_dir):
        os.mkdir(cache_image_dir)
    # Get unique images
    encode_train = sorted(set(img_name_vector))
    # Feel free to change batch_size according to your system configuration
    image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
    image_dataset = image_dataset.map(
        load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)
    image_features_extract_model = initialize_InceptionV3_image_features_extract_model()
    for img, path in tqdm(image_dataset):
        # shape [batch_size, 8, 8, 2048]
        batch_features = image_features_extract_model(img)
        # shape [batch_size, 64, 2048]
        batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))
        for features, img_path in zip(batch_features, path):
            path_of_feature = img_path.numpy().decode("utf-8")
            path_of_feature_file_name = os.path.basename(path_of_feature)
            save_file_path_of_feature = os.path.join(cache_image_dir, path_of_feature_file_name)
            # np.save appends '.npy', so the cache file is e.g. COCO_train2014_...jpg.npy
            np.save(save_file_path_of_feature, features.numpy())
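
# Sketch of reading the cache back during training (the helper map_func is an
# assumption for illustration, not part of this module):
#     def map_func(img_name, cap):
#         feature_path = os.path.join("cache_image",
#                                     os.path.basename(img_name.decode('utf-8')) + '.npy')
#         return np.load(feature_path), cap  # -> [64, 2048] features plus the caption ids
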
def preprocess_and_tokenize_captions(train_captions, top_k):
    """
    :param train_captions: captions list
    :param top_k: limit the vocabulary size to the top k words (to save memory)
    :return: caption_vector (tokenized and padded)
    """
    # Find the maximum length of any caption in our dataset
    def calc_max_length(tensor):
        return max(len(t) for t in tensor)
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, oov_token="<unk>",
                                                      filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    with open("caption_tokenizer", "wb") as f:
        pickle.dump(tokenizer, f)
    # Create the tokenized vectors
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    # Pad each vector to the max_length of the captions
    # If you do not provide a max_length value, pad_sequences calculates it automatically
    caption_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')
    # Calculate the max_length, which is used to store the attention weights
    max_length = calc_max_length(train_seqs)
    print(f"max_length:\t{max_length}")
    return caption_vector
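
# Usage sketch: each caption becomes a fixed-length row of word ids, e.g.
#     caption_vector = preprocess_and_tokenize_captions(train_captions, top_k=5000)
#     # caption_vector.shape -> (num_captions, max_length); 0 is the <pad> id
# The fitted tokenizer is also pickled to ./caption_tokenizer for later decoding.
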
def split_and_save_raw_image_path_and_caption_content(img_name_vector, caption_vector, test_size,
                                                      raw_image_path_and_caption_content):
    save_dir_name = raw_image_path_and_caption_content
    if not os.path.exists(save_dir_name):
        os.mkdir(save_dir_name)
    # Create training and validation sets (e.g. an 80-20 split for test_size=0.2)
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, caption_vector,
                                                                        test_size=test_size, random_state=0)
    print(f"img_name_train count:\t{len(img_name_train)}\tcap_train count:\t{len(cap_train)}\n"
          f"img_name_val count:\t{len(img_name_val)}\tcap_val count:\t{len(cap_val)}")
    np.save(save_dir_name + '/img_name_train.npy', img_name_train)
    np.save(save_dir_name + '/img_name_val.npy', img_name_val)
    np.save(save_dir_name + '/cap_train.npy', cap_train)
    np.save(save_dir_name + '/cap_val.npy', cap_val)
    print(f"Prepared files saved to {save_dir_name}")
def get_caption_tokenizer(caption_tokenizer_path="caption_tokenizer"):
    with open(caption_tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)
    return tokenizer
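
# Usage sketch: the restored tokenizer maps ids back to words; caption_ids below
# is a hypothetical row of caption_vector:
#     tokenizer = get_caption_tokenizer()
#     words = [tokenizer.index_word[idx] for idx in caption_ids if idx != 0]
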
def load_raw_image_path_and_caption_content(save_dir_name="raw_image_path_and_caption_content"):
    if not os.path.exists(save_dir_name):
        raise ValueError(f"{save_dir_name} not found, please run prepare_MS-COCO_data.py first")
    img_name_train = np.load(save_dir_name + '/img_name_train.npy')
    img_name_val = np.load(save_dir_name + '/img_name_val.npy')
    cap_train = np.load(save_dir_name + '/cap_train.npy')
    cap_val = np.load(save_dir_name + '/cap_val.npy')
    return img_name_train, img_name_val, cap_train, cap_val
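
# Usage sketch (requires a prior run of main() so the .npy files exist):
#     img_name_train, img_name_val, cap_train, cap_val = load_raw_image_path_and_caption_content()
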
def main(download_folder=None, num_examples=None, top_k=5000, test_size=0.2,
         raw_image_path_and_caption_content="raw_image_path_and_caption_content", cache_image_dir="cache_image"):
    """
    :param download_folder: str, None -> automatic download of the files
    :param num_examples: int, select the first num_examples captions from the shuffled set, None for all data
    :param top_k: int, keep only the top_k most frequent words of the vocabulary
    :param test_size: float, fraction of the data held out for validation (e.g. 0.2 for an 80-20 split)
    :param raw_image_path_and_caption_content: str, directory for the processed path/caption files
    :param cache_image_dir: str, directory for the cached image features
    :return:
    """
    # Download files
    annotation_file, img_file_dir = dowload_MS_COCO_raw_data(download_folder=download_folder)
    # Read the raw image paths and captions
    train_captions, img_name_vector = read_raw_image_and_caption_file(annotation_file, img_file_dir,
                                                                      num_examples=num_examples)
    # Preprocess the captions and cache the image features
    caption_vector = preprocess_and_tokenize_captions(train_captions, top_k)
    caching_image_features_extracted_from_InceptionV3(img_name_vector, cache_image_dir=cache_image_dir)
    # Split into train/validation sets and save
    split_and_save_raw_image_path_and_caption_content(img_name_vector, caption_vector, test_size=test_size,
                                                      raw_image_path_and_caption_content=raw_image_path_and_caption_content)
if __name__ == "__main__":
    download_folder = "/home/b418a/disk1/pycharm_room/yuanxiao/my_lenovo_P50s/Image_captioning"
    raw_image_path_and_caption_content_dir = "raw_image_path_and_caption_content"
    cache_image_dir = "cache_image"
    main(download_folder=download_folder, num_examples=10000, top_k=3000, test_size=0.2,
         raw_image_path_and_caption_content=raw_image_path_and_caption_content_dir,
         cache_image_dir=cache_image_dir)