-
Notifications
You must be signed in to change notification settings - Fork 10
/
vis_lmdb.py
68 lines (62 loc) · 2.58 KB
/
vis_lmdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Paper: "UTRNet: High-Resolution Urdu Text Recognition In Printed Documents" presented at ICDAR 2023
Authors: Abdur Rahman, Arjun Ghosh, Chetan Arora
GitHub Repository: https://github.com/abdur75648/UTRNet-High-Resolution-Urdu-Text-Recognition
Project Website: https://abdur75648.github.io/UTRNet/
Copyright (c) 2023-present: This work is licensed under the Creative Commons Attribution-NonCommercial
4.0 International License (http://creativecommons.org/licenses/by-nc/4.0/)
"""
import lmdb, six, random
from PIL import Image
from tqdm import tqdm
data_dir = "lmdb_ihtr/malayalam/train/real/1L_synth"
env = lmdb.open(data_dir, max_readers=32, readonly=True, lock=False, readahead=False, meminit=False)
with env.begin(write=False) as txn:
nSamples = int(txn.get('num-samples'.encode()))
print("Number of samples: ",nSamples)
def get_item(index):
assert index <= nSamples, 'index range error'
with env.begin(write=False) as txn:
# Label
label_key = 'label-%09d'.encode() % index
#print("Label key: ",txn.get(label_key))
label = txn.get(label_key).decode('utf-8')
# Image
img_key = 'image-%09d'.encode() % index
imgbuf = txn.get(img_key)
buf = six.BytesIO()
buf.write(imgbuf)
buf.seek(0)
img = Image.open(buf).convert('RGB')
return (img, label)
for i in range(10):
idx = random.randint(1,nSamples+1)
img, label = get_item(idx)
img.save(str(idx)+".jpg")
with open("samples.txt","a",encoding="utf-8") as f:
f.write(str(idx)+".jpg" + "\t" + label + "\n")
# calculate mean label_length and mean aspected ratio of the samples & also draw histograms of label length and aspected ratio
# mean_label_length = 0
# mean_aspected_ratio = 0
# label_lengths = []
# aspected_ratios = []
# for i in tqdm(range(1,nSamples)):
# img, label = get_item(i)
# mean_label_length += len(label)
# mean_aspected_ratio += img.size[0] / img.size[1]
# label_lengths.append(len(label))
# aspected_ratios.append(img.size[0] / img.size[1])
# mean_label_length /= nSamples
# mean_aspected_ratio /= nSamples
# print("Mean label length: ",mean_label_length)
# print("Max label length: ",max(label_lengths))
# print("Mean aspected ratio: ",mean_aspected_ratio)
# print("Max aspected ratio: ",max(aspected_ratios))
# import matplotlib.pyplot as plt
# plt.hist(label_lengths, bins=20)
# plt.title("Label length histogram")
# plt.savefig("label_length_histogram.png")
# import matplotlib.pyplot as plt
# plt.hist(aspected_ratios, bins=20)
# plt.title("Aspected ratio histogram")
# plt.savefig("aspected_ratio_histogram.png")