[Feature] MLT Dataset Preparer #1807

Open · wants to merge 2 commits into base: dev-1.x
Changes from all commits
39 changes: 39 additions & 0 deletions dataset_zoo/mlt/metafile.yml
@@ -0,0 +1,39 @@
Name: 'MLT 2017 (ICDAR 2017)'
Paper:
  Title: ICDAR2017 Robust Reading Challenge on Multi-Lingual Scene Text Detection and Script Identification - RRC-MLT
  URL: https://ieeexplore.ieee.org/document/8270168
  Venue: ICDAR
  Year: '2017'
  BibTeX: '@INPROCEEDINGS{8270168,
    author={Nayef, Nibal and Yin, Fei and Bizid, Imen and Choi, Hyunsoo and Feng, Yuan and Karatzas, Dimosthenis and Luo, Zhenbo and Pal, Umapada and Rigaud, Christophe and Chazalon, Joseph and Khlif, Wafa and Luqman, Muhammad Muzzamil and Burie, Jean-Christophe and Liu, Cheng-lin and Ogier, Jean-Marc},
    booktitle={2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR)},
    title={ICDAR2017 Robust Reading Challenge on Multi-Lingual Scene Text Detection and Script Identification - RRC-MLT},
    year={2017},
    volume={01},
    number={},
    pages={1454-1459},
    doi={10.1109/ICDAR.2017.237}}'
Data:
  Website: https://rrc.cvc.uab.es/?ch=8
  Language:
    - Arabic
    - English
    - Chinese
    - Japanese
    - Korean
    - Italian
    - German
    - Indian
    - French
  Scene:
    - Natural Scene
  Granularity:
    - Word
  Tasks:
    - textdet
    - textrecog
    - textspotting
  License:
    Type: CC BY 4.0
    Link: https://creativecommons.org/licenses/by/4.0/
  Format: .txt
20 changes: 20 additions & 0 deletions dataset_zoo/mlt/sample_anno.md
@@ -0,0 +1,20 @@
**Text Detection, Text Spotting**

```text
# x1,y1,x2,y2,x3,y3,x4,y4,script,text
# Valid scripts are: "Arabic", "Latin", "Chinese", "Japanese", "Korean", "Bangla", "Symbols", "Mixed", "None"

131,34,181,34,179,47,131,49,Latin,Castle
150,59,194,58,196,72,150,73,Arabic,متحف
90,83,143,83,143,96,91,96,Latin,Heritage
146,81,200,80,201,93,147,94,Latin,Museum
```

**Text Recognition**

```text
# img_name,script,text

word_4.png,Arabic,المكرمة
word_5.png,Latin,MAKKA
```
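
For readers unfamiliar with the layout above, here is a minimal parsing sketch (not part of this PR; the prepared pipeline relies on `ICDARTxtTextDetAnnParser` instead) showing how one detection/spotting line decomposes into eight polygon coordinates, a script label, and a transcription:

```python
# Minimal sketch (not part of this PR): manually parsing one MLT detection
# line in the "x1,y1,x2,y2,x3,y3,x4,y4,script,text" layout shown above.

def parse_mlt_det_line(line: str) -> dict:
    # The transcription may itself contain commas, so only the first nine
    # commas are treated as separators; the remainder is the text field.
    parts = line.strip().split(',', 9)
    coords = [int(v) for v in parts[:8]]  # x1,y1,...,x4,y4
    script, text = parts[8], parts[9]     # e.g. 'Latin', 'Castle'
    return dict(polygon=coords, script=script, text=text)


print(parse_mlt_det_line('131,34,181,34,179,47,131,49,Latin,Castle'))
# {'polygon': [131, 34, 181, 34, 179, 47, 131, 49], 'script': 'Latin', 'text': 'Castle'}
```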
114 changes: 114 additions & 0 deletions dataset_zoo/mlt/textdet.py
@@ -0,0 +1,114 @@
data_root = 'data/mlt'
cache_path = 'data/cache'
# yapf: disable
train_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_1.zip',  # noqa: E501
                save_name='mlt_1.zip',
                md5='7b26e10d949c00fb4411f40b4f1fce6e',
                content=['image'],
                mapping=[['mlt_1/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_2.zip',  # noqa: E501
                save_name='mlt_2.zip',
                md5='e992fb5a7621dd6329081a73e52a28e1',
                content=['image'],
                mapping=[['mlt_2/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_3.zip',  # noqa: E501
                save_name='mlt_3.zip',
                md5='044ea5fb1dcec8bbb874391c517b55ff',
                content=['image'],
                mapping=[['mlt_3/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_4.zip',  # noqa: E501
                save_name='mlt_4.zip',
                md5='344a657c1cc7cbb150547f1c76b5cc8e',
                content=['image'],
                mapping=[['mlt_4/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_5.zip',  # noqa: E501
                save_name='mlt_5.zip',
                md5='5c7ac0158e7189c0a634eaf7bdededc5',
                content=['image'],
                mapping=[['mlt_5/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_6.zip',  # noqa: E501
                save_name='mlt_6.zip',
                md5='3b479255a96d255680f51005b5232bac',
                content=['image'],
                mapping=[['mlt_6/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_7.zip',  # noqa
                save_name='mlt_7.zip',
                md5='faa033fb9d2922d747bad9b0692c992e',
                content=['image'],
                mapping=[['mlt_7/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_8.zip',  # noqa
                save_name='mlt_8.zip',
                md5='db8afa59ae520757151f6ce5acd489ef',
                content=['image'],
                mapping=[['mlt_8/*', 'textdet_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/'
                'ch8_training_localization_transcription_gt_v2.zip',
                save_name='mlt_train_gt.zip',
                md5='2c9c3de30b5615f6846738bbd336c988',
                content=['annotation'],
                mapping=[['mlt_train_gt/', 'annotations/train']]),
        ]),
    gatherer=dict(
        type='PairGatherer',
        img_suffixes=['.jpg', '.JPG'],
        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
    parser=dict(
        type='ICDARTxtTextDetAnnParser',
        encoding='utf-8-sig',
        format='x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'),
    packer=dict(type='TextDetPacker'),
    dumper=dict(type='JsonDumper'),
)  # noqa

val_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url='https://rrc.cvc.uab.es/downloads/ch8_validation_images.zip',  # noqa
                save_name='mlt_val_img.zip',
                md5='3cfc7b440ab81b89a981d707786dbe83',
                content=['image'],
                mapping=[['mlt_val_img', 'textdet_imgs/val']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/'
                'ch8_validation_localization_transcription_gt_v2.zip',
                save_name='mlt_val_gt.zip',
                md5='ecae7d433e6f103bb31e00d37254009c',
                content=['annotation'],
                mapping=[['mlt_val_gt/', 'annotations/val']]),
        ]),
    gatherer=dict(
        type='PairGatherer',
        img_suffixes=['.jpg', '.JPG'],
        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
    parser=dict(
        type='ICDARTxtTextDetAnnParser',
        encoding='utf-8-sig',
        format='x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'),
    packer=dict(type='TextDetPacker'),
    dumper=dict(type='JsonDumper'),
)

config_generator = dict(
    type='TextDetConfigGenerator',
    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')],
    test_anns=None)

delete = [f'mlt_{i}' for i in range(1, 9)
          ] + ['annotations', 'mlt_val_gt', 'mlt_train_gt']
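
One detail worth calling out in the config above is the `PairGatherer` rule. Assuming the gatherer pairs each image with its annotation file by applying the rule as a regex substitution (pattern, replacement), the mapping works out as in this small, illustrative-only sketch:

```python
# Illustrative sketch (not part of this PR): how the PairGatherer rule above
# could map an image file name to its annotation file name, assuming the rule
# is applied as a regex substitution of the form (pattern, replacement).
import re

rule = [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']

for img_name in ['img_1032.jpg', 'img_87.JPG']:
    ann_name = re.sub(rule[0], rule[1], img_name)
    print(f'{img_name} -> {ann_name}')
# img_1032.jpg -> gt_img_1032.txt
# img_87.JPG -> gt_img_87.txt
```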
82 changes: 82 additions & 0 deletions dataset_zoo/mlt/textrecog.py
@@ -0,0 +1,82 @@
data_root = 'data/mlt'
cache_path = 'data/cache'
train_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url='https://datasets.cvc.uab.es/rrc/'
                'ch8_training_word_images_gt_part_1.zip',
                save_name='mlt_rec_1.zip',
                md5='714d899cf5c8cf23b73bc14cfb628a3a',
                content=['image'],
                mapping=[['mlt_rec_1/*', 'textrecog_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/'
                'ch8_training_word_images_gt_part_2.zip',
                save_name='mlt_rec_2.zip',
                md5='d0e5bc4736626853203d24c70bbf56d1',
                content=['image'],
                mapping=[['mlt_rec_2/*', 'textrecog_imgs/train']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/'
                'ch8_training_word_images_gt_part_3.zip',
                save_name='mlt_rec_3.zip',
                md5='ebc7f2c9e73c3d174437d43b03177c5c',
                content=['image'],
                mapping=[['mlt_rec_3/*', 'textrecog_imgs/train']]),
            dict(
                url='https://rrc.cvc.uab.es/downloads/'
                'ch8_validation_word_images_gt.zip',
                save_name='mlt_rec_train_gt.zip',
                md5='e5e681b440a616f0ac8deaa669b3682d',
                content=['annotation'],
                mapping=[['mlt_rec_train_gt/', 'annotations/train']]),
        ]),
    gatherer=dict(type='MonoGatherer', ann_name='train/gt.txt'),
    parser=dict(
        type='ICDARTxtTextRecogAnnParser',
        encoding='utf-8-sig',
        format='img,lang,text'),
    packer=dict(type='TextRecogPacker'),
    dumper=dict(type='JsonDumper'),
)

val_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url='https://rrc.cvc.uab.es/downloads/'
                'ch8_validation_word_images_gt.zip',
                save_name='mlt_rec_val.zip',
                md5='954acd0325c442288fa4aff1009b6d79',
                content=['image'],
                mapping=[['mlt_rec_val/*', 'textrecog_imgs/val']]),
            dict(
                url='https://datasets.cvc.uab.es/rrc/'
                'ch8_validation_word_gt_v2.zip',
                save_name='mlt_rec_val_gt.zip',
                md5='951c9cee78a0064b133ab59369a9b232',
                content=['annotation'],
                mapping=[['mlt_rec_val_gt/', 'annotations/val']]),
        ]),
    gatherer=dict(type='MonoGatherer', ann_name='val/gt.txt'),
    parser=dict(
        type='ICDARTxtTextRecogAnnParser',
        encoding='utf-8-sig',
        format='img,lang,text'),
    packer=dict(type='TextRecogPacker'),
    dumper=dict(type='JsonDumper'),
)

config_generator = dict(
    type='TextRecogConfigGenerator',
    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')],
    test_anns=None)

delete = [f'mlt_rec_{i}' for i in range(1, 4)] + [
    'annotations', 'mlt_rec_val_gt', 'mlt_rec_train_gt', 'mlt_rec_val'
]
9 changes: 9 additions & 0 deletions dataset_zoo/mlt/textspotting.py
@@ -0,0 +1,9 @@
_base_ = ['textdet.py']

_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.train_preparer.packer.type = 'TextSpottingPacker'

_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val'
_base_.val_preparer.packer.type = 'TextSpottingPacker'

config_generator = dict(type='TextSpottingConfigGenerator')
24 changes: 17 additions & 7 deletions mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
@@ -177,22 +177,32 @@ def move(self, mapping: List[Tuple[str, str]]) -> None:
 
         Args:
             mapping (List[Tuple[str, str]]): A list of tuples, each
-                tuple contains the source file name and the destination file name.
+                tuple contains the source file name and the destination file
+                name.
         """
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
 
             if '*' in src:
+                # dst must be a directory
                 mkdir_or_exist(dst)
                 for f in glob.glob(src):
-                    if not osp.exists(
-                            osp.join(dst, osp.relpath(f, self.data_root))):
+                    tgt = osp.join(dst, osp.basename(osp.normpath(f)))
+                    if not osp.exists(tgt):
                         shutil.move(f, dst)
+                    else:
+                        print(f'Skipping moving {f} to {dst} since'
+                              f' {f} does not exist or {tgt} already exists')
 
-            elif osp.exists(src) and not osp.exists(dst):
-                mkdir_or_exist(osp.dirname(dst))
-                shutil.move(src, dst)
+            # If no wildcard in src, dst must match the src type
+            # That is, we can only move a file to a file, or a dir to a dir
+            else:
+                if osp.exists(src) and not osp.exists(dst):
+                    mkdir_or_exist(osp.dirname(dst))
+                    shutil.move(src, dst)
+                else:
+                    print(f'Skipping moving {src} to {dst} since'
+                          f' {src} does not exist or {dst} already exists')
 
     def clean(self) -> None:
         """Remove empty dirs."""
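
The substantive change in `move()` above is how the wildcard branch decides whether a file has already been moved: the target path is now derived from the source file's basename (which is where `shutil.move` actually places it) rather than from its path relative to `data_root`. A standalone sketch of the two computations, not part of this PR:

```python
# Standalone sketch (not part of this PR) contrasting the old and new target
# computation in the wildcard branch of move(). shutil.move(f, dst) places a
# file at dst/<basename of f>, which is what the new check tests for.
import os.path as osp

data_root = 'data/mlt'
dst = osp.join(data_root, 'textdet_imgs/train')
f = osp.join(data_root, 'mlt_1/img_1.jpg')  # one hit of glob('data/mlt/mlt_1/*')

old_tgt = osp.join(dst, osp.relpath(f, data_root))      # .../train/mlt_1/img_1.jpg (never created)
new_tgt = osp.join(dst, osp.basename(osp.normpath(f)))  # .../train/img_1.jpg (where the file really goes)
print(old_tgt, new_tgt, sep='\n')
```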
7 changes: 6 additions & 1 deletion mmocr/datasets/preparers/parsers/icdar_txt_parser.py
@@ -20,7 +20,8 @@ class ICDARTxtTextDetAnnParser(BaseParser):
             to ','.
         ignore (str): The text to be ignored. Defaults to '###'.
         format (str): The format of the annotation. Defaults to
-            'x1,y1,x2,y2,x3,y3,x4,trans'.
+            'x1,y1,x2,y2,x3,y3,x4,trans'. An additional keyword "lang" can be
+            recognized here to specify the language of the transcription.
         encoding (str): The encoding of the annotation file. Defaults to
             'utf-8-sig'.
         nproc (int): The number of processes to parse the annotation. Defaults
@@ -52,6 +53,8 @@ def parse_file(self, img_path: str, ann_path: str) -> Tuple:
         instances = list()
         for anno in self.loader(ann_path, self.sep, self.format,
                                 self.encoding):
+            if 'lang' in anno:
+                del anno['lang']
             anno = list(anno.values())
             if self.remove_strs is not None:
                 for strs in self.remove_strs:
@@ -82,6 +85,8 @@ class ICDARTxtTextRecogAnnParser(BaseParser):
             to ','.
         ignore (str): The text to be ignored. Defaults to '#'.
         format (str): The format of the annotation. Defaults to 'img, text'.
+            An additional keyword "lang" can be recognized here to specify the
+            language of the transcription.
         encoding (str): The encoding of the annotation file. Defaults to
             'utf-8-sig'.
         nproc (int): The number of processes to parse the annotation. Defaults
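
To make the new `lang` handling concrete: assuming the loader yields one dict per annotation line, keyed by the field names in the `format` string, the parser now drops the `lang` entry before converting the remaining values, as in this illustrative sketch (not part of this PR):

```python
# Illustrative sketch (not part of this PR) of the new 'lang' handling in
# ICDARTxtTextDetAnnParser.parse_file, assuming the loader yields one dict per
# annotation line keyed by the fields named in the format string.
fmt = 'x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'
line = '150,59,194,58,196,72,150,73,Arabic,متحف'

anno = dict(zip(fmt.split(','), line.split(',', fmt.count(','))))
if 'lang' in anno:  # new in this PR: the language tag is dropped
    del anno['lang']
print(list(anno.values()))  # ['150', '59', ..., '73', 'متحف']
```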