From a1a3c97703734a81e56689eed777dfba0fd8ed68 Mon Sep 17 00:00:00 2001
From: gaotongxiao <gaotongxiao@gmail.com>
Date: Fri, 24 Mar 2023 20:11:27 +0800
Subject: [PATCH 1/2] [Feature] MLT Dataset Preparer

---
 dataset_zoo/mlt/metafile.yml                  |  39 ++++++
 dataset_zoo/mlt/sample_anno.md                |  20 +++
 dataset_zoo/mlt/textdet.py                    | 114 ++++++++++++++++++
 dataset_zoo/mlt/textrecog.py                  |  82 +++++++++++++
 dataset_zoo/mlt/textspotting.py               |   9 ++
 .../obtainers/naive_data_obtainer.py          |  24 ++--
 .../preparers/parsers/icdar_txt_parser.py     |   7 +-
 .../test_naive_data_obtainer.py               |  71 +++++++++++
 8 files changed, 358 insertions(+), 8 deletions(-)
 create mode 100644 dataset_zoo/mlt/metafile.yml
 create mode 100644 dataset_zoo/mlt/sample_anno.md
 create mode 100644 dataset_zoo/mlt/textdet.py
 create mode 100644 dataset_zoo/mlt/textrecog.py
 create mode 100644 dataset_zoo/mlt/textspotting.py
 create mode 100644 tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py

diff --git a/dataset_zoo/mlt/metafile.yml b/dataset_zoo/mlt/metafile.yml
new file mode 100644
index 000000000..f25256b3c
--- /dev/null
+++ b/dataset_zoo/mlt/metafile.yml
@@ -0,0 +1,39 @@
+Name: 'MLT 2017 (ICDAR 2017)'
+Paper:
+  Title: ICDAR2017 Robust Reading Challenge on Multi-Lingual Scene Text Detection and Script Identification - RRC-MLT
+  URL: https://ieeexplore.ieee.org/document/8270168
+  Venue: ICDAR
+  Year: '2017'
+  BibTeX: '@INPROCEEDINGS{8270168,
+  author={Nayef, Nibal and Yin, Fei and Bizid, Imen and Choi, Hyunsoo and Feng, Yuan and Karatzas, Dimosthenis and Luo, Zhenbo and Pal, Umapada and Rigaud, Christophe and Chazalon, Joseph and Khlif, Wafa and Luqman, Muhammad Muzzamil and Burie, Jean-Christophe and Liu, Cheng-lin and Ogier, Jean-Marc},
+  booktitle={2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR)},
+  title={ICDAR2017 Robust Reading Challenge on Multi-Lingual Scene Text Detection and Script Identification - RRC-MLT},
+  year={2017},
+  volume={01},
+  number={},
+  pages={1454-1459},
+  doi={10.1109/ICDAR.2017.237}}'
+Data:
+  Website: https://rrc.cvc.uab.es/?ch=8
+  Language:
+    - Arabic
+    - English
+    - Chinese
+    - Japanese
+    - Korean
+    - Italian
+    - German
+    - Bangla
+    - French
+  Scene:
+    - Natural Scene
+  Granularity:
+    - Word
+  Tasks:
+    - textdet
+    - textrecog
+    - textspotting
+  License:
+    Type: CC BY 4.0
+    Link: https://creativecommons.org/licenses/by/4.0/
+  Format: .txt
diff --git a/dataset_zoo/mlt/sample_anno.md b/dataset_zoo/mlt/sample_anno.md
new file mode 100644
index 000000000..93caf33d7
--- /dev/null
+++ b/dataset_zoo/mlt/sample_anno.md
@@ -0,0 +1,20 @@
+**Text Detection, Text Spotting**
+
+```text
+# x1,y1,x2,y2,x3,y3,x4,y4,script,text
+# Valid scripts are: "Arabic", "Latin", "Chinese", "Japanese", "Korean", "Bangla", "Symbols", "Mixed", "None"
+
+131,34,181,34,179,47,131,49,Latin,Castle
+150,59,194,58,196,72,150,73,Arabic,متحف
+90,83,143,83,143,96,91,96,Latin,Heritage
+146,81,200,80,201,93,147,94,Latin,Museum
+```
+
+**Text Recognition**
+
+```text
+# img_name,script,text
+
+word_4.png,Arabic,المكرمة
+word_5.png,Latin,MAKKA
+```
diff --git a/dataset_zoo/mlt/textdet.py b/dataset_zoo/mlt/textdet.py
new file mode 100644
index 000000000..a436ae010
--- /dev/null
+++ b/dataset_zoo/mlt/textdet.py
@@ -0,0 +1,114 @@
+data_root = 'data/mlt'
+cache_path = 'data/cache'
+# yapf: disable
+train_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_1.zip',  # noqa: E501
+                save_name='mlt_1.zip',
+                md5='7b26e10d949c00fb4411f40b4f1fce6e',
+                content=['image'],
+                mapping=[['mlt_1/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_2.zip',  # noqa: E501
+                save_name='mlt_2.zip',
+                md5='e992fb5a7621dd6329081a73e52a28e1',
+                content=['image'],
+                mapping=[['mlt_2/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_3.zip',  # noqa: E501
+                save_name='mlt_3.zip',
+                md5='044ea5fb1dcec8bbb874391c517b55ff',
+                content=['image'],
+                mapping=[['mlt_3/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_4.zip',  # noqa: E501
+                save_name='mlt_4.zip',
+                md5='344a657c1cc7cbb150547f1c76b5cc8e',
+                content=['image'],
+                mapping=[['mlt_4/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_5.zip',  # noqa: E501
+                save_name='mlt_5.zip',
+                md5='5c7ac0158e7189c0a634eaf7bdededc5',
+                content=['image'],
+                mapping=[['mlt_5/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_6.zip',  # noqa: E501
+                save_name='mlt_6.zip',
+                md5='3b479255a96d255680f51005b5232bac',
+                content=['image'],
+                mapping=[['mlt_6/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_7.zip',  # noqa
+                save_name='mlt_7.zip',
+                md5='faa033fb9d2922d747bad9b0692c992e',
+                content=['image'],
+                mapping=[['mlt_7/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_8.zip',  # noqa
+                save_name='mlt_8.zip',
+                md5='db8afa59ae520757151f6ce5acd489ef',
+                content=['image'],
+                mapping=[['mlt_8/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_localization_transcription_gt_v2.zip',
+                save_name='mlt_train_gt.zip',
+                md5='2c9c3de30b5615f6846738bbd336c988',
+                content=['annotation'],
+                mapping=[['mlt_train_gt/', 'annotations/train']]),
+        ]),
+    gatherer=dict(
+        type='PairGatherer',
+        img_suffixes=['.jpg', '.JPG'],
+        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+    parser=dict(
+        type='ICDARTxtTextDetAnnParser',
+        encoding='utf-8-sig',
+        format='x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'),
+    packer=dict(type='TextDetPacker'),
+    dumper=dict(type='JsonDumper'),
+)  # noqa
+
+val_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://rrc.cvc.uab.es/downloads/ch8_validation_images.zip',  # noqa
+                save_name='mlt_val_img.zip',
+                md5='3cfc7b440ab81b89a981d707786dbe83',
+                content=['image'],
+                mapping=[['mlt_val_img', 'textdet_imgs/val']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_validation_localization_transcription_gt_v2.zip',
+                save_name='mlt_val_gt.zip',
+                md5='ecae7d433e6f103bb31e00d37254009c',
+                content=['annotation'],
+                mapping=[['mlt_val_gt/', 'annotations/val']]),
+        ]),
+    gatherer=dict(
+        type='PairGatherer',
+        img_suffixes=['.jpg', '.JPG'],
+        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+    parser=dict(
+        type='ICDARTxtTextDetAnnParser',
+        encoding='utf-8-sig',
+        format='x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'),
+    packer=dict(type='TextDetPacker'),
+    dumper=dict(type='JsonDumper'),
+)
+
+config_generator = dict(
+    type='TextDetConfigGenerator',
+    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')],
+    test_anns=None)
+
+delete = [f'mlt_{i}' for i in range(1, 9)
+          ] + ['annotations', 'mlt_val_gt', 'mlt_train_gt']
diff --git a/dataset_zoo/mlt/textrecog.py b/dataset_zoo/mlt/textrecog.py
new file mode 100644
index 000000000..46a347a38
--- /dev/null
+++ b/dataset_zoo/mlt/textrecog.py
@@ -0,0 +1,82 @@
+data_root = 'data/mlt'
+cache_path = 'data/cache'
+train_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_images_gt_part_1.zip',
+                save_name='mlt_rec_1.zip',
+                md5='714d899cf5c8cf23b73bc14cfb628a3a',
+                content=['image'],
+                mapping=[['mlt_rec_1/*', 'textrecog_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_images_gt_part_2.zip',
+                save_name='mlt_rec_2.zip',
+                md5='d0e5bc4736626853203d24c70bbf56d1',
+                content=['image'],
+                mapping=[['mlt_rec_2/*', 'textrecog_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_images_gt_part_3.zip',
+                save_name='mlt_rec_3.zip',
+                md5='ebc7f2c9e73c3d174437d43b03177c5c',
+                content=['image'],
+                mapping=[['mlt_rec_3/*', 'textrecog_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_gt_v2.zip',
+                save_name='mlt_rec_train_gt.zip',
+                md5='e5e681b440a616f0ac8deaa669b3682d',
+                content=['annotation'],
+                mapping=[['mlt_rec_train_gt/', 'annotations/train']]),
+        ]),
+    gatherer=dict(type='MonoGatherer', ann_name='train/gt.txt'),
+    parser=dict(
+        type='ICDARTxtTextRecogAnnParser',
+        encoding='utf-8-sig',
+        format='img,lang,text'),
+    packer=dict(type='TextRecogPacker'),
+    dumper=dict(type='JsonDumper'),
+)
+
+val_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://rrc.cvc.uab.es/downloads/'
+                'ch8_validation_word_images_gt.zip',
+                save_name='mlt_rec_val.zip',
+                md5='',
+                content=['image'],
+                mapping=[['mlt_rec_val/*', 'textrecog_imgs/val']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_validation_word_gt_v2.zip',
+                save_name='mlt_rec_val_gt.zip',
+                md5='951c9cee78a0064b133ab59369a9b232',
+                content=['annotation'],
+                mapping=[['mlt_rec_val_gt/', 'annotations/val']]),
+        ]),
+    gatherer=dict(type='MonoGatherer', ann_name='val/gt.txt'),
+    parser=dict(
+        type='ICDARTxtTextRecogAnnParser',
+        encoding='utf-8-sig',
+        format='img,lang,text'),
+    packer=dict(type='TextRecogPacker'),
+    dumper=dict(type='JsonDumper'),
+)
+
+config_generator = dict(
+    type='TextRecogConfigGenerator',
+    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')],
+    test_anns=None)
+
+delete = [f'mlt_rec_{i}' for i in range(1, 4)] + [
+    'annotations', 'mlt_rec_val_gt', 'mlt_rec_train_gt', 'mlt_rec_val'
+]
diff --git a/dataset_zoo/mlt/textspotting.py b/dataset_zoo/mlt/textspotting.py
new file mode 100644
index 000000000..dd6c91126
--- /dev/null
+++ b/dataset_zoo/mlt/textspotting.py
@@ -0,0 +1,13 @@
+_base_ = ['textdet.py']
+
+_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
+_base_.train_preparer.packer.type = 'TextSpottingPacker'
+
+_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val'
+_base_.val_preparer.packer.type = 'TextSpottingPacker'
+
+# Override val_anns explicitly; otherwise the value from textdet.py
+# ('textdet_val.json') is inherited through _base_.
+config_generator = dict(
+    type='TextSpottingConfigGenerator',
+    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
diff --git a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
index 51b0d266c..d5bd28ad8 100644
--- a/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
+++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
@@ -177,22 +177,32 @@ def move(self, mapping: List[Tuple[str, str]]) -> None:
 
         Args:
             mapping (List[Tuple[str, str]]): A list of tuples, each
-            tuple contains the source file name and the destination file name.
+                tuple contains the source file name and the destination file
+                name.
         """
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
 
             if '*' in src:
+                # dst must be a directory
                 mkdir_or_exist(dst)
                 for f in glob.glob(src):
-                    if not osp.exists(
-                            osp.join(dst, osp.relpath(f, self.data_root))):
+                    tgt = osp.join(dst, osp.basename(osp.normpath(f)))
+                    if not osp.exists(tgt):
                         shutil.move(f, dst)
-
-            elif osp.exists(src) and not osp.exists(dst):
-                mkdir_or_exist(osp.dirname(dst))
-                shutil.move(src, dst)
+                    else:
+                        print(f'Skipping moving {f} to {dst} since'
+                              f' {f} does not exist or {tgt} already exists')
+            # If no wildcard in src, dst must match the src type
+            # That is, we can only move a file to a file, or a dir to a dir
+            else:
+                if osp.exists(src) and not osp.exists(dst):
+                    mkdir_or_exist(osp.dirname(dst))
+                    shutil.move(src, dst)
+                else:
+                    print(f'Skipping moving {src} to {dst} since'
+                          f' {src} does not exist or {dst} already exists')
 
     def clean(self) -> None:
         """Remove empty dirs."""
diff --git a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
index e90d5d7b9..4fa1d9b9f 100644
--- a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
+++ b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
@@ -20,7 +20,8 @@ class ICDARTxtTextDetAnnParser(BaseParser):
             to ','.
         ignore (str): The text to be ignored. Defaults to '###'.
         format (str): The format of the annotation. Defaults to
-            'x1,y1,x2,y2,x3,y3,x4,trans'.
+            'x1,y1,x2,y2,x3,y3,x4,trans'. An additional keyword "lang" can be
+            recognized here to specify the language of the transcription.
         encoding (str): The encoding of the annotation file. Defaults to
             'utf-8-sig'.
         nproc (int): The number of processes to parse the annotation. Defaults
@@ -52,6 +53,8 @@ def parse_file(self, img_path: str, ann_path: str) -> Tuple:
         instances = list()
         for anno in self.loader(ann_path, self.sep, self.format,
                                 self.encoding):
+            if 'lang' in anno:
+                del anno['lang']
             anno = list(anno.values())
             if self.remove_strs is not None:
                 for strs in self.remove_strs:
@@ -82,6 +85,8 @@ class ICDARTxtTextRecogAnnParser(BaseParser):
             to ','.
         ignore (str): The text to be ignored. Defaults to '#'.
         format (str): The format of the annotation. Defaults to 'img, text'.
+            An additional keyword "lang" can be recognized here to specify the
+            language of the transcription.
         encoding (str): The encoding of the annotation file. Defaults to
             'utf-8-sig'.
         nproc (int): The number of processes to parse the annotation. Defaults
diff --git a/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py b/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py
new file mode 100644
index 000000000..dd876e39f
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import tempfile
+import unittest
+
+from mmocr.datasets.preparers.obtainers import NaiveDataObtainer
+
+
+class TestNaiveDataObtainer(unittest.TestCase):
+
+    def setUp(self) -> None:
+        """Create temporary directories and files for testing."""
+        self.tmp_dir = tempfile.TemporaryDirectory()
+        self.cache_path = osp.join(self.tmp_dir.name, 'cache')
+        self.data_root = osp.join(self.tmp_dir.name, 'data')
+        self.obtainer = NaiveDataObtainer([], self.cache_path, self.data_root,
+                                          'test')
+
+    def tearDown(self) -> None:
+        """Delete temporary directories and files used for testing."""
+        self.tmp_dir.cleanup()
+
+    def test_move(self):
+        """Check move() with plain file/dir paths and wildcard sources."""
+        test_src = os.path.join(self.data_root, 'src')
+        test_dst = os.path.join(self.data_root, 'dst')
+        os.makedirs(test_src, exist_ok=True)
+        os.makedirs(test_dst, exist_ok=True)
+        # Create some test files/folders in src directory
+        for i in range(3):
+            with open(os.path.join(test_src, f'file{i}.txt'), 'w') as f:
+                f.write('hello world\n')
+            os.mkdir(os.path.join(test_src, f'dir{i}'))
+
+        # Test moving file/dir
+        mapping = [
+            ('src/file0.txt', 'dst/file0_new.txt'),  # dst/file0_new.txt
+            ('src/file1.txt', 'dst/abc/abc.txt'),  # dst/abc/abc.txt
+            ('src/file2.txt', 'dst/'),  # Not allowed
+            ('src/dir0/', 'dst/dir0'),  # dst/dir0
+            ('src/dir1', 'dst/abc/d2/'),  # dst/abc/d2
+            ('src/dir2', 'dst/'),  # not allowed
+        ]
+        self.obtainer.move(mapping)
+
+        mapping[2] = ['src/file2.txt', 'dst/file2.txt']
+        mapping[5] = ['src/dir2', 'dst/dir2']
+        mapping = [[osp.join(self.data_root, a),
+                    osp.join(self.data_root, b)] for a, b in mapping]
+        mapping[2] = mapping[2][::-1]
+        mapping[5] = mapping[5][::-1]
+        for a, b in mapping:
+            self.assertFalse(os.path.exists(a))
+            self.assertTrue(os.path.exists(b))
+
+        # Test moving paths with wildcard
+        mapping = [
+            ('src/*.txt', 'dst/test2'),  # dst/test2/file2.txt
+            ('src/*', 'dst/test2/file2.txt'),  # not allowed (file2.txt exists)
+            ('src/*', 'dst/test2'),  # dst/test2/dir2
+        ]
+        self.obtainer.move(mapping)
+
+        # Entries are single destination paths here, not (src, dst) pairs
+        moved = [
+            osp.join(self.data_root, p)
+            for p in ['dst/test2/file2.txt', 'dst/test2/dir2']
+        ]
+        for path in moved:
+            self.assertTrue(os.path.exists(path))

From 060b1b799d19df2d595e669ed3652ebf7c65b61d Mon Sep 17 00:00:00 2001
From: gaotongxiao <gaotongxiao@gmail.com>
Date: Tue, 28 Mar 2023 14:47:24 +0800
Subject: [PATCH 2/2] update md5

---
 dataset_zoo/mlt/textrecog.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataset_zoo/mlt/textrecog.py b/dataset_zoo/mlt/textrecog.py
index 46a347a38..8908938be 100644
--- a/dataset_zoo/mlt/textrecog.py
+++ b/dataset_zoo/mlt/textrecog.py
@@ -52,7 +52,7 @@
                 url='https://rrc.cvc.uab.es/downloads/'
                 'ch8_validation_word_images_gt.zip',
                 save_name='mlt_rec_val.zip',
-                md5='',
+                md5='954acd0325c442288fa4aff1009b6d79',
                 content=['image'],
                 mapping=[['mlt_rec_val/*', 'textrecog_imgs/val']]),
             dict(