Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add the detext to beta dataset preparer #1874

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions configs/textdet/_base_/datasets/detext.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As dataset preparer will generate this config, we can remove it from this PR. Same for `configs/textrecog/_base_/datasets/detext.py` and `configs/textspotting/_base_/datasets/detext.py`.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Root directory of the prepared DeTEXT detection data.
detext_textdet_data_root = 'data/detext'

# Training split. filter_cfg drops samples with no ground truth and
# images whose shorter side is below 32 px.
detext_textdet_train = {
    'type': 'OCRDataset',
    'data_root': detext_textdet_data_root,
    'ann_file': 'textdet_train.json',
    'filter_cfg': {
        'filter_empty_gt': True,
        'min_size': 32,
    },
    'pipeline': None,
}

# Test split; no filtering, evaluated in test mode.
detext_textdet_test = {
    'type': 'OCRDataset',
    'data_root': detext_textdet_data_root,
    'ann_file': 'textdet_test.json',
    'test_mode': True,
    'pipeline': None,
}
14 changes: 14 additions & 0 deletions configs/textrecog/_base_/datasets/detext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Root directory of the prepared DeTEXT recognition data.
detext_textrecog_data_root = 'data/detext'

# Training split for text recognition.
detext_textrecog_train = {
    'type': 'OCRDataset',
    'data_root': detext_textrecog_data_root,
    'ann_file': 'textrecog_train.json',
    'pipeline': None,
}

# Test split; evaluated in test mode.
detext_textrecog_test = {
    'type': 'OCRDataset',
    'data_root': detext_textrecog_data_root,
    'ann_file': 'textrecog_test.json',
    'test_mode': True,
    'pipeline': None,
}
15 changes: 15 additions & 0 deletions configs/textspotting/_base_/datasets/detext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Root directory of the prepared DeTEXT text-spotting data.
detext_textspotting_data_root = 'data/detext'

# Training split. filter_cfg drops samples with no ground truth and
# images whose shorter side is below 32 px.
detext_textspotting_train = {
    'type': 'OCRDataset',
    'data_root': detext_textspotting_data_root,
    'ann_file': 'textspotting_train.json',
    'filter_cfg': {
        'filter_empty_gt': True,
        'min_size': 32,
    },
    'pipeline': None,
}

# Test split; no filtering, evaluated in test mode.
detext_textspotting_test = {
    'type': 'OCRDataset',
    'data_root': detext_textspotting_data_root,
    'ann_file': 'textspotting_test.json',
    'test_mode': True,
    'pipeline': None,
}
31 changes: 31 additions & 0 deletions dataset_zoo/detext/metafile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Name: 'DETEXT'
Paper:
  Title: A Database for Evaluating Text Extraction from Biomedical Literature Figures
  # NOTE: the paper was published in PLoS ONE (see the DOI in the BibTeX
  # below); the previous URL/venue pointed at an unrelated ESWA paper.
  URL: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126200
  Venue: PLoS ONE
  Year: '2015'
  BibTeX: '@article{article,
    author = {Yin, Xu-Cheng and Yang, Chun and Pei, Wei-Yi and Man, Haixia and Zhang, Jun and Learned-Miller, Erik and Yu, Hong},
    year = {2015},
    month = {05},
    pages = {e0126200},
    title = {DeTEXT: A Database for Evaluating Text Extraction from Biomedical Literature Figures},
    volume = {10},
    journal = {PloS one},
    doi = {10.1371/journal.pone.0126200}}'
Data:
  Website: https://rrc.cvc.uab.es/?ch=9
  Language:
    - English
  Scene:
    - biomedical
  Granularity:
    - Word
  Tasks:
    - textrecog
    - textdet
    - textspotting
  License:
    Type: CC BY 1.0
    Link: https://creativecommons.org/licenses/by/1.0/
  Format: .txt
63 changes: 63 additions & 0 deletions dataset_zoo/detext/textdet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Dataset preparer config for DeTEXT text detection.
# Data is downloaded from the ICDAR RRC Challenge 9 site (rrc.cvc.uab.es).
data_root = 'data/detext'
cache_path = 'data/cache'

# Training split: download images + txt ground truth, pair them by file
# name, parse the txt annotations and dump them as json.
train_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url=('https://rrc.cvc.uab.es/downloads/'
                     'ch9_training_images.zip'),
                save_name='detext_textdet_train_img.zip',
                md5='e07161d6af1ef2f81f9ba0d2f904e377',
                content=['image'],
                mapping=[['detext_textdet_train_img', 'textdet_imgs/train']],
            ),
            dict(
                url=('https://rrc.cvc.uab.es/downloads/'
                     'ch9_training_localization_transcription_gt.zip'),
                save_name='detext_textdet_train_gt.zip',
                md5='ae4dfe155e61dcfeadd80f6b0fd15626',
                content=['annotation'],
                mapping=[['detext_textdet_train_gt', 'annotations/train']],
            ),
        ],
    ),
    gatherer=dict(
        type='PairGatherer',
        img_suffixes=['.jpg'],
        rule=[r'(\w+)\.jpg', r'gt_\1.txt'],
    ),
    parser=dict(type='DetextDetAnnParser', encoding='utf-8-sig'),
    packer=dict(type='TextDetPacker'),
    dumper=dict(type='JsonDumper'),
)

# Test split mirrors the training one. The validation set also contains
# upper-case '.JPG' files, hence the extra suffix and the case-tolerant
# pairing rule.
test_preparer = dict(
    obtainer=dict(
        type='NaiveDataObtainer',
        cache_path=cache_path,
        files=[
            dict(
                url=('https://rrc.cvc.uab.es/downloads/'
                     'ch9_validation_images.zip'),
                save_name='detext_textdet_test_img.zip',
                md5='c6ffe0abe6f2d7b4d70e6883257308e0',
                content=['image'],
                mapping=[['detext_textdet_test_img', 'textdet_imgs/test']],
            ),
            dict(
                url=('https://rrc.cvc.uab.es/downloads/'
                     'ch9_validation_localization_transcription_gt.zip'),
                save_name='detext_textdet_test_gt.zip',
                md5='075c4b27ab2848c90ad5e87d9f922bc3',
                content=['annotation'],
                mapping=[['detext_textdet_test_gt', 'annotations/test']],
            ),
        ],
    ),
    gatherer=dict(
        type='PairGatherer',
        img_suffixes=['.jpg', '.JPG'],
        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'],
    ),
    parser=dict(type='DetextDetAnnParser', encoding='utf-8-sig'),
    packer=dict(type='TextDetPacker'),
    dumper=dict(type='JsonDumper'),
)

# Intermediate directories removed after preparation finishes.
delete = ['detext_textdet_train_img', 'annotations', 'detext_textdet_test_img']
config_generator = dict(type='TextDetConfigGenerator')
9 changes: 9 additions & 0 deletions dataset_zoo/detext/textrecog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Dataset preparer config for DeTEXT text recognition.
# Inherits the download/gather/parse pipeline from the detection config.
_base_ = ['textdet.py']

# Reuse the images already gathered under the detection image folders.
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'

# Swap the packer so samples are packed in recognition format
# (name suggests instances are cropped from the det images — confirm).
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'

config_generator = dict(type='TextRecogConfigGenerator')
8 changes: 8 additions & 0 deletions dataset_zoo/detext/textspotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Dataset preparer config for DeTEXT text spotting.
# Inherits the download/gather/parse pipeline from the detection config.
_base_ = ['textdet.py']
# Reuse the images already gathered under the detection image folders.
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'

# Swap the packer so samples are packed in the text-spotting format.
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'

config_generator = dict(type='TextSpottingConfigGenerator')
3 changes: 2 additions & 1 deletion mmocr/datasets/preparers/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .base import BaseParser
from .coco_parser import COCOTextDetAnnParser
from .ctw1500_parser import CTW1500AnnParser
from .detext_parser import DetextDetAnnParser
from .funsd_parser import FUNSDTextDetAnnParser
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
ICDARTxtTextRecogAnnParser)
Expand All @@ -18,5 +19,5 @@
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser',
'SynthTextAnnParser', 'MJSynthAnnParser'
'SynthTextAnnParser', 'MJSynthAnnParser', 'DetextDetAnnParser'
]
68 changes: 68 additions & 0 deletions mmocr/datasets/preparers/parsers/detext_parser.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that DetextDetAnnParser is exactly the same as ICDARTxtTextDetAnnParser? In that case, this implementation can be removed, and we can reuse ICDARTxtTextDetAnnParser in DeTEXT's config.

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

from mmocr.registry import DATA_PARSERS
from mmocr.utils import bbox2poly
from .base import BaseParser


@DATA_PARSERS.register_module()
class DetextDetAnnParser(BaseParser):
    """DeTEXT txt-format text detection annotation parser.

    The original annotations of this dataset are stored in txt files,
    one instance per line, in the following format:

        x1, y1, x2, y2, x3, y3, x4, y4, transcription

    Args:
        separator (str): The separator between each element in a line.
            Defaults to ','.
        ignore (str): The text to be ignored. Defaults to '###'.
        format (str): The format of the annotation. Defaults to
            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8'.
        remove_strs (List[str], optional): Used to remove redundant strings
            in the transcription. Defaults to None.
        mode (str, optional): The mode of the box converter. Supported modes
            are 'xywh' and 'xyxy'. Defaults to None, which means the
            coordinates are already a polygon and are used as-is.
        **kwargs: Keyword arguments passed to :class:`BaseParser`, e.g.
            ``nproc`` for the number of parsing processes.
    """

    def __init__(self,
                 separator: str = ',',
                 ignore: str = '###',
                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
                 encoding: str = 'utf-8',
                 remove_strs: Optional[List[str]] = None,
                 mode: Optional[str] = None,
                 **kwargs) -> None:
        self.sep = separator
        self.format = format
        self.encoding = encoding
        self.ignore = ignore
        self.mode = mode
        self.remove_strs = remove_strs
        super().__init__(**kwargs)

    def parse_file(self, img_path: str, ann_path: str) -> Tuple:
        """Parse all annotations of a single image.

        Args:
            img_path (str): Path to the image file.
            ann_path (str): Path to the txt annotation file.

        Returns:
            Tuple: ``(img_path, instances)``, where ``instances`` is a list
            of dicts with keys ``poly``, ``text`` and ``ignore``.
        """
        instances = list()
        for anno in self.loader(ann_path, self.sep, self.format,
                                self.encoding):
            anno = list(anno.values())
            # Strip redundant substrings (e.g. stray quotes) from every
            # field before converting coordinates.
            if self.remove_strs is not None:
                for strs in self.remove_strs:
                    for i in range(len(anno)):
                        if strs in anno[i]:
                            anno[i] = anno[i].replace(strs, '')
            # All fields except the last one are polygon coordinates;
            # the last field is the transcription.
            poly = list(map(float, anno[0:-1]))
            if self.mode is not None:
                # Convert an 'xywh'/'xyxy' box into a quadrilateral.
                poly = bbox2poly(poly, self.mode)
                poly = poly.tolist()
            text = anno[-1]
            instances.append(
                dict(poly=poly, text=text, ignore=text == self.ignore))

        return img_path, instances