From 55ff03ccd093d4331527fad6309bcf0a33bb664a Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Wed, 14 Dec 2022 13:31:59 +0800 Subject: [PATCH 1/7] add one paper --- ...ing CTC training as iterative fitting.yaml | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml diff --git a/paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml b/paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml new file mode 100644 index 000000000..7b20a5b42 --- /dev/null +++ b/paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml @@ -0,0 +1,65 @@ +Title: 'Reinterpreting CTC training as iterative fitting' +Abbreviation: PerSec +Tasks: + - TextRecog +Venue: AAAI +Year: 2022 +Lab/Company: + - Tencent YouTu Lab + - University of Science and Technology of China +URL: 'https://www.sciencedirect.com/science/article/pii/S0031320320301953' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm + - Dataset +Abstract: 'We introduce Perceiving Stroke-Semantic Context (PerSec), a new +approach to self-supervised representation learning tailored for Scene Text +Recognition (STR) task. Considering scene text images carry both visual and +semantic properties, we equip our PerSec with dual context perceivers which +can contrast and learn latent representations from low-level stroke and +high-level semantic contextual spaces simultaneously via hierarchical +contrastive learning on unlabeled text image data. Experiments in un- and +semi-supervised learning settings on STR benchmarks demonstrate our +proposed framework can yield a more robust representation for both +CTC-based and attention-based decoders than other contrastive learning +methods. To fully investigate the potential of our method, we also +collect a dataset of 100 million unlabeled text images, named UTI-100M, +covering 5 scenes and 4 languages. By leveraging hundred-million-level +unlabeled data, our PerSec shows significant performance improvement +when fine-tuning the learned representation on the labeled data. +Furthermore, we observe that the representation learned by PerSec +presents great generalization, especially under few labeled data scenes.' 
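The hierarchical contrastive learning named in this abstract is not spelled out anywhere in the entry, so a rough illustration may help: an InfoNCE-style objective applied at two feature levels. This is a sketch only — PerSec's actual dual context perceivers and losses are defined in the paper, and info_nce, alpha, and the stroke/semantic tensors below are hypothetical names, not the authors' API.

import torch
import torch.nn.functional as F

def info_nce(queries, keys, temperature=0.07):
    # queries/keys: (N, D); row i of `keys` is the positive for row i of
    # `queries`, and every other row serves as a negative.
    q = F.normalize(queries, dim=1)
    k = F.normalize(keys, dim=1)
    logits = q @ k.t() / temperature            # (N, N) cosine similarities
    targets = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(logits, targets)     # positives on the diagonal

def hierarchical_contrastive_loss(stroke_q, stroke_k, sem_q, sem_k, alpha=1.0):
    # One loss on low-level (stroke) features and one on high-level
    # (semantic) features; `alpha` is an assumed weighting, not a value
    # taken from the paper.
    return info_nce(stroke_q, stroke_k) + alpha * info_nce(sem_q, sem_k)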
+MODELS:
+  Architecture:
+    - CTC
+    - Attention
+    - Transformer
+  Learning Method:
+    - Self-Supervised
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'architecture.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Results:
+    Common Benchmarks:
+      IIIT: 88.1
+      SVT: 96.7
+      IC13: 94.2
+      IC15: 73.6
+      SVTP: 77.7
+      CUTE: 72.7
+      Avg.: 83.8
+Bibtex: '@inproceedings{liu2022perceiving,
+  title={Perceiving Stroke-Semantic Context: Hierarchical Contrastive Learning for Robust Scene Text Recognition},
+  author={Liu, Hao and Wang, Bin and Bao, Zhimin and Xue, Mobai and Kang, Sheng and Jiang, Deqiang and Liu, Yinsong and Ren, Bo},
+  year={2022},
+  organization={AAAI}}'

From 2d302472d8386d52b1f1cac586bbc74fe3da7178 Mon Sep 17 00:00:00 2001
From: Mountchciken
Date: Mon, 26 Dec 2022 09:33:09 +0800
Subject: [PATCH 2/7] add 5 paper

---
 ...e Learning for Scene Text Recognition.yaml | 88 ++++++++++++++++++
 ...th Masked Encoder-Decoder Pretraining.yaml | 84 +++++++++++++++++
 ...ervised Learning for Text Recognition.yaml | 91 +++++++++++++++++++
 ...ing CTC training as iterative fitting.yaml | 65 -------------
 ...rastive Learning for Text Recognition.yaml | 84 +++++++++++++++++
 ...xt via Similarity-Aware Normalization.yaml | 86 ++++++++++++++++++
 6 files changed, 433 insertions(+), 65 deletions(-)
 create mode 100644 paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml
 create mode 100644 paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml
 create mode 100644 paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml
 delete mode 100644 paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml
 create mode 100644 paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml
 create mode 100644 paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml

diff --git a/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml b/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml
new file mode 100644
index 000000000..322eca644
--- /dev/null
+++ b/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml
@@ -0,0 +1,88 @@
+Title: 'Context-based Contrastive Learning for Scene Text Recognition'
+Abbreviation: ConCLR
+Tasks:
+  - TextRecog
+Venue: AAAI
+Year: 2022
+Lab/Company:
+  - The Chinese University of Hong Kong
+  - SmartMore
+URL: 'https://www.aaai.org/AAAI22Papers/AAAI-10147.ZhangX.pdf'
+Paper Reading URL: 'https://mp.weixin.qq.com/s/7ayYKALDc3-nsBgEJG-D2A'
+Code: N/A
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'Pursuing accurate and robust recognizers has been a long-lasting
+goal for scene text recognition (STR) researchers. Recently, attention-based
+methods have demonstrated their effectiveness and achieved impressive results
+on public benchmarks. The attention mechanism enables models to recognize
+scene text with severe visual distortions by leveraging contextual
+information. However, recent studies revealed that the implicit over-reliance
+on context leads to catastrophic out-of-vocabulary performance. In contrast to
+the superior accuracy on seen text, models are prone to misrecognize unseen
+text even with good image
+quality.
We propose a novel framework, Context-based contrastive learning +(ConCLR), to alleviate this issue. Our proposed method first generates +characters with different contexts via simple image concatenation operations +and then optimizes contrastive loss on their embeddings. By pulling together +clusters of identical characters within various contexts and pushing apart +clusters of different characters in embedding space, ConCLR suppresses the +side-effect of overfitting to specific contexts and learns a more robust +representation. Experiments show that ConCLR significantly improves +out-of-vocabulary generalization and achieves state-of-the-art performance on +public benchmarks together with attention-based recognizers.' +MODELS: + Architecture: + - CTC + - Attention + - Transformer + Learning Method: + - Self-Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/209343799-96428e0e-9a93-4763-be47-a23f575dc2f3.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 92.4 + IIIT5K: + WAICS: 96.5 + WAIC: N/A + benchmark: True + SVT: + WAICS: 94.3 + WAIC: N/A + benchmark: True + IC13: + WAICS: 97.7 + WAIC: N/A + benchmark: True + IC15: + WAICS: 85.4 + WAIC: N/A + benchmark: True + SVTP: + WAICS: 89.3 + WAIC: N/A + benchmark: True + CUTE: + WAICS: 91.3 + WAIC: N/A + benchmark: True + Other Datasets: +Bibtex: '@inproceedings{zhang2022context, + title={Context-based Contrastive Learning for Scene Text Recognition}, + author={Zhang, Xinyun and Zhu, Binwu and Yao, Xufeng and Sun, Qi and Li, Ruiyu and Yu, Bei}, + year={2022}, + organization={AAAI} +}' diff --git a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml new file mode 100644 index 000000000..3af6b35eb --- /dev/null +++ b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml @@ -0,0 +1,84 @@ +Title: 'MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining' +Abbreviation: MaskOCR +Tasks: + - TextRecog +Venue: arXiv +Year: 2022 +Lab/Company: + - Department of Computer Vision Technology (VIS), Baidu Inc. +URL: 'https://openaccess.thecvf.com/content/CVPR2022/html/Luo_SimAN_Exploring_Self-Supervised_Representation_Learning_of_Scene_Text_via_Similarity-Aware_CVPR_2022_paper.html' +Paper Reading URL: 'https://mp.weixin.qq.com/s/UdEakobM85SAJ6OUU-Johg' +Code: 'https://github.com/Canjie-Luo/Real-300K' +Supported In MMOCR: N/S +PaperType: + - Algorithm + - Dataset +Abstract: 'In this paper, we present a model pretraining technique, named +MaskOCR, for text recognition. Our text recognition architecture is an +encoder-decoder transformer: the encoder extracts the patch-level +representations, and the decoder recognizes the text from the representations. +Our approach pretrains both the encoder and the decoder in a sequential manner. +(i) We pretrain the encoder in a self-supervised manner over a large set of +unlabeled real text images. We adopt the masked image modeling approach, which +shows the effectiveness for general images, expecting that the representations +take on semantics. 
(ii) We pretrain the decoder over a large set of synthesized
+text images in a supervised manner, and enhance the language modeling
+capability of the decoder by randomly masking some text image patches
+occupied by characters input to the encoder and, accordingly, the
+representations input to the decoder. Experiments show that the proposed
+MaskOCR approach achieves superior results on the benchmark datasets,
+including Chinese and English text images.'
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Self-Supervised
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/209343741-bd6ddbcb-6229-4f71-89ef-09ecc4bf7b65.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: 315M
+  Experiment:
+    Training DataSets:
+      - ST
+      - MJ
+      - Real
+    Test DataSets:
+      Avg.: 93.8
+      IIIT5K:
+        WAICS: 96.5
+        WAIC: N/A
+        benchmark: True
+      SVT:
+        WAICS: 94.1
+        WAIC: N/A
+        benchmark: True
+      IC13:
+        WAICS: 97.8
+        WAIC: N/A
+        benchmark: True
+      IC15:
+        WAICS: 88.7
+        WAIC: N/A
+        benchmark: True
+      SVTP:
+        WAICS: 90.2
+        WAIC: N/A
+        benchmark: True
+      CUTE:
+        WAICS: 92.7
+        WAIC: N/A
+        benchmark: True
+    Other Datasets:
+Bibtex: '@article{lyu2022maskocr,
+  title={MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining},
+  author={Lyu, Pengyuan and Zhang, Chengquan and Liu, Shanshan and Qiao, Meina and Xu, Yangliu and Wu, Liang and Yao, Kun and Han, Junyu and Ding, Errui and Wang, Jingdong},
+  journal={arXiv preprint arXiv:2206.00311},
+  year={2022}
+}'
diff --git a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml
new file mode 100644
index 000000000..10e23102f
--- /dev/null
+++ b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml
@@ -0,0 +1,91 @@
+Title: 'Multimodal Semi-Supervised Learning for Text Recognition'
+Abbreviation: SemiMTR
+Tasks:
+  - TextRecog
+Venue: arXiv
+Year: 2022
+Lab/Company:
+  - AWS AI Labs
+URL: 'https://arxiv.org/abs/2211.04785'
+Paper Reading URL: N/A
+Code: N/A
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'Until recently, the number of public real-world text images was
+insufficient for training scene text recognizers. Therefore, most modern
+training methods rely on synthetic data and operate in a fully supervised
+manner. Nevertheless, the amount of public real-world text images has
+increased significantly lately, including a great deal of unlabeled data.
+Leveraging these resources requires semi-supervised approaches; however, the
+few existing methods do not account for the vision-language multimodality
+structure and are therefore suboptimal for state-of-the-art multimodal
+architectures. To bridge this gap, we present semi-supervised learning for
+multimodal text recognizers (SemiMTR) that leverages unlabeled data at each
+modality training phase. Notably, our method refrains from extra training
+stages and maintains the current three-stage multimodal training procedure.
+Our algorithm starts by pretraining the vision model through a single-stage
+training that unifies self-supervised learning with supervised training. More
+specifically, we extend an existing visual representation learning algorithm
+and propose the first contrastive-based method for scene text recognition.
After pretraining the
+language model on a text corpus, we fine-tune the entire network via a
+sequential, character-level consistency regularization between weakly and
+strongly augmented views of text images. In a novel setup, consistency is
+enforced on each modality separately. Extensive experiments validate that
+our method outperforms the current training schemes and achieves
+state-of-the-art results on multiple scene text recognition benchmarks.
+Code will be published upon publication.'
+MODELS:
+  Architecture:
+    - Attention
+  Learning Method:
+    - Self-Supervised
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/209488117-5c6c6ee1-3419-4aec-97f5-1e1b28ae25ff.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - ST
+      - MJ
+      - Real
+    Test DataSets:
+      Avg.: 93.3
+      IIIT5K:
+        WAICS: 97.3
+        WAIC: N/A
+        benchmark: True
+      SVT:
+        WAICS: 96.6
+        WAIC: N/A
+        benchmark: True
+      IC13:
+        WAICS: 97.0
+        WAIC: N/A
+        benchmark: True
+      IC15:
+        WAICS: 84.7
+        WAIC: N/A
+        benchmark: True
+      SVTP:
+        WAICS: 93.0
+        WAIC: N/A
+        benchmark: True
+      CUTE:
+        WAICS: 93.8
+        WAIC: N/A
+        benchmark: True
+    Other Datasets:
+Bibtex: '@article{aberdam2022multimodal,
+  title={Multimodal Semi-Supervised Learning for Text Recognition},
+  author={Aberdam, Aviad and Ganz, Roy and Mazor, Shai and Litman, Ron},
+  journal={arXiv preprint arXiv:2205.03873},
+  year={2022}
+}'
diff --git a/paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml b/paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml
deleted file mode 100644
index 7b20a5b42..000000000
--- a/paper_zoo/textrecog/Reinterpreting CTC training as iterative fitting.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-Title: 'Reinterpreting CTC training as iterative fitting'
-Abbreviation: PerSec
-Tasks:
-  - TextRecog
-Venue: AAAI
-Year: 2022
-Lab/Company:
-  - Tencent YouTu Lab
-  - University of Science and Technology of China
-URL: 'https://www.sciencedirect.com/science/article/pii/S0031320320301953'
-Paper Reading URL: N/A
-Code: N/A
-Supported In MMOCR: N/S
-PaperType:
-  - Algorithm
-  - Dataset
-Abstract: 'We introduce Perceiving Stroke-Semantic Context (PerSec), a new
-approach to self-supervised representation learning tailored for Scene Text
-Recognition (STR) task. Considering scene text images carry both visual and
-semantic properties, we equip our PerSec with dual context perceivers which
-can contrast and learn latent representations from low-level stroke and
-high-level semantic contextual spaces simultaneously via hierarchical
-contrastive learning on unlabeled text image data. Experiments in un- and
-semi-supervised learning settings on STR benchmarks demonstrate our
-proposed framework can yield a more robust representation for both
-CTC-based and attention-based decoders than other contrastive learning
-methods. To fully investigate the potential of our method, we also
-collect a dataset of 100 million unlabeled text images, named UTI-100M,
-covering 5 scenes and 4 languages. By leveraging hundred-million-level
-unlabeled data, our PerSec shows significant performance improvement
-when fine-tuning the learned representation on the labeled data.
-Furthermore, we observe that the representation learned by PerSec
-presents great generalization, especially under few labeled data scenes.'
-MODELS:
-  Architecture:
-    - CTC
-    - Attention
-    - Transformer
-  Learning Method:
-    - Self-Supervised
-    - Supervised
-  Language Modality:
-    - Implicit Language Model
-  Network Structure: 'architecture.png'
-  FPS:
-    DEVICE: N/A
-    ITEM: N/A
-  FLOPS:
-    DEVICE: N/A
-    ITEM: N/A
-  PARAMS: N/A
-  Results:
-    Common Benchmarks:
-      IIIT: 88.1
-      SVT: 96.7
-      IC13: 94.2
-      IC15: 73.6
-      SVTP: 77.7
-      CUTE: 72.7
-      Avg.: 83.8
-Bibtex: '@inproceedings{liu2022perceiving,
-  title={Perceiving Stroke-Semantic Context: Hierarchical Contrastive Learning for Robust Scene Text Recognition},
-  author={Liu, Hao and Wang, Bin and Bao, Zhimin and Xue, Mobai and Kang, Sheng and Jiang, Deqiang and Liu, Yinsong and Ren, Bo},
-  year={2022},
-  organization={AAAI}}'
diff --git a/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml b/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml
new file mode 100644
index 000000000..36702b15b
--- /dev/null
+++ b/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml
@@ -0,0 +1,84 @@
+Title: 'Sequence-to-Sequence Contrastive Learning for Text Recognition'
+Abbreviation: SeqCLR
+Tasks:
+  - TextRecog
+Venue: CVPR
+Year: 2021
+Lab/Company:
+  - AWS
+URL: 'https://openaccess.thecvf.com/content/CVPR2021/html/Aberdam_Sequence-to-Sequence_Contrastive_Learning_for_Text_Recognition_CVPR_2021_paper.html'
+Paper Reading URL: 'https://mp.weixin.qq.com/s/YZiXhyhjj091b8fGC6Xduw'
+Code: N/A
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'We propose a framework for sequence-to-sequence contrastive
+learning (SeqCLR) of visual representations, which we apply to text
+recognition. To account for the sequence-to-sequence structure, each feature
+map is divided into different instances over which the contrastive loss is
+computed. This operation enables us to contrast at a sub-word level, where
+from each image we extract several positive pairs and multiple negative
+examples. To yield effective visual representations for text recognition, we
+further suggest novel augmentation heuristics, different encoder
+architectures and custom projection heads. Experiments on handwritten text
+and on scene text show that when a text decoder is trained on the learned
+representations, our method outperforms non-sequential contrastive methods.
+In addition, when the amount of supervision is reduced, SeqCLR significantly
+improves performance compared with supervised training, and when fine-tuned
+with 100% of the labels, our method achieves state-of-the-art results on
+standard handwritten text recognition benchmarks.'
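The "instances" in the SeqCLR abstract are sub-word slices of the frame-level feature map, so each slice, not the whole image, becomes a contrastive example. A minimal sketch of that instance-mapping step follows; frames_to_instances is a hypothetical name, and the paper's projection heads and augmentation heuristics are omitted.

import torch.nn.functional as F

def frames_to_instances(features, num_instances):
    # features: (B, T, D) frame sequence from the recognizer encoder.
    # Average-pool the T frames into `num_instances` sub-word instances,
    # flattening so the contrastive loss treats each instance separately.
    b, t, d = features.shape
    pooled = F.adaptive_avg_pool1d(features.transpose(1, 2), num_instances)
    return pooled.transpose(1, 2).reshape(b * num_instances, d)

Positive pairs would then be the matching instances from two augmented views of the same image, fed to a standard InfoNCE loss such as the one sketched earlier.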
+MODELS: + Architecture: + - CTC + - Attention + Learning Method: + - Self-Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/209266442-a8133465-1eb5-4097-8098-b2d4db6923ab.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - IAM + - RIMES + Test DataSets: + Avg.: N/A + IIIT5K: + WAICS: N/A + WAIC: N/A + benchmark: True + SVT: + WAICS: N/A + WAIC: N/A + benchmark: True + IC13: + WAICS: N/A + WAIC: N/A + benchmark: True + IC15: + WAICS: N/A + WAIC: N/A + benchmark: True + SVTP: + WAICS: N/A + WAIC: N/A + benchmark: True + CUTE: + WAICS: N/A + WAIC: N/A + benchmark: True + Other Datasets: +Bibtex: '@inproceedings{aberdam2021sequence, + title={Sequence-to-sequence contrastive learning for text recognition}, + author={Aberdam, Aviad and Litman, Ron and Tsiper, Shahar and Anschel, Oron and Slossberg, Ron and Mazor, Shai and Manmatha, R and Perona, Pietro}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={15302--15312}, + year={2021} +}' diff --git a/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml b/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml new file mode 100644 index 000000000..854415dc3 --- /dev/null +++ b/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml @@ -0,0 +1,86 @@ +Title: 'SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization' +Abbreviation: SimAN +Tasks: + - TextRecog +Venue: CVPR +Year: 2022 +Lab/Company: + - South China University of Technology +URL: 'https://openaccess.thecvf.com/content/CVPR2022/html/Luo_SimAN_Exploring_Self-Supervised_Representation_Learning_of_Scene_Text_via_Similarity-Aware_CVPR_2022_paper.html' +Paper Reading URL: 'https://mp.weixin.qq.com/s/UdEakobM85SAJ6OUU-Johg' +Code: 'https://github.com/Canjie-Luo/Real-300K' +Supported In MMOCR: N/S +PaperType: + - Algorithm + - Dataset +Abstract: 'Recently self-supervised representation learning has drawn +considerable attention from the scene text recognition community. Different +from previous studies using contrastive learning, we tackle the issue from an +alternative perspective, i.e., by formulating the representation learning +scheme in a generative manner. Typically, the neighboring image patches among +one text line tend to have similar styles, including the strokes, textures, +colors, etc. Motivated by this common sense, we augment one image patch and +use its neighboring patch as guidance to recover itself. Specifically, we +propose a Similarity-Aware Normalization (SimAN) module to identify the +different patterns and align the corresponding styles from the guiding patch. +In this way, the network gains representation capability for distinguishing +complex patterns such as messy strokes and cluttered backgrounds. Experiments +show that the proposed SimAN significantly improves the representation quality +and achieves promising performance. Moreover, we surprisingly find that our +self-supervised generative network has impressive potential for data synthesis, +text image editing, and font interpolation, which suggests that the proposed +SimAN has a wide range of practical applications.' 
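SimAN's similarity-aware module builds on the AdaIN operation: the augmented patch's features are re-normalized to the style statistics of the neighboring guiding patch. Only the base AdaIN operation is sketched below; the similarity-aware attention weighting that SimAN adds on top is not shown.

import torch

def adain(content, style, eps=1e-5):
    # content, style: (B, C, H, W) feature maps. Shift the content
    # features to the channel-wise mean/std of the style features,
    # i.e. the guiding neighboring patch.
    c_mean = content.mean(dim=(2, 3), keepdim=True)
    c_std = content.std(dim=(2, 3), keepdim=True) + eps
    s_mean = style.mean(dim=(2, 3), keepdim=True)
    s_std = style.std(dim=(2, 3), keepdim=True) + eps
    return (content - c_mean) / c_std * s_std + s_mean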
+MODELS: + Architecture: + - CTC + - Attention + Learning Method: + - Self-Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/209343741-bd6ddbcb-6229-4f71-89ef-09ecc4bf7b65.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - ST + Test DataSets: + Avg.: N/A + IIIT5K: + WAICS: N/A + WAIC: N/A + benchmark: True + SVT: + WAICS: N/A + WAIC: N/A + benchmark: True + IC13: + WAICS: N/A + WAIC: N/A + benchmark: True + IC15: + WAICS: N/A + WAIC: N/A + benchmark: True + SVTP: + WAICS: N/A + WAIC: N/A + benchmark: True + CUTE: + WAICS: N/A + WAIC: N/A + benchmark: True + Other Datasets: +Bibtex: '@inproceedings{luo2022siman, + title={SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization}, + author={Luo, Canjie and Jin, Lianwen and Chen, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={1039--1048}, + year={2022} +}' From f5d9be0a77b5b58a55176dcd285c7f30616dab09 Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Mon, 26 Dec 2022 09:44:12 +0800 Subject: [PATCH 3/7] add one more paper --- ...e Learning for Scene Text Recognition.yaml | 13 --- ...th Masked Encoder-Decoder Pretraining.yaml | 13 --- ...ervised Learning for Text Recognition.yaml | 13 --- ... for Self-Supervised Text Recognition.yaml | 79 +++++++++++++++++++ ...rastive Learning for Text Recognition.yaml | 13 --- ...xt via Similarity-Aware Normalization.yaml | 13 --- 6 files changed, 79 insertions(+), 65 deletions(-) create mode 100644 paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml diff --git a/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml b/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml index 322eca644..d374e76ba 100644 --- a/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml +++ b/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml @@ -57,29 +57,16 @@ MODELS: Avg.: 92.4 IIIT5K: WAICS: 96.5 - WAIC: N/A - benchmark: True SVT: WAICS: 94.3 - WAIC: N/A - benchmark: True IC13: WAICS: 97.7 - WAIC: N/A - benchmark: True IC15: WAICS: 85.4 - WAIC: N/A - benchmark: True SVTP: WAICS: 89.3 - WAIC: N/A - benchmark: True CUTE: WAICS: 91.3 - WAIC: N/A - benchmark: True - Other Datasets: Bibtex: '@inproceedings{zhang2022context, title={Context-based Contrastive Learning for Scene Text Recognition}, author={Zhang, Xinyun and Zhu, Binwu and Yao, Xufeng and Sun, Qi and Li, Ruiyu and Yu, Bei}, diff --git a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml index 3af6b35eb..0d47e1feb 100644 --- a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml +++ b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml @@ -53,29 +53,16 @@ MODELS: Avg.: 93.8 IIIT5K: WAICS: 96.5 - WAIC: N/A - benchmark: True SVT: WAICS: 94.1 - WAIC: N/A - benchmark: True IC13: WAICS: 97.8 - WAIC: N/A - benchmark: True IC15: WAICS: 88.7 - WAIC: N/A - benchmark: True SVTP: WAICS: 90.2 - WAIC: N/A - benchmark: True CUTE: WAICS: 92.7 - WAIC: N/A - benchmark: True - Other Datasets: 
Bibtex: '@article{lyu2022maskocr, title={MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining}, author={Lyu, Pengyuan and Zhang, Chengquan and Liu, Shanshan and Qiao, Meina and Xu, Yangliu and Wu, Liang and Yao, Kun and Han, Junyu and Ding, Errui and Wang, Jingdong}, diff --git a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml index 10e23102f..3b0cb9d17 100644 --- a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml +++ b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml @@ -60,29 +60,16 @@ MODELS: Avg.: 93.3 IIIT5K: WAICS: 97.3 - WAIC: N/A - benchmark: True SVT: WAICS: 96.6 - WAIC: N/A - benchmark: True IC13: WAICS: 97.0 - WAIC: N/A - benchmark: True IC15: WAICS: 84.7 - WAIC: N/A - benchmark: True SVTP: WAICS: 93.0 - WAIC: N/A - benchmark: True CUTE: WAICS: 93.8 - WAIC: N/A - benchmark: True - Other Datasets: Bibtex: '@article{aberdam2022multimodal, title={Multimodal Semi-Supervised Learning for Text Recognition}, author={Aberdam, Aviad and Ganz, Roy and Mazor, Shai and Litman, Ron}, diff --git a/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml b/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml new file mode 100644 index 000000000..98dcc1ce8 --- /dev/null +++ b/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml @@ -0,0 +1,79 @@ +Title: 'Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition' +Abbreviation: DiG +Tasks: + - TextRecog +Venue: ACMMM +Year: 2022 +Lab/Company: + - Huazhong University of Science and Technology + - Huawei Inc. +URL: 'https://dl.acm.org/doi/abs/10.1145/3503161.3547784' +Paper Reading URL: 'https://mp.weixin.qq.com/s/BS66ezCvMrbHTAFL3sO7EQ' +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Existing text recognition methods usually need large-scale training +data. Most of them rely on synthetic training data due to the lack of annotated +real images. However, there is a domain gap between the synthetic data and real +data, which limits the performance of the text recognition models. Recent +self-supervised text recognition methods attempted to utilize unlabeled real +images by introducing contrastive learning, which mainly learns the +discrimination of the text images. Inspired by the observation that humans +learn to recognize the texts through both reading and writing, we propose to +learn discrimination and generation by integrating contrastive learning and +masked image modeling in our self-supervised method. The contrastive learning +branch is adopted to learn the discrimination of text images, which imitates +the reading behavior of humans. Meanwhile, masked image modeling is firstly +introduced for text recognition to learn the context generation of the text +images, which is similar to the writing behavior. The experimental results +show that our method outperforms previous self-supervised text recognition +methods by 10.2%-20.2% on irregular scene text recognition datasets. Moreover, +our proposed text recognizer exceeds previous state-of-the-art text recognition +methods by averagely 5.3% on 11 benchmarks, with similar model size. 
We also +demonstrate that our pre-trained model can be easily applied to other +text-related tasks with obvious performance gain.' +MODELS: + Architecture: + - CTC + - Attention + - Transformer + Learning Method: + - Self-Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/209343741-bd6ddbcb-6229-4f71-89ef-09ecc4bf7b65.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: 52M + Experiment: + Training DataSets: + - ST + - MJ + - Real + Test DataSets: + Avg.: 95.0 + IIIT5K: + WAICS: 97.6 + SVT: + WAICS: 96.5 + IC13: + WAICS: 97.6 + IC15: + WAICS: 88.9 + SVTP: + WAICS: 92.9 + CUTE: + WAICS: 96.5 +Bibtex: '@inproceedings{yang2022reading, + title={Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition}, + author={Yang, Mingkun and Liao, Minghui and Lu, Pu and Wang, Jing and Zhu, Shenggao and Luo, Hualin and Tian, Qi and Bai, Xiang}, + booktitle={Proceedings of the 30th ACM International Conference on Multimedia}, + pages={4214--4223}, + year={2022} +}' diff --git a/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml b/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml index 36702b15b..44fa9a7d2 100644 --- a/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml +++ b/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml @@ -52,29 +52,16 @@ MODELS: Avg.: N/A IIIT5K: WAICS: N/A - WAIC: N/A - benchmark: True SVT: WAICS: N/A - WAIC: N/A - benchmark: True IC13: WAICS: N/A - WAIC: N/A - benchmark: True IC15: WAICS: N/A - WAIC: N/A - benchmark: True SVTP: WAICS: N/A - WAIC: N/A - benchmark: True CUTE: WAICS: N/A - WAIC: N/A - benchmark: True - Other Datasets: Bibtex: '@inproceedings{aberdam2021sequence, title={Sequence-to-sequence contrastive learning for text recognition}, author={Aberdam, Aviad and Litman, Ron and Tsiper, Shahar and Anschel, Oron and Slossberg, Ron and Mazor, Shai and Manmatha, R and Perona, Pietro}, diff --git a/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml b/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml index 854415dc3..9e33e12a3 100644 --- a/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml +++ b/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml @@ -54,29 +54,16 @@ MODELS: Avg.: N/A IIIT5K: WAICS: N/A - WAIC: N/A - benchmark: True SVT: WAICS: N/A - WAIC: N/A - benchmark: True IC13: WAICS: N/A - WAIC: N/A - benchmark: True IC15: WAICS: N/A - WAIC: N/A - benchmark: True SVTP: WAICS: N/A - WAIC: N/A - benchmark: True CUTE: WAICS: N/A - WAIC: N/A - benchmark: True - Other Datasets: Bibtex: '@inproceedings{luo2022siman, title={SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization}, author={Luo, Canjie and Jin, Lianwen and Chen, Jingdong}, From af5f15027fd3e1fff9a83ba3079a4e1310a41f27 Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Mon, 26 Dec 2022 10:10:08 +0800 Subject: [PATCH 4/7] add five paper --- ...ition with Text Semantic Segmentation.yaml | 74 ++++++++++++++++ ...guage Model in Scene Text 
Recognition.yaml | 85 +++++++++++++++++++ ...ed Experts for Scene Text Recognition.yaml | 79 +++++++++++++++++ ... for Self-Supervised Text Recognition.yaml | 2 +- ...Shortcut Learning in Text Recognizers.yaml | 75 ++++++++++++++++ 5 files changed, 314 insertions(+), 1 deletion(-) create mode 100644 paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml create mode 100644 paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml diff --git a/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml b/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml new file mode 100644 index 000000000..0c96515cd --- /dev/null +++ b/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml @@ -0,0 +1,74 @@ +Title: 'Background-Insensitive Scene Text Recognition with Text Semantic Segmentation' +Abbreviation: BINet +Tasks: + - TextRecog +Venue: ECCV +Year: 2022 +Lab/Company: + - University of South Carolina, Columbia, SC 29201, USA +URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19806-9_10' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Scene Text Recognition (STR) has many important applications in +computer vision. Complex backgrounds continue to be a big challenge for STR +because they interfere with text feature extraction. Many existing methods +use attentional regions, bounding boxes or polygons to reduce such +interference. However, the text regions located by these methods still contain +much undesirable background interference. In this paper, we propose a +Background-Insensitive approach BINet by explicitly leveraging the text +Semantic Segmentation (SSN) to extract texts more accurately. SSN is trained +on a set of existing segmentation data, whose volume is only 0.03% of STR +training data. This prevents the large-scale pixel-level annotations of the + STR training data. To effectively utilize the segmentation cues, we design new + segmentation refinement and embedding blocks for refining text-masks and + reinforcing visual features. Additionally, we propose an efficient pipeline + that utilizes Synthetic Initialization (SI) for STR models trained only on + real data (1.7% of STR training data), instead of on both synthetic and real + data from scratch. Experiments show that the proposed method can recognize + text from complex backgrounds more effectively, achieving state-of-the-art + performance on several public datasets.' 
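How the segmentation cue reinforces the visual features is the crux of BINet. The paper's segmentation refinement and embedding blocks are more involved, but the core idea of re-weighting backbone features with a soft text mask can be sketched as follows; mask_gate and the residual formulation are assumptions for illustration, not the authors' exact design.

import torch.nn.functional as F

def mask_gate(visual_feats, text_mask):
    # visual_feats: (B, C, H, W) backbone features.
    # text_mask: (B, 1, h, w) soft text/background probabilities from the
    # segmentation network, resized to the feature resolution.
    mask = F.interpolate(text_mask, size=visual_feats.shape[2:],
                         mode='bilinear', align_corners=False)
    # Emphasize text regions while keeping a residual path, so background
    # context is suppressed rather than discarded outright.
    return visual_feats * (1 + mask)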
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/209490055-2aca52d4-9072-4ce4-a256-7fc8b953f59e.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - ST
+      - MJ
+      - Real
+    Test DataSets:
+      Avg.: 94.4
+      IIIT5K:
+        WAICS: 97.3
+      SVT:
+        WAICS: 96.4
+      IC13:
+        WAICS: 96.8
+      IC15:
+        WAICS: 89.2
+      SVTP:
+        WAICS: 89.9
+      CUTE:
+        WAICS: 95.8
+Bibtex: '@inproceedings{zhao2022background,
+  title={Background-Insensitive Scene Text Recognition with Text Semantic Segmentation},
+  author={Zhao, Liang and Wu, Zhenyao and Wu, Xinyi and Wilsbacher, Greg and Wang, Song},
+  booktitle={European Conference on Computer Vision},
+  pages={163--182},
+  year={2022},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml b/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml
new file mode 100644
index 000000000..751bf9a8a
--- /dev/null
+++ b/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml
@@ -0,0 +1,85 @@
+Title: 'PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition'
+Abbreviation: PETR
+Tasks:
+  - TextRecog
+Venue: TIP
+Year: 2022
+Lab/Company:
+  - University of Science and Technology of China
+URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_28'
+Paper Reading URL: N/A
+Code: N/A
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'The exploration of linguistic information promotes the development
+of the scene text recognition task. Benefiting from its strength in parallel
+reasoning and global relationship capture, the transformer-based language
+model (TLM) has achieved dominant performance recently. As a structure
+decoupled from the recognition process, we argue that TLM’s capability is
+limited by the low-quality visual predictions it receives as input. To be
+specific: 1) A visual prediction with low character-wise accuracy increases
+the correction burden of TLM. 2) The inconsistent word length between the
+visual prediction and the original image provides wrong language modeling
+guidance to TLM. In this paper, we propose a Progressive scEne Text
+Recognizer (PETR) to improve the capability of the transformer-based language
+model by handling the above two problems. Firstly, a Destruction Learning
+Module (DLM) is proposed to consider the linguistic information in the visual
+context. DLM introduces the recognition of destructed images with disordered
+patches in the training stage. Through guiding the vision model to restore
+patch orders and make word-level predictions on the destructed images, visual
+prediction with high character-wise accuracy is obtained by exploring the
+inner relationship between the local visual patches. Secondly, a new Language
+Rectification Module (LRM) is proposed to optimize the word length for
+language guidance rectification. Through progressively implementing LRM in
+different language modeling steps, a novel progressive rectification network
+is constructed to
+handle some extremely challenging cases (e.g. distortion, occlusion, etc.).
By utilizing DLM and LRM, PETR +enhances the capability of transformer-based language model from a more +general aspect, that is, focusing on the reduction of correction burden and +rectification of language modeling guidance. Compared with parallel +transformer-based methods, PETR obtains 1.0% and 0.8% improvement on regular +and irregular datasets respectively while introducing only 1.7M additional +parameters. The extensive experiments on both English and Chinese benchmarks +demonstrate that PETR achieves the state-of-the-art results.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Explicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/209489701-073cdf37-5990-4bcf-8aa8-434255fd568e.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - ST + - MJ + Test DataSets: + Avg.: 90.8 + IIIT5K: + WAICS: 95.8 + SVT: + WAICS: 92.4 + IC13: + WAICS: 97.0 + IC15: + WAICS: 83.3 + SVTP: + WAICS: 86.2 + CUTE: + WAICS: 89.9 +Bibtex: '@article{wang2022petr, + title={PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition}, + author={Wang, Yuxin and Xie, Hongtao and Fang, Shancheng and Xing, Mengting and Wang, Jing and Zhu, Shenggao and Zhang, Yongdong}, + journal={IEEE Transactions on Image Processing}, + volume={31}, + pages={5585--5598}, + year={2022}, + publisher={IEEE} +}' diff --git a/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml b/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml new file mode 100644 index 000000000..04e996341 --- /dev/null +++ b/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml @@ -0,0 +1,79 @@ +Title: 'Pure Transformer with Integrated Experts for Scene Text Recognition' +Abbreviation: PTIE +Tasks: + - TextRecog +Venue: ECCV +Year: 2022 +Lab/Company: + - Nanyang Technological University, Singapore + - Institute for Infocomm Research, A*STAR, Singapore +URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_28' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Scene text recognition (STR) involves the task of reading text in +cropped images of natural scenes. Conventional models in STR employ +convolutional neural network (CNN) followed by recurrent neural network in an +encoder-decoder framework. In recent times, the transformer architecture is +being widely adopted in STR as it shows strong capability in capturing +long-term dependency which appears to be prominent in scene text images. Many +researchers utilized transformer as part of a hybrid CNN-transformer encoder, +often followed by a transformer decoder. However, such methods only make use +of the long-term dependency mid-way through the encoding process. Although the +vision transformer (ViT) is able to capture such dependency at an early stage, +its utilization remains largely unexploited in STR. This work proposes the use +of a transformer-only model as a simple baseline which outperforms hybrid +CNN-transformer models. Furthermore, two key areas for improvement were +identified. Firstly, the first decoded character has the lowest prediction +accuracy. Secondly, images of different original aspect ratios react +differently to the patch resolutions while ViT only employ one fixed patch +resolution. 
To explore these areas, Pure Transformer with Integrated Experts
+(PTIE) is proposed. PTIE is a transformer model that can process multiple
+patch resolutions and decode in both the original and reverse character
+orders. It is examined on 7 commonly used benchmarks and compared with over
+20 state-of-the-art methods. The experimental results show that the proposed
+method outperforms them and obtains state-of-the-art results on most
+benchmarks.'
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/209489370-a70ecae3-2397-44e6-94fa-c3f25b32754b.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - ST
+      - MJ
+    Test DataSets:
+      Avg.: 93.0
+      IIIT5K:
+        WAICS: 96.3
+      SVT:
+        WAICS: 94.9
+      IC13:
+        WAICS: 97.2
+      IC15:
+        WAICS: 87.8
+      SVTP:
+        WAICS: 90.1
+      CUTE:
+        WAICS: 91.7
+Bibtex: '@inproceedings{tan2022pure,
+  title={Pure Transformer with Integrated Experts for Scene Text Recognition},
+  author={Tan, Yew Lee and Kong, Adams Wai-Kin and Kim, Jung-Jae},
+  booktitle={European Conference on Computer Vision},
+  pages={481--497},
+  year={2022},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml b/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml
index 98dcc1ce8..e3120e87d 100644
--- a/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml
+++ b/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml
@@ -43,7 +43,7 @@ MODELS:
     - Supervised
   Language Modality:
     - Implicit Language Model
-  Network Structure: 'https://user-images.githubusercontent.com/65173622/209343741-bd6ddbcb-6229-4f71-89ef-09ecc4bf7b65.png'
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/209489340-906c7cc5-3412-4fa4-99e7-48f39d66b91c.png'
   FPS:
     DEVICE: N/A
     ITEM: N/A
diff --git a/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml b/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml
new file mode 100644
index 000000000..07818f08f
--- /dev/null
+++ b/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml
@@ -0,0 +1,75 @@
+Title: 'TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers'
+Abbreviation: TextAdaIN
+Tasks:
+  - TextRecog
+Venue: ECCV
+Year: 2022
+Lab/Company:
+  - AWS AI Labs
+URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_25'
+Paper Reading URL: N/A
+Code: 'https://github.com/amazon-research/textadain-robust-recognition'
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'Leveraging the characteristics of convolutional layers, neural
+networks are extremely effective for pattern recognition tasks. However, in
+some cases their decisions are based on unintended information, leading to
+high performance on standard benchmarks but also to a lack of generalization
+to challenging testing conditions and unintuitive failures. Recent work has
+termed this “shortcut learning” and addressed its presence in multiple
+domains.
In +text recognition, we reveal another such shortcut, whereby recognizers overly +depend on local image statistics. Motivated by this, we suggest an approach to +regulate the reliance on local statistics that improves text recognition +performance. Our method, termed TextAdaIN, creates local distortions in the +feature map which prevent the network from overfitting to local statistics. +It does so by viewing each feature map as a sequence of elements and +deliberately mismatching fine-grained feature statistics between elements in a +mini-batch. Despite TextAdaIN’s simplicity, extensive experiments show its +effectiveness compared to other, more complicated methods. TextAdaIN achieves +state-of-the-art results on standard handwritten text recognition benchmarks. +It generalizes to multiple architectures and to the domain of scene text +recognition. Furthermore, we demonstrate that integrating TextAdaIN improves +robustness towards more challenging testing conditions.' +MODELS: + Architecture: + - CTC + - Attention + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/209490313-d4548816-434f-4df5-8f70-616fa129322f.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - ST + - MJ + Test DataSets: + Avg.: N/A + IIIT5K: + WAICS: N/A + SVT: + WAICS: N/A + IC13: + WAICS: N/A + IC15: + WAICS: N/A + SVTP: + WAICS: N/A + CUTE: + WAICS: N/A +Bibtex: '@inproceedings{nuriel2022textadain, + title={TextAdaIN: Paying attention to shortcut learning in text recognizers}, + author={Nuriel, Oren and Fogel, Sharon and Litman, Ron}, + booktitle={European Conference on Computer Vision}, + pages={427--445}, + year={2022}, + organization={Springer} +}' From e2eba51c1c03fceb58f4e0e5731facbeb8df37b5 Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Mon, 26 Dec 2022 11:12:13 +0800 Subject: [PATCH 5/7] update paper url --- ...Text Recognition with Text Semantic Segmentation.yaml | 4 +++- ... 
Contrastive Learning for Scene Text Recognition.yaml | 4 +++- ...ognition with Masked Encoder-Decoder Pretraining.yaml | 9 +++++---- ...al Semi-Supervised Learning for Text Recognition.yaml | 3 ++- ...r-Based Language Model in Scene Text Recognition.yaml | 4 +++- ...th Integrated Experts for Scene Text Recognition.yaml | 4 +++- ...ve Modeling for Self-Supervised Text Recognition.yaml | 4 +++- ...quence Contrastive Learning for Text Recognition.yaml | 4 +++- ...of Scene Text via Similarity-Aware Normalization.yaml | 4 +++- ...tention to Shortcut Learning in Text Recognizers.yaml | 4 +++- 10 files changed, 31 insertions(+), 13 deletions(-) diff --git a/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml b/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml index 0c96515cd..903c63071 100644 --- a/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml +++ b/paper_zoo/textrecog/Background-Insensitive Scene Text Recognition with Text Semantic Segmentation.yaml @@ -6,7 +6,9 @@ Venue: ECCV Year: 2022 Lab/Company: - University of South Carolina, Columbia, SC 29201, USA -URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19806-9_10' +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19806-9_10' + Arxiv: 'https://www.cse.sc.edu/~songwang/document/eccv22c.pdf' Paper Reading URL: N/A Code: N/A Supported In MMOCR: N/S diff --git a/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml b/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml index d374e76ba..d1dced305 100644 --- a/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml +++ b/paper_zoo/textrecog/Context-based Contrastive Learning for Scene Text Recognition.yaml @@ -7,7 +7,9 @@ Year: 2022 Lab/Company: - The Chinese University of Hong Kong - SmartMore -URL: 'https://www.aaai.org/AAAI22Papers/AAAI-10147.ZhangX.pdf' +URL: + Venue: 'https://www.aaai.org/AAAI22Papers/AAAI-10147.ZhangX.pdf' + Arxiv: 'http://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf' Paper Reading URL: 'https://mp.weixin.qq.com/s/7ayYKALDc3-nsBgEJG-D2A' Code: N/A Supported In MMOCR: N/S diff --git a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml index 0d47e1feb..512496f88 100644 --- a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml +++ b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml @@ -6,9 +6,10 @@ Venue: arXiv Year: 2022 Lab/Company: - Department of Computer Vision Technology (VIS), Baidu Inc. 
-URL: 'https://openaccess.thecvf.com/content/CVPR2022/html/Luo_SimAN_Exploring_Self-Supervised_Representation_Learning_of_Scene_Text_via_Similarity-Aware_CVPR_2022_paper.html' -Paper Reading URL: 'https://mp.weixin.qq.com/s/UdEakobM85SAJ6OUU-Johg' -Code: 'https://github.com/Canjie-Luo/Real-300K' +URL: + Arxiv: 'https://arxiv.org/abs/2206.00311' +Paper Reading URL: N/A +Code: N/A Supported In MMOCR: N/S PaperType: - Algorithm @@ -36,7 +37,7 @@ MODELS: - Supervised Language Modality: - Implicit Language Model - Network Structure: 'https://user-images.githubusercontent.com/65173622/209343741-bd6ddbcb-6229-4f71-89ef-09ecc4bf7b65.png' + Network Structure: 'https://user-images.githubusercontent.com/65173622/209494710-489fe94b-d550-4c5e-bdab-24590a3c3fe2.png' FPS: DEVICE: N/A ITEM: N/A diff --git a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml index 3b0cb9d17..2569358e6 100644 --- a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml +++ b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml @@ -6,7 +6,8 @@ Venue: arXiv Year: 2022 Lab/Company: - AWS AI Labs -URL: 'https://arxiv.org/abs/2211.04785' +URL: + Arxiv: 'https://arxiv.org/abs/2211.04785' Paper Reading URL: N/A Code: N/A Supported In MMOCR: N/S diff --git a/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml b/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml index 751bf9a8a..c7252df91 100644 --- a/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml +++ b/paper_zoo/textrecog/PETR: Rethinking the Capability of Transformer-Based Language Model in Scene Text Recognition.yaml @@ -6,7 +6,9 @@ Venue: TIP Year: 2022 Lab/Company: - University of Science and Technology of China -URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_28' +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_28' + Arxiv: 'https://ui.adsabs.harvard.edu/abs/2022arXiv221104963T/abstract' Paper Reading URL: N/A Code: N/A Supported In MMOCR: N/S diff --git a/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml b/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml index 04e996341..13ea624b5 100644 --- a/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml +++ b/paper_zoo/textrecog/Pure Transformer with Integrated Experts for Scene Text Recognition.yaml @@ -7,7 +7,9 @@ Year: 2022 Lab/Company: - Nanyang Technological University, Singapore - Institute for Infocomm Research, A*STAR, Singapore -URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_28' +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_28' + Arxiv: 'https://ui.adsabs.harvard.edu/abs/2022arXiv221104963T/abstract' Paper Reading URL: N/A Code: N/A Supported In MMOCR: N/S diff --git a/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml b/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml index e3120e87d..e10a37027 100644 --- a/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for 
Self-Supervised Text Recognition.yaml
+++ b/paper_zoo/textrecog/Reading and Writing: Discriminative and Generative Modeling for Self-Supervised Text Recognition.yaml
@@ -7,7 +7,9 @@ Year: 2022
 Lab/Company:
   - Huazhong University of Science and Technology
   - Huawei Inc.
-URL: 'https://dl.acm.org/doi/abs/10.1145/3503161.3547784'
+URL:
+  Venue: 'https://dl.acm.org/doi/abs/10.1145/3503161.3547784'
+  Arxiv: 'https://arxiv.org/abs/2207.00193'
 Paper Reading URL: 'https://mp.weixin.qq.com/s/BS66ezCvMrbHTAFL3sO7EQ'
 Code: N/A
 Supported In MMOCR: N/S
diff --git a/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml b/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml
index 44fa9a7d2..23399db90 100644
--- a/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml
+++ b/paper_zoo/textrecog/Sequence-to-Sequence Contrastive Learning for Text Recognition.yaml
@@ -6,7 +6,9 @@ Venue: CVPR
 Year: 2021
 Lab/Company:
   - AWS
-URL: 'https://openaccess.thecvf.com/content/CVPR2021/html/Aberdam_Sequence-to-Sequence_Contrastive_Learning_for_Text_Recognition_CVPR_2021_paper.html'
+URL:
+  Venue: 'https://openaccess.thecvf.com/content/CVPR2021/html/Aberdam_Sequence-to-Sequence_Contrastive_Learning_for_Text_Recognition_CVPR_2021_paper.html'
+  Arxiv: 'https://arxiv.org/abs/2012.10873'
 Paper Reading URL: 'https://mp.weixin.qq.com/s/YZiXhyhjj091b8fGC6Xduw'
 Code: N/A
 Supported In MMOCR: N/S
diff --git a/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml b/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml
index 9e33e12a3..0f51f600c 100644
--- a/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml
+++ b/paper_zoo/textrecog/SimAN: Exploring Self-Supervised Representation Learning of Scene Text via Similarity-Aware Normalization.yaml
@@ -6,7 +6,9 @@ Venue: CVPR
 Year: 2022
 Lab/Company:
   - South China University of Technology
-URL: 'https://openaccess.thecvf.com/content/CVPR2022/html/Luo_SimAN_Exploring_Self-Supervised_Representation_Learning_of_Scene_Text_via_Similarity-Aware_CVPR_2022_paper.html'
+URL:
+  Venue: 'https://openaccess.thecvf.com/content/CVPR2022/html/Luo_SimAN_Exploring_Self-Supervised_Representation_Learning_of_Scene_Text_via_Similarity-Aware_CVPR_2022_paper.html'
+  Arxiv: 'https://arxiv.org/abs/2203.10492'
 Paper Reading URL: 'https://mp.weixin.qq.com/s/UdEakobM85SAJ6OUU-Johg'
 Code: 'https://github.com/Canjie-Luo/Real-300K'
 Supported In MMOCR: N/S
diff --git a/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml b/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml
index 07818f08f..04833b40f 100644
--- a/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml
+++ b/paper_zoo/textrecog/TextAdaIN: Paying Attention to Shortcut Learning in Text Recognizers.yaml
@@ -6,7 +6,9 @@ Venue: ECCV
 Year: 2022
 Lab/Company:
   - AWS AI Labs
-URL: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_25'
+URL:
+  Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_25'
+  Arxiv: 'https://arxiv.org/abs/2105.03906'
 Paper Reading URL: N/A
 Code: 'https://github.com/amazon-research/textadain-robust-recognition'
 Supported In MMOCR: N/S
From
35c06f1f23aa2f75b4068cd5881f7aa7f5a46c56 Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Fri, 6 Jan 2023 13:06:54 +0800 Subject: [PATCH 6/7] fix one paper --- ...Text Recognition with Masked Encoder-Decoder Pretraining.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml index 512496f88..1d6b34bd4 100644 --- a/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml +++ b/paper_zoo/textrecog/MaskOCR: Text Recognition with Masked Encoder-Decoder Pretraining.yaml @@ -7,6 +7,7 @@ Year: 2022 Lab/Company: - Department of Computer Vision Technology (VIS), Baidu Inc. URL: + Venue: N/A Arxiv: 'https://arxiv.org/abs/2206.00311' Paper Reading URL: N/A Code: N/A From 5cddf5a552e59680e2f838082613558306e413ce Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Wed, 18 Jan 2023 19:50:32 +0800 Subject: [PATCH 7/7] update --- ...Multimodal Semi-Supervised Learning for Text Recognition.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml index 2569358e6..4422292be 100644 --- a/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml +++ b/paper_zoo/textrecog/Multimodal Semi-Supervised Learning for Text Recognition.yaml @@ -41,6 +41,7 @@ MODELS: - Attention Learning Method: - Self-Supervised + - Semi-Supervised - Supervised Language Modality: - Implicit Language Model