diff --git a/models/text_recognition_crnn/README.md b/models/text_recognition_crnn/README.md index 4b6cea7c..f6046e96 100644 --- a/models/text_recognition_crnn/README.md +++ b/models/text_recognition_crnn/README.md @@ -2,11 +2,24 @@ An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition +Results of accuracy evaluation with [tools/eval](../../tools/eval) at different text recognition datasets. + +| Model name | ICDAR03(%) | IIIT5k(%) | CUTE80(%) | +|--------------|------------|-----------|-----------| +| CRNN_EN | 81.66 | 74.33 | 52.78 | +| CRNN_EN_FP16 | 82.01 | 74.93 | 52.34 | +| CRNN_CH | 71.28 | 80.90 | 67.36 | +| CRNN_CH_FP16 | 78.63 | 80.93 | 67.01 | + +\*: 'FP16' stands for 'model quantized into FP16'. + Note: - Model source: - `text_recognition_CRNN_EN_2021sep.onnx`: https://docs.opencv.org/4.5.2/d9/d1e/tutorial_dnn_OCR.html (CRNN_VGG_BiLSTM_CTC.onnx) + - `text_recognition_CRNN_CH_2021sep.onnx`: https://docs.opencv.org/4.x/d4/d43/tutorial_dnn_text_spotting.html (crnn_cs.onnx) - `text_recognition_CRNN_CN_2021nov.onnx`: https://docs.opencv.org/4.5.2/d4/d43/tutorial_dnn_text_spotting.html (crnn_cs_CN.onnx) - `text_recognition_CRNN_EN_2021sep.onnx` can detect digits (0\~9) and letters (return lowercase letters a\~z) (view `charset_36_EN.txt` for details). +- `text_recognition_CRNN_CH_2021sep.onnx` can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), and some special characters (view `charset_94_CH.txt` for details). - `text_recognition_CRNN_CN_2021nov.onnx` can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), some Chinese characters and some special characters (view `charset_3944_CN.txt` for details). - For details on training this model series, please visit https://github.com/zihaomu/deep-text-recognition-benchmark. @@ -16,6 +29,7 @@ Note: - This demo uses [text_detection_db](../text_detection_db) as text detector. 
- Selected model must match with the charset: - Try `text_recognition_CRNN_EN_2021sep.onnx` with `charset_36_EN.txt`. + - Try `text_recognition_CRNN_CH_2021sep.onnx` with `charset_94_CH.txt`. - Try `text_recognition_CRNN_CN_2021sep.onnx` with `charset_3944_CN.txt`. Run the demo detecting English: diff --git a/models/text_recognition_crnn/charset_94_CH.txt b/models/text_recognition_crnn/charset_94_CH.txt new file mode 100644 index 00000000..87c6d678 --- /dev/null +++ b/models/text_recognition_crnn/charset_94_CH.txt @@ -0,0 +1,94 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +{ +| +} +~ diff --git a/models/text_recognition_crnn/crnn.py b/models/text_recognition_crnn/crnn.py index a35097b1..c136fe4e 100644 --- a/models/text_recognition_crnn/crnn.py +++ b/models/text_recognition_crnn/crnn.py @@ -54,7 +54,9 @@ def _preprocess(self, image, rbbox): rotationMatrix = cv.getPerspectiveTransform(vertices, self._targetVertices) cropped = cv.warpPerspective(image, rotationMatrix, self._inputSize) - if 'CN' in self._model_path: + # 'CN' can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), some Chinese characters and some special characters + # 'CH' can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), and some special characters + if 'CN' in self._model_path or 'CH' in self._model_path: pass else: cropped = cv.cvtColor(cropped, cv.COLOR_BGR2GRAY) diff --git a/models/text_recognition_crnn/text_recognition_CRNN_CH_2021sep.onnx b/models/text_recognition_crnn/text_recognition_CRNN_CH_2021sep.onnx new file mode 100644 index 00000000..e69de29b diff --git a/tools/eval/README.md b/tools/eval/README.md index fc8cfd44..b754c470 100644 --- a/tools/eval/README.md +++ b/tools/eval/README.md @@ -19,6 +19,8 @@ Supported datasets: -
[ImageNet](#imagenet) - [WIDERFace](#widerface) - [LFW](#lfw) +- [ICDAR2003](#icdar2003) +- [IIIT5K](#iiit5k) ## ImageNet @@ -137,4 +139,55 @@ Run evaluation with the following command: ```shell python eval.py -m sface -d lfw -dr /path/to/lfw +``` + +## ICDAR2003 + +### Prepare data + +Please visit http://iapr-tc11.org/mediawiki/index.php/ICDAR_2003_Robust_Reading_Competitions to download the ICDAR2003 dataset and the labels. + +```shell +$ tree -L 2 /path/to/icdar +. +├── word +│   ├── 1 +│ │ ├── self +│ │ ├── ... +│ │ └── willcooks +│   ├── ... +│   └── 12 +└── word.xml +    +``` + +### Evaluation + +Run evaluation with the following command: + +```shell +python eval.py -m crnn -d icdar -dr /path/to/icdar +``` + +### Example + +```shell +download zip file from http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/word.zip +unzip file to /path/to/icdar +python eval.py -m crnn -d icdar -dr /path/to/icdar +``` + +## IIIT5K + +### Prepare data + +Please visit https://github.com/cv-small-snails/Text-Recognition-Material to download the IIIT5K dataset and the labels. + +### Evaluation + +Any dataset in LMDB format can be evaluated with this script.
+Run evaluation with the following command: + +```shell +python eval.py -m crnn -d iiit5k -dr /path/to/iiit5k ``` \ No newline at end of file diff --git a/tools/eval/datasets/__init__.py b/tools/eval/datasets/__init__.py index d650cd28..84388375 100644 --- a/tools/eval/datasets/__init__.py +++ b/tools/eval/datasets/__init__.py @@ -1,6 +1,8 @@ from .imagenet import ImageNet from .widerface import WIDERFace from .lfw import LFW +from .icdar import ICDAR +from .iiit5k import IIIT5K class Registery: def __init__(self, name): @@ -16,4 +18,6 @@ def register(self, item): DATASETS = Registery("Datasets") DATASETS.register(ImageNet) DATASETS.register(WIDERFace) -DATASETS.register(LFW) \ No newline at end of file +DATASETS.register(LFW) +DATASETS.register(ICDAR) +DATASETS.register(IIIT5K) \ No newline at end of file diff --git a/tools/eval/datasets/icdar.py b/tools/eval/datasets/icdar.py new file mode 100644 index 00000000..bc2c61e5 --- /dev/null +++ b/tools/eval/datasets/icdar.py @@ -0,0 +1,53 @@ +import os +import numpy as np +import cv2 as cv +import xml.dom.minidom as minidom +from tqdm import tqdm + +class ICDAR: + def __init__(self, root): + self.root = root + self.acc = -1 + self.inputSize = [100, 32] + self.val_label_file = os.path.join(root, "word.xml") + self.val_label = self.load_label(self.val_label_file) + + @property + def name(self): + return self.__class__.__name__ + + def load_label(self, label_file): + label = list() + dom = minidom.getDOMImplementation().createDocument(None, 'Root', None) + root = dom.documentElement + dom = minidom.parse(self.val_label_file) + root = dom.documentElement + names = root.getElementsByTagName('image') + for name in names: + key = os.path.join(self.root, name.getAttribute('file')) + value = name.getAttribute('tag').lower() + label.append([key, value]) + + return label + + def eval(self, model): + right_num = 0 + pbar = tqdm(self.val_label) + for fn, label in pbar: + pbar.set_description("Evaluating {} with {} val 
set".format(model.name, self.name)) + + img = cv.imread(fn) + + rbbox = np.array([0, img.shape[0], 0, 0, img.shape[1], 0, img.shape[1], img.shape[0]]) + pred = model.infer(img, rbbox) + if label == pred: + right_num += 1 + + self.acc = right_num/(len(self.val_label) * 1.0) + + + def get_result(self): + return self.acc + + def print_result(self): + print("Accuracy: {:.2f}%".format(self.acc*100)) \ No newline at end of file diff --git a/tools/eval/datasets/iiit5k.py b/tools/eval/datasets/iiit5k.py new file mode 100644 index 00000000..07621376 --- /dev/null +++ b/tools/eval/datasets/iiit5k.py @@ -0,0 +1,55 @@ +import lmdb +import os +import numpy as np +import cv2 as cv +from tqdm import tqdm + +class IIIT5K: + def __init__(self, root): + self.root = root + self.acc = -1 + self.inputSize = [100, 32] + + self.val_label = self.load_label(self.root) + + @property + def name(self): + return self.__class__.__name__ + + def load_label(self, root): + lmdb_file = root + lmdb_env = lmdb.open(lmdb_file) + lmdb_txn = lmdb_env.begin() + lmdb_cursor = lmdb_txn.cursor() + label = list() + for key, value in lmdb_cursor: + image_index = key.decode() + if image_index.split('-')[0] == 'image': + img = cv.imdecode(np.fromstring(value, np.uint8), 3) + label_index = 'label-' + image_index.split('-')[1] + value = lmdb_txn.get(label_index.encode()).decode().lower() + label.append([img, value]) + else: + break + return label + + def eval(self, model): + right_num = 0 + pbar = tqdm(self.val_label) + for img, value in pbar: + pbar.set_description("Evaluating {} with {} val set".format(model.name, self.name)) + + + rbbox = np.array([0, img.shape[0], 0, 0, img.shape[1], 0, img.shape[1], img.shape[0]]) + pred = model.infer(img, rbbox).lower() + if value == pred: + right_num += 1 + + self.acc = right_num/(len(self.val_label) * 1.0) + + + def get_result(self): + return self.acc + + def print_result(self): + print("Accuracy: {:.2f}%".format(self.acc*100)) \ No newline at end of file diff --git 
a/tools/eval/eval.py b/tools/eval/eval.py index 16800925..9378cc38 100644 --- a/tools/eval/eval.py +++ b/tools/eval/eval.py @@ -73,6 +73,11 @@ name="SFace", topic="face_recognition", modelPath=os.path.join(root_dir, "models/face_recognition_sface/face_recognition_sface_2021dec-act_int8-wt_int8-quantized.onnx")), + crnn=dict( + name="CRNN", + topic="text_recognition", + modelPath=os.path.join(root_dir, "models/text_recognition_crnn/text_recognition_CRNN_EN_2021sep.onnx"), + charsetPath=os.path.join(root_dir, "models/text_recognition_crnn/charset_36_EN.txt")), ) datasets = dict( @@ -87,6 +92,12 @@ name="LFW", topic="face_recognition", target_size=112), + icdar=dict( + name="ICDAR", + topic="text_recognition"), + iiit5k=dict( + name="IIIT5K", + topic="text_recognition"), ) def main(args):