# yolox/evaluators/voc_evaluator.py

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.

import sys
import tempfile
import time
from loguru import logger
from tqdm import tqdm

import numpy as np

import megengine as mge
import megengine.distributed as dist
import megengine.functional as F

from yolox.utils import gather_pyobj, postprocess, time_synchronized


class VOCEvaluator:
    """
    VOC AP Evaluation class.

    Runs a model over an evaluation dataloader, converts the post-processed
    detections into per-class / per-image box arrays and hands them to the
    dataset's ``evaluate_detections`` method.
    """

    def __init__(
        self, dataloader, img_size, confthre, nmsthre, num_classes,
    ):
        """
        Args:
            dataloader (Dataloader): evaluate dataloader.
            img_size (sequence): image size after preprocess. It is indexed as
                ``img_size[0]`` / ``img_size[1]`` against the original image
                height / width when rescaling boxes.
            confthre (float): confidence threshold ranging from 0 to 1, which
                is defined in the config file.
            nmsthre (float): IoU threshold of non-max suppression ranging from 0 to 1.
            num_classes (int): number of classes; forwarded to ``postprocess``
                and used to size the per-class result lists.
        """
        self.dataloader = dataloader
        self.img_size = img_size
        self.confthre = confthre
        self.nmsthre = nmsthre
        self.num_classes = num_classes
        self.num_images = len(dataloader.dataset)
        self.is_main_process = dist.get_rank() == 0

    def evaluate(
        self, model, distributed=False, decoder=None, test_size=None
    ):
        """
        VOC average precision (AP) Evaluation. Iterate inference on the test dataset
        and evaluate the results with the dataset's ``evaluate_detections``.

        NOTE: This function will change training mode to False, please save states if needed.

        Args:
            model : model to evaluate.
            distributed (bool): when True, timing statistics are all-reduced and
                averaged over the world size, and per-rank predictions are
                gathered to rank 0.
            decoder : optional callable applied to the raw model outputs
                before ``postprocess``.
            test_size : unused; kept for interface compatibility.

        Returns:
            tuple: ``(mAP50, mAP70, info)`` as produced by
            :meth:`evaluate_prediction`; ``(0, 0, None)`` on non-main processes.
        """
        # TODO half to amp_test
        model.eval()
        data_list = {}
        progress_bar = tqdm if self.is_main_process else iter

        inference_time = 0
        nms_time = 0
        # The last batch is excluded from timing (see below), hence the "- 1".
        n_samples = len(self.dataloader) - 1

        for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)):
            # skip the last iter since the batch size might not be enough for batch inference
            is_time_record = cur_iter < n_samples
            if is_time_record:
                start = time.time()

            outputs = model(imgs)
            if decoder is not None:
                # NOTE(review): `outputs.type()` is a PyTorch-style call; confirm the
                # tensor type returned by `model` supports it.
                outputs = decoder(outputs, dtype=outputs.type())

            if is_time_record:
                infer_end = time_synchronized()
                inference_time += infer_end - start

            outputs = postprocess(
                outputs, self.num_classes, self.confthre, self.nmsthre
            )
            if is_time_record:
                nms_end = time_synchronized()
                nms_time += nms_end - infer_end

            data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids))

        statistics = mge.tensor([inference_time, nms_time, n_samples]).astype("float32")
        if distributed:
            statistics = F.distributed.all_reduce_sum(statistics)
            statistics /= dist.get_world_size()
            results = gather_pyobj(data_list, obj_name="data_list", target_rank_id=0)
            # `data_list` is a dict keyed by image id, so the other ranks'
            # predictions are merged with `update` (dicts have no `extend`).
            # NOTE(review): assumes results[0] is this rank's own dict — confirm
            # against gather_pyobj.
            for x in results[1:]:
                data_list.update(x)

        eval_results = self.evaluate_prediction(data_list, statistics)
        dist.group_barrier()
        return eval_results

    def convert_to_voc_format(self, outputs, info_imgs, ids):
        """
        Map each image id to its rescaled detections.

        Args:
            outputs : iterable of per-image detection tensors (or None when an
                image has no detection). Columns 0-3 are taken as the box,
                columns 4 and 5 are multiplied to form the score and column 6
                is taken as the class.
            info_imgs : pair whose first / second items are iterated as the
                original image heights / widths.
            ids : iterable of image ids, converted with ``int``.

        Returns:
            dict: ``{int(img_id): (bboxes, cls, scores)}``; the tuple is
            ``(None, None, None)`` for images without detections.
        """
        predictions = {}
        for (output, img_h, img_w, img_id) in zip(outputs, info_imgs[0], info_imgs[1], ids):
            if output is None:
                predictions[int(img_id)] = (None, None, None)
                continue
            # NOTE(review): `.cpu()` is a PyTorch-style call; confirm the tensor
            # type returned by `postprocess` supports it.
            output = output.cpu()

            bboxes = output[:, 0:4]

            # preprocessing: resize
            # Undo the resize ratio so boxes are expressed in original image pixels.
            scale = min(self.img_size[0] / float(img_h), self.img_size[1] / float(img_w))
            bboxes /= scale

            cls = output[:, 6]
            scores = output[:, 4] * output[:, 5]

            predictions[int(img_id)] = (bboxes, cls, scores)
        return predictions

    def evaluate_prediction(self, data_dict, statistics):
        """
        Build per-class detections and run the dataset's VOC evaluation.

        Args:
            data_dict (dict): ``{image index: (bboxes, cls, scores)}`` as built
                by :meth:`convert_to_voc_format`; it is looked up for every
                index in ``range(self.num_images)``.
            statistics : tensor holding ``[inference_time, nms_time, n_samples]``.

        Returns:
            tuple: ``(mAP50, mAP70, info)`` where the two metrics come from
            ``dataset.evaluate_detections`` and ``info`` is the timing summary
            string. Non-main processes return ``(0, 0, None)``.
        """
        if not self.is_main_process:
            return 0, 0, None

        logger.info("Evaluate in main process...")

        inference_time = statistics[0].item()
        nms_time = statistics[1].item()
        n_samples = statistics[2].item()

        # With a single-batch dataloader no batch is timed (n_samples == 0);
        # report 0 ms instead of raising ZeroDivisionError.
        timed_images = n_samples * self.dataloader.batch_size
        if timed_images > 0:
            a_infer_time = 1000 * inference_time / timed_images
            a_nms_time = 1000 * nms_time / timed_images
        else:
            a_infer_time = 0.0
            a_nms_time = 0.0

        time_info = ", ".join(
            ["Average {} time: {:.2f} ms".format(k, v) for k, v in zip(
                ["forward", "NMS", "inference"],
                [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)]
            )]
        )

        info = time_info + "\n"

        # all_boxes[class][image] -> array with 5 columns (4 box values + score).
        all_boxes = [[[] for _ in range(self.num_images)] for _ in range(self.num_classes)]
        for img_num in range(self.num_images):
            bboxes, cls, scores = data_dict[img_num]
            if bboxes is None:
                for j in range(self.num_classes):
                    all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
                continue

            # Convert once per image: the detections do not depend on the class
            # being selected, so only the boolean mask changes inside the loop.
            cls_np = cls.numpy()
            dets_np = np.concatenate(
                (bboxes.numpy(), scores.numpy()[:, None]), axis=1
            )
            for j in range(self.num_classes):
                mask_c = cls_np == j
                if not mask_c.any():
                    all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
                    continue
                all_boxes[j][img_num] = dets_np[mask_c]

            sys.stdout.write(
                "im_eval: {:d}/{:d} \r".format(img_num + 1, self.num_images)
            )
            sys.stdout.flush()

        with tempfile.TemporaryDirectory() as tempdir:
            mAP50, mAP70 = self.dataloader.dataset.evaluate_detections(all_boxes, tempdir)
            return mAP50, mAP70, info