diff --git a/fast_rcnn/README.md b/fast_rcnn/README.md
new file mode 100644
index 0000000000..fb1aab88ba
--- /dev/null
+++ b/fast_rcnn/README.md
@@ -0,0 +1,14 @@
+# Faster R-CNN code example
+
+```bash
+python main.py PATH_TO_DATASET
+```
+
+## Things to add/change/consider
+* where to handle image scaling: the annotations need to be scaled as well, and the RPN currently filters by minimum size with respect to the original image, not the scaled one
+* should image scaling be handled in the FasterRCNN class?
+* properly support flipping
+* decide how to handle different RPN/FRCNN parameters for train/eval modes
+* make Variable handling uniform: Variables should be provided by the user, not created inside the FasterRCNN/RPN classes
+* general code cleanup; there is a lot of torch/numpy mixing
+* should I use a general config file?
diff --git a/fast_rcnn/faster_rcnn.py b/fast_rcnn/faster_rcnn.py
new file mode 100644
index 0000000000..fbd9434e7d
--- /dev/null
+++ b/fast_rcnn/faster_rcnn.py
@@ -0,0 +1,199 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+import numpy as np
+import numpy.random as npr
+
+from utils import \
+    bbox_transform, bbox_transform_inv, clip_boxes, bbox_overlaps
+
+from utils import to_var as _tovar
+
+# should handle multiple scales, how?
+class FasterRCNN(nn.Container):
+
+    def __init__(self,
+                 features, pooler,
+                 classifier, rpn,
+                 batch_size=128, fg_fraction=0.25,
+                 fg_threshold=0.5, bg_threshold=None,
+                 num_classes=21):
+        super(FasterRCNN, self).__init__()
+        self.features = features
+        self.roi_pooling = pooler
+        self.rpn = rpn
+        self.classifier = classifier
+
+        self.batch_size = batch_size
+        self.fg_fraction = fg_fraction
+        self.fg_threshold = fg_threshold
+        if bg_threshold is None:
+            bg_threshold = (0, 0.5)
+        self.bg_threshold = bg_threshold
+        self._num_classes = num_classes
+
+    # should it support batched images?
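+    # Forward pass: backbone features -> RPN proposals (plus RPN loss when
+    # ground truth is given) -> sample proposals against the ground truth
+    # (training only) -> RoI pooling -> classifier head -> class scores and
+    # decoded boxes. With ground truth the summed RPN + Fast R-CNN loss is
+    # returned as well.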
+ def forward(self, x): + #if self.training is True: + if isinstance(x, tuple): + im, gt = x + else: + im = x + gt = None + + assert im.size(0) == 1, 'only single element batches supported' + + feats = self.features(_tovar(im)) + + roi_boxes, rpn_prob, rpn_loss = self.rpn(im, feats, gt) + + #if self.training is True: + if gt is not None: + # append gt boxes and sample fg / bg boxes + # proposal_target-layer.py + all_rois, frcnn_labels, roi_boxes, frcnn_bbox_targets = self.frcnn_targets(roi_boxes, im, gt) + + # r-cnn + regions = self.roi_pooling(feats, roi_boxes) + scores, bbox_pred = self.classifier(regions) + + boxes = self.bbox_reg(roi_boxes, bbox_pred, im) + + # apply cls + bbox reg loss here + #if self.training is True: + if gt is not None: + frcnn_loss = self.frcnn_loss(scores, bbox_pred, frcnn_labels, frcnn_bbox_targets) + loss = frcnn_loss + rpn_loss + return loss, scores, boxes + + return scores, boxes + + def frcnn_loss(self, scores, bbox_pred, labels, bbox_targets): + cls_crit = nn.CrossEntropyLoss() + cls_loss = cls_crit(scores, labels) + + reg_crit = nn.SmoothL1Loss() + reg_loss = reg_crit(bbox_pred, bbox_targets) + + loss = cls_loss + reg_loss + return loss + + def frcnn_targets(self, all_rois, im, gt): + all_rois = all_rois.data.numpy() + gt_boxes = gt['boxes'].numpy() + gt_labels = np.array(gt['gt_classes']) + #zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) + #all_rois = np.vstack( + # (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) + #) + all_rois = np.vstack((all_rois, gt_boxes)) + zeros = np.zeros((all_rois.shape[0], 1), dtype=all_rois.dtype) + all_rois = np.hstack((zeros, all_rois)) + + num_images = 1 + rois_per_image = self.batch_size / num_images + fg_rois_per_image = np.round(self.fg_fraction * rois_per_image) + + # Sample rois with classification labels and bounding box regression + # targets + labels, rois, bbox_targets = _sample_rois(self, + all_rois, gt_boxes, gt_labels, fg_rois_per_image, + rois_per_image, self._num_classes) + + return _tovar((all_rois, labels, rois, bbox_targets)) + + def bbox_reg(self, boxes, box_deltas, im): + boxes = boxes.data[:,1:].numpy() + box_deltas = box_deltas.data.numpy() + pred_boxes = bbox_transform_inv(boxes, box_deltas) + pred_boxes = clip_boxes(pred_boxes, im.size()[-2:]) + return _tovar(pred_boxes) + +def _get_bbox_regression_labels(bbox_target_data, num_classes): + """Bounding-box regression targets (bbox_target_data) are stored in a + compact form N x (class, tx, ty, tw, th) + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). 
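+    For example, with num_classes = 21 a RoI labelled class 3 has non-zero
+    targets only in columns 12:16 of its 4 * 21 = 84 element row.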
+ Returns: + bbox_target (ndarray): N x 4K blob of regression targets + bbox_inside_weights (ndarray): N x 4K blob of loss weights + """ + + clss = bbox_target_data[:, 0] + bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) + bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) + inds = np.where(clss > 0)[0] + for ind in inds: + cls = clss[ind] + start = 4 * cls + end = start + 4 + bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] + return bbox_targets + + +def _compute_targets(ex_rois, gt_rois, labels): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 4 + + targets = bbox_transform(ex_rois, gt_rois) + if False: #cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) + / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) + return np.hstack( + (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) + +def _sample_rois(self, all_rois, gt_boxes, gt_labels, fg_rois_per_image, rois_per_image, num_classes): + """Generate a random sample of RoIs comprising foreground and background + examples. + """ + # overlaps: (rois x gt_boxes) + overlaps = bbox_overlaps( + np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), + np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + overlaps = overlaps.numpy() + gt_assignment = overlaps.argmax(axis=1) + max_overlaps = overlaps.max(axis=1) + #labels = gt_boxes[gt_assignment, 4] + labels = gt_labels[gt_assignment] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(max_overlaps >= self.fg_threshold)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where((max_overlaps < self.bg_threshold[1]) & + (max_overlaps >= self.bg_threshold[0]))[0] + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) + # Sample background regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) + + # The indices that we're selecting (both fg and bg) + keep_inds = np.append(fg_inds, bg_inds) + # Select sampled values from various arrays: + labels = labels[keep_inds] + # Clamp labels for the background RoIs to 0 + labels[fg_rois_per_this_image:] = 0 + rois = all_rois[keep_inds] + + bbox_target_data = _compute_targets( + rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) + + bbox_targets = \ + _get_bbox_regression_labels(bbox_target_data, num_classes) + + return labels, rois, bbox_targets + + diff --git a/fast_rcnn/generate_anchors.py b/fast_rcnn/generate_anchors.py new file mode 100644 index 0000000000..1125a801fe --- /dev/null +++ b/fast_rcnn/generate_anchors.py @@ -0,0 +1,105 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written 
by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import numpy as np + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +#array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + +def generate_anchors(base_size=16, ratios=[0.5, 1, 2], + scales=2**np.arange(3, 6)): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. + """ + + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = _ratio_enum(base_anchor, ratios) + anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) + for i in xrange(ratio_anchors.shape[0])]) + return anchors + +def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). + """ + + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1))) + return anchors + +def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. 
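+    E.g. for the square base anchor (w = h = 16), scales (8, 16, 32) give
+    128-, 256- and 512-pixel anchors, matching the reference table above.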
+ """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +if __name__ == '__main__': + import time + t = time.time() + a = generate_anchors() + print time.time() - t + print a + from IPython import embed; embed() diff --git a/fast_rcnn/main.py b/fast_rcnn/main.py new file mode 100644 index 0000000000..c1301e74aa --- /dev/null +++ b/fast_rcnn/main.py @@ -0,0 +1,172 @@ +import argparse +import time +#from copy import deepcopy + +import torch +import torch.nn as nn +import torch.utils.data +import torchvision.transforms as transforms + +import torch.optim as optim + +from voc import VOCDetection, TransformVOCDetectionAnnotation + +import importlib + +#from model import model + +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='PyTorch Faster R-CNN Training') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--model', '-m', metavar='MODEL', default='model', + help='file containing model definition ' + '(default: model)') +parser.add_argument('--lr', '--learning-rate', default=0.01, type=float, + metavar='LR', help='initial learning rate') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)') +parser.add_argument('--print-freq', '-p', default=10, type=int, + metavar='N', help='print frequency (default: 10)') + +cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') +class_to_ind = dict(zip(cls, range(len(cls)))) + +args = parser.parse_args() +model = importlib.import_module(args.model).model() +model_test = importlib.import_module(args.model).model() +model_test.load_state_dict(model.state_dict()) + +train_data = VOCDetection(args.data, 'train', + transform=transforms.ToTensor(), + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +val_data = VOCDetection(args.data, 'val', + transform=transforms.ToTensor(), + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + +def collate_fn(batch): + imgs, gt = zip(*batch) + return imgs[0].unsqueeze(0), gt[0] + +train_loader = torch.utils.data.DataLoader( + train_data, batch_size=1, shuffle=True, + num_workers=0, collate_fn=collate_fn) + + +val_loader = torch.utils.data.DataLoader( + val_data, batch_size=1, shuffle=False, + num_workers=0, collate_fn=collate_fn) + +optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + +def train(train_loader, model, optimizer, epoch): + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + model.train() + end = time.time() + for i, (im, gt) in (enumerate(train_loader)): + adjust_learning_rate(optimizer, epoch) + + # measure data loading time + data_time.update(time.time() - end) + + optimizer.zero_grad() + loss, scores, boxes = model((im, gt)) + loss.backward() + optimizer.step() + + losses.update(loss.data[0], im.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + if i % args.print_freq == 0: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss 
{loss.val:.4f} ({loss.avg:.4f})\t' + #'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + #'Prec@5 {top5.val:.3f} ({top5.avg:.3f})' + .format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, + #top1=top1, top5=top5 + )) + #global model_test + #assert model.state_dict() == model_test.state_dict() + +def validate(val_loader, model): + batch_time = AverageMeter() + losses = AverageMeter() + + # switch to evaluate mode + model.eval() + + end = time.time() + + for i, (im, gt) in enumerate(val_loader): + loss, scores, boxes = model((im, gt)) + losses.update(loss.data[0], im.size(0)) + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + #'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + #'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' + #'Prec@5 {top5.val:.3f} ({top5.avg:.3f})' + .format( + i, len(val_loader), batch_time=batch_time, + #data_time=data_time, + loss=losses, + #top1=top1, top5=top5 + )) + + +def adjust_learning_rate(optimizer, epoch): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + lr = args.lr * (0.1 ** (epoch // 30)) + for param_group in optimizer.state_dict()['param_groups']: + param_group['lr'] = lr + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + +for epoch in range(0, 10): + train(train_loader, model, optimizer, epoch) + #validate(val_loader, model) + +#from IPython import embed; embed() + +#if __name__ == '__main__': +# main() diff --git a/fast_rcnn/model.py b/fast_rcnn/model.py new file mode 100644 index 0000000000..88cd1f95e0 --- /dev/null +++ b/fast_rcnn/model.py @@ -0,0 +1,51 @@ +import torch.nn as nn +from roi_pooling import roi_pooling as _roi_pooling + +from rpn import RPN as _RPN +from faster_rcnn import FasterRCNN as _FasterRCNN + +class _Features(nn.Container): + def __init__(self): + super(_Features, self).__init__() + self.m = nn.Conv2d(3, 3, 3, 16, 1) + + def forward(self, x): + return self.m(x) + +class _Classifier(nn.Container): + def __init__(self): + super(_Classifier, self).__init__() + self.m1 = nn.Linear(3*7*7, 21) + self.m2 = nn.Linear(3*7*7, 21*4) + + def forward(self, x): + return self.m1(x), self.m2(x) + +def _pooler(x, rois): + x = _roi_pooling(x, rois, size=(7,7), spatial_scale=1.0/16.0) + return x.view(x.size(0), -1) + +class _RPNClassifier(nn.Container): + def __init__(self, n): + super(_RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + +def model(): + _features = _Features() + _classifier = _Classifier() + _rpn_classifier = _RPNClassifier(3) + + _rpn = _RPN( + classifier=_rpn_classifier + ) + _model = _FasterRCNN( + features=_features, + pooler=_pooler, + classifier=_classifier, + rpn=_rpn + ) + return _model diff --git a/fast_rcnn/py_cpu_nms.py b/fast_rcnn/py_cpu_nms.py new file mode 100644 index 0000000000..54e7b25fef --- /dev/null +++ b/fast_rcnn/py_cpu_nms.py @@ -0,0 +1,38 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed 
under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep diff --git a/fast_rcnn/roi_pooling.py b/fast_rcnn/roi_pooling.py new file mode 100644 index 0000000000..885e3a734a --- /dev/null +++ b/fast_rcnn/roi_pooling.py @@ -0,0 +1,65 @@ +import torch +import torch.nn as nn +import torch.autograd as ag +import math + +from torch.autograd.function import Function +from torch._thnn import type2backend + +class AdaptiveMaxPool2d(Function): + def __init__(self, out_w, out_h): + super(AdaptiveMaxPool2d, self).__init__() + self.out_w = out_w + self.out_h = out_h + + def forward(self, input): + output = input.new() + indices = input.new().long() + self.save_for_backward(input) + self.indices = indices + self._backend = type2backend[type(input)] + self._backend.SpatialAdaptiveMaxPooling_updateOutput( + self._backend.library_state, input, output, indices, + self.out_w, self.out_h) + return output + + def backward(self, grad_output): + input, = self.saved_tensors + indices = self.indices + grad_input = grad_output.new() + self._backend.SpatialAdaptiveMaxPooling_updateGradInput( + self._backend.library_state, input, grad_output, grad_input, + indices) + return grad_input, None + +def adaptive_max_pool(input, size): + return AdaptiveMaxPool2d(size[0],size[1])(input) + +def roi_pooling(input, rois, size=(7,7), spatial_scale=1.0): + assert(rois.dim() == 2) + assert(rois.size(1) == 5) + output = [] + rois = rois.data.float() + num_rois = rois.size(0) + + rois[:,1:].mul_(spatial_scale) + rois = rois.long() + for i in range(num_rois): + roi = rois[i] + im_idx = roi[0] + im = input.narrow(0, im_idx, 1)[..., roi[2]:(roi[4]+1), roi[1]:(roi[3]+1)] + output.append(adaptive_max_pool(im, size)) + + return torch.cat(output, 0) + +if __name__ == '__main__': + input = ag.Variable(torch.rand(1,1,10,10), requires_grad=True) + rois = ag.Variable(torch.LongTensor([[0,1,2,7,8],[0,3,3,8,8]]),requires_grad=False) + #rois = ag.Variable(torch.LongTensor([[0,3,3,8,8]]),requires_grad=False) + + out = adaptive_max_pool(input,(3,3)) + out.backward(out.data.clone().uniform_()) + + out = roi_pooling(input, rois, size=(3,3)) + out.backward(out.data.clone().uniform_()) + diff --git a/fast_rcnn/rpn.py b/fast_rcnn/rpn.py new file mode 100644 index 0000000000..b7421a2fc1 --- /dev/null +++ b/fast_rcnn/rpn.py @@ -0,0 +1,322 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable +import numpy as np +import numpy.random as npr + +# clean up environment +from utils import bbox_transform, bbox_transform_inv, clip_boxes, filter_boxes, bbox_overlaps +from generate_anchors import generate_anchors + +from utils import to_var as _tovar + +from py_cpu_nms import py_cpu_nms as nms + +class RPN(nn.Container): + + def __init__(self, + classifier, 
anchor_scales=None, + negative_overlap=0.3, positive_overlap=0.7, + fg_fraction=0.5, batch_size=256, + nms_thresh=0.7, min_size=16, + pre_nms_topN=12000, post_nms_topN=2000 + ): + super(RPN, self).__init__() + + self.rpn_classifier = classifier + + if anchor_scales is None: + anchor_scales = (8, 16, 32) + self._anchors = generate_anchors(scales=np.array(anchor_scales)) + self._num_anchors = self._anchors.shape[0] + + self.negative_overlap = negative_overlap + self.positive_overlap = positive_overlap + self.fg_fraction = fg_fraction + self.batch_size = batch_size + + # used for both train and test + self.nms_thresh = nms_thresh + self.pre_nms_topN = pre_nms_topN + self.post_nms_topN = post_nms_topN + self.min_size = min_size + + + # output rpn probs as well + def forward(self, im, feats, gt=None): + assert im.size(0) == 1, 'only single element batches supported' + # improve + # it is used in get_anchors and also present in roi_pooling + self._feat_stride = round(im.size(3)/feats.size(3)) + # rpn + # put in a separate function + rpn_map, rpn_bbox_pred = self.rpn_classifier(feats) + all_anchors = self.rpn_get_anchors(feats) + rpn_loss = None + #if self.training is True: + if gt is not None: + assert gt is not None + rpn_labels, rpn_bbox_targets = self.rpn_targets(all_anchors, im, gt) + # need to subsample boxes here + rpn_loss = self.rpn_loss(rpn_map, rpn_bbox_pred, rpn_labels, rpn_bbox_targets) + + # roi proposal + # clip, sort, pre nms topk, nms, after nms topk + # params are different for train and test + # proposal_layer.py + roi_boxes, scores = self.get_roi_boxes(all_anchors, rpn_map, rpn_bbox_pred, im) + # only for visualization + if False: + roi_boxes = all_anchors + return _tovar((roi_boxes, scores, rpn_loss, rpn_labels)) + + return _tovar((roi_boxes, scores, rpn_loss)) + + + # from faster rcnn py + def rpn_get_anchors(self, im): + height, width = im.size()[-2:] + # 1. Generate proposals from bbox deltas and shifted anchors + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = (self._anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + return all_anchors + + # restructure because we don't want -1 in labels + # shouldn't we instead keep only the bboxes for which labels >= 0? 
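+    # Labels each anchor as foreground (1), background (0) or ignored (-1)
+    # based on its overlap with the ground-truth boxes, subsamples at most
+    # batch_size anchors with a fg_fraction share of positives, and computes
+    # bbox regression targets towards each anchor's best-matching gt box.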
+ def rpn_targets(self, all_anchors, im, gt): + total_anchors = all_anchors.shape[0] + gt_boxes = gt['boxes'] + + height, width = im.size()[-2:] + # only keep anchors inside the image + _allowed_border = 0 + inds_inside = np.where( + (all_anchors[:, 0] >= -_allowed_border) & + (all_anchors[:, 1] >= -_allowed_border) & + (all_anchors[:, 2] < width + _allowed_border) & # width + (all_anchors[:, 3] < height + _allowed_border) # height + )[0] + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + assert anchors.shape[0] > 0, '{0}x{1} -> {2}'.format(height,width,total_anchors) + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + + # overlaps between the anchors and the gt boxes + # overlaps (ex, gt) + #overlaps = bbox_overlaps(anchors, gt_boxes)#.numpy() + overlaps = bbox_overlaps(torch.from_numpy(anchors), gt_boxes).numpy() + gt_boxes = gt_boxes.numpy() + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < self.negative_overlap] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IOU + labels[max_overlaps >= self.positive_overlap] = 1 + + # subsample positive labels if we have too many + num_fg = int(self.fg_fraction * self.batch_size) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = self.batch_size - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice( + bg_inds, size=(len(bg_inds) - num_bg), replace=False) + labels[disable_inds] = -1 + + #bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + #bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + + return labels, bbox_targets + + # I need to know the original image size (or have the scaling factor) + def get_roi_boxes(self, anchors, rpn_map, rpn_bbox_deltas, im): + # TODO fix this!!! + im_info = (100, 100, 1) + + bbox_deltas = rpn_bbox_deltas.data.numpy() + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # the first set of _num_anchors channels are bg probs + # the second set are the fg probs, which we want + #scores = bottom[0].data[:, self._num_anchors:, :, :] + scores = rpn_map.data[:, self._num_anchors:, :, :].numpy() + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_transform_inv(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im.size()[-2:]) + + # 3. 
remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = filter_boxes(proposals, self.min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 6000) + order = scores.ravel().argsort()[::-1] + if self.pre_nms_topN > 0: + order = order[:self.pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), self.nms_thresh) + if self.post_nms_topN > 0: + keep = keep[:self.post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + + return proposals, scores + + def rpn_loss(self, rpn_map, rpn_bbox_transform, rpn_labels, rpn_bbox_targets): + height, width = rpn_map.size()[-2:] + + rpn_map = rpn_map.view(-1, 2, height, width).permute(0,2,3,1).contiguous().view(-1, 2) + labels = torch.from_numpy(rpn_labels).long() # convert properly + labels = labels.view(1, height, width, -1).permute(0, 3, 1, 2).contiguous() + labels = labels.view(-1) + + idx = labels.ge(0).nonzero()[:,0] + rpn_map = rpn_map.index_select(0, Variable(idx, requires_grad=False)) + labels = labels.index_select(0, idx) + labels = Variable(labels, requires_grad=False) + + rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets) + rpn_bbox_targets = rpn_bbox_targets.view(1, height, width, -1).permute(0, 3, 1, 2) + rpn_bbox_targets = Variable(rpn_bbox_targets, requires_grad=False) + + cls_crit = nn.CrossEntropyLoss() + reg_crit = nn.SmoothL1Loss() + cls_loss = cls_crit(rpn_map, labels) + # verify normalization and sigma + reg_loss = reg_crit(rpn_bbox_transform, rpn_bbox_targets) + + loss = cls_loss + reg_loss + return loss + +def _unmap(data, count, inds, fill=0): + """ Unmap a subset of item (data) back to the original set of items (of + size count) """ + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + +def show(img, boxes, label): + from PIL import Image, ImageDraw + import torchvision.transforms as transforms + #img, target = self.__getitem__(index) + img = transforms.ToPILImage()(img) + draw = ImageDraw.Draw(img) + for obj, t in zip(boxes, label): + #print(type(t)) + if t == 1: + #print(t) + draw.rectangle(obj[0:4].tolist(), outline=(255,0,0)) + #draw.text(obj[0:2].tolist(), cls[t], fill=(0,255,0)) + #else: + elif t == 0: + #pass + draw.rectangle(obj[0:4].tolist(), outline=(0,0,255)) + img.show() + + + +if __name__ == '__main__': + import torch + from voc import VOCDetection, TransformVOCDetectionAnnotation + import torchvision.transforms as transforms + + class RPNClassifier(nn.Container): + def __init__(self, n): + super(RPNClassifier, self).__init__() + self.m1 = nn.Conv2d(n, 18, 3, 1, 1) + self.m2 = nn.Conv2d(n, 36, 3, 1, 1) + + def forward(self, x): + return self.m1(x), self.m2(x) + + + rpn = RPN(RPNClassifier(3)) + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + class_to_ind = dict(zip(cls, range(len(cls)))) + + + train = 
VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + transform=transforms.ToTensor(), + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + + im, gt = train[100] + im0 = im + + im = im.unsqueeze(0) + + feats = Variable(torch.rand(1,3,im.size(2)/16, im.size(3)/16)) + print(feats.size()) + print(im.size()) + + #rpn.eval() + rpn.train() + import time + t = time.time() + #boxes, scores, loss, labels = rpn(im, feats, gt) + boxes, scores, loss = rpn(im, feats, gt) + print time.time() - t + print loss + loss.backward() + + show(im0, boxes.data, labels.data.int().tolist()) + + #from IPython import embed; embed() diff --git a/fast_rcnn/utils.py b/fast_rcnn/utils.py new file mode 100644 index 0000000000..a443cd1153 --- /dev/null +++ b/fast_rcnn/utils.py @@ -0,0 +1,135 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import torch +from torch.autograd import Variable +import numpy as np + +def bbox_transform(ex_rois, gt_rois): + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths + ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights + + gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 + gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 + gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths + gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = np.log(gt_widths / ex_widths) + targets_dh = np.log(gt_heights / ex_heights) + + targets = np.vstack( + (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + return targets + +def bbox_transform_inv(boxes, deltas): + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + dx = deltas[:, 0::4] + dy = deltas[:, 1::4] + dw = deltas[:, 2::4] + dh = deltas[:, 3::4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w + # y2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h + + return pred_boxes + +def clip_boxes(boxes, im_shape): + """ + Clip boxes to image boundaries. 
+ """ + + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + +def filter_boxes(boxes, min_size): + """Remove all boxes with any side smaller than min_size.""" + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep + + +# torch tensors +def bbox_overlaps(a, bb): + if isinstance(a, np.ndarray): + a = torch.from_numpy(a) + if isinstance(bb, np.ndarray): + bb = torch.from_numpy(bb) + + oo = [] + + for b in bb: + + x1 = a.select(1,0).clone() + x1[x1.lt(b[0])] = b[0] + y1 = a.select(1,1).clone() + y1[y1.lt(b[1])] = b[1] + x2 = a.select(1,2).clone() + x2[x2.gt(b[2])] = b[2] + y2 = a.select(1,3).clone() + y2[y2.gt(b[3])] = b[3] + + w = x2-x1+1 + h = y2-y1+1 + inter = torch.mul(w,h).float() + aarea = torch.mul((a.select(1,2)-a.select(1,0)+1), (a.select(1,3)-a.select(1,1)+1)).float() + barea = (b[2]-b[0]+1) * (b[3]-b[1]+1) + + # intersection over union overlap + o = torch.div(inter , (aarea+barea-inter)) + # set invalid entries to 0 overlap + o[w.lt(0)] = 0 + o[h.lt(0)] = 0 + + oo += [o] + + return torch.cat([o.view(-1,1) for o in oo],1) + +def to_var(x): + if isinstance(x, np.ndarray): + return Variable(torch.from_numpy(x), requires_grad=False) + elif torch.is_tensor(x): + return Variable(x, requires_grad=True) + elif isinstance(x, tuple): + t = [] + for i in x: + t.append(to_var(i)) + return t + elif isinstance(x, Variable): + return x diff --git a/fast_rcnn/voc.py b/fast_rcnn/voc.py new file mode 100644 index 0000000000..1eb0b0e0ed --- /dev/null +++ b/fast_rcnn/voc.py @@ -0,0 +1,144 @@ +import torch +import torch.utils.data as data +from PIL import Image, ImageDraw +import os +import os.path +import sys +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + +def _flip_box(boxes, width): + boxes = boxes.clone() + oldx1 = boxes[:, 0].clone() + oldx2 = boxes[:, 2].clone() + boxes[:, 0] = width - oldx2 - 1 + boxes[:, 2] = width - oldx1 - 1 + return boxes + +class TransformVOCDetectionAnnotation(object): + def __init__(self, class_to_ind, keep_difficult=False): + self.keep_difficult = keep_difficult + self.class_to_ind = class_to_ind + + def __call__(self, target): + boxes = [] + gt_classes = [] + for obj in target.iter('object'): + difficult = int(obj.find('difficult').text) == 1 + if not self.keep_difficult and difficult: + continue + name = obj.find('name').text.lower().strip() + bb = obj.find('bndbox') + bndbox = map(int, [bb.find('xmin').text, bb.find('ymin').text, + bb.find('xmax').text, bb.find('ymax').text]) + + boxes += [bndbox] + gt_classes += [self.class_to_ind[name]] + + size = target.find('size') + im_info = map(int,(size.find('height').text, size.find('width').text, 1)) + + res = { + 'boxes': torch.LongTensor(boxes), + 'gt_classes':gt_classes, + 'im_info': im_info + } + return res + +class VOCSegmentation(data.Dataset): + def __init__(self, root, image_set, transform=None, target_transform=None): + self.root = root + self.image_set = image_set + self.transform = transform + self.target_transform = target_transform + + dataset_name = 'VOC2007' + self._annopath = os.path.join(self.root, dataset_name, 
'SegmentationClass', '%s.png') + self._imgpath = os.path.join(self.root, dataset_name, 'JPEGImages', '%s.jpg') + self._imgsetpath = os.path.join(self.root, dataset_name, 'ImageSets', 'Segmentation', '%s.txt') + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip('\n') for x in self.ids] + + def __getitem__(self, index): + img_id = self.ids[index] + + target = Image.open(self._annopath % img_id)#.convert('RGB') + + img = Image.open(self._imgpath % img_id).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.ids) + + +class VOCDetection(data.Dataset): + def __init__(self, root, image_set, transform=None, target_transform=None): + self.root = root + self.image_set = image_set + self.transform = transform + self.target_transform = target_transform + + dataset_name = 'VOC2007' + self._annopath = os.path.join(self.root, dataset_name, 'Annotations', '%s.xml') + self._imgpath = os.path.join(self.root, dataset_name, 'JPEGImages', '%s.jpg') + self._imgsetpath = os.path.join(self.root, dataset_name, 'ImageSets', 'Main', '%s.txt') + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip('\n') for x in self.ids] + + def __getitem__(self, index): + img_id = self.ids[index] + + target = ET.parse(self._annopath % img_id).getroot() + + img = Image.open(self._imgpath % img_id).convert('RGB') + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.ids) + + def show(self, index): + img, target = self.__getitem__(index) + draw = ImageDraw.Draw(img) + for obj in target: + draw.rectangle(obj[0:4], outline=(255,0,0)) + draw.text(obj[0:2], obj[4], fill=(0,255,0)) + img.show() + +if __name__ == '__main__': + cls = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + class_to_ind = dict(zip(cls, range(len(cls)))) + + ds = VOCDetection('/home/francisco/work/datasets/VOCdevkit/', 'train', + target_transform=TransformVOCDetectionAnnotation(class_to_ind, False)) + print(len(ds)) + img, target = ds[0] + print(target) + #ds.show(1) + #dss = VOCSegmentation('/home/francisco/work/datasets/VOCdevkit/', 'train') + #img, target = dss[0] + + #img.show() + #print(target_transform(target))
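
The IoU helper in `utils.py` is what both the RPN anchor labelling (`rpn_targets`) and the Fast R-CNN proposal sampling (`_sample_rois`) rely on. A minimal sanity-check sketch, assuming it is run from the `fast_rcnn` directory with the same PyTorch version the rest of the code targets; the box coordinates are made up so the overlaps are easy to verify by hand:

```python
import torch
from utils import bbox_overlaps

# Two anchors against a single ground-truth box, using the inclusive
# (x2 - x1 + 1) box convention from utils.py.
anchors = torch.Tensor([[0, 0, 9, 9],      # 10x10 box, half covered by the gt
                        [5, 5, 14, 14]])   # 10x10 box, disjoint from the gt
gt = torch.Tensor([[0, 0, 4, 9]])          # 5x10 ground-truth box

print(bbox_overlaps(anchors, gt))
# anchors x gt IoU matrix: [[0.5], [0.0]]
# row 0: intersection 50 / union (100 + 50 - 50) = 0.5
```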