Skip to content

Commit 5293f47

Browse files
committed
added detection_collate docstring and refactored AnnotationTransform
1 parent 68c1da5 commit 5293f47

File tree

1 file changed

+32
-11
lines changed

1 file changed

+32
-11
lines changed

torchvision/datasets/voc.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,17 @@ class AnnotationTransform(object):
8484
(default: alphabetic indexing of VOC's 20 classes)
8585
keep_difficult (bool, optional): keep difficult instances or not
8686
(default: False)
87+
channels (int): number of channels
88+
height (int): height
89+
width (int): width
8790
"""
8891

8992
def __init__(self, class_to_ind=None, keep_difficult=False):
9093
self.class_to_ind = class_to_ind or dict(
9194
zip(VOC_CLASSES, range(len(VOC_CLASSES))))
9295
self.keep_difficult = keep_difficult
9396

94-
def __call__(self, target):
97+
def __call__(self, target, channels, height, width):
9598
"""
9699
Arguments:
97100
target (annotation) : the target annotation to be made usable
@@ -108,13 +111,18 @@ def __call__(self, target):
108111
bbox = obj.find('bndbox')
109112

110113
# [xmin, ymin, xmax, ymax]
111-
bndbox = [int(bb.text) - 1 for bb in bbox]
114+
bndbox = []
115+
for i, cur_bb in enumerate(bbox):
116+
bb_sz = int(cur_bb.text) - 1
117+
bb_sz = bb_sz/width if i%2 == 0 else bb_sz/height # scale x-coords by width, y-coords by height
118+
bndbox.append(bb_sz)
119+
112120
label_ind = self.class_to_ind[name]
113121
bndbox.append(label_ind)
114122
res += [bndbox] # [xmin, ymin, xmax, ymax, ind]
115123

116124
return res # [[xmin, ymin, xmax, ymax, ind], ... ]
117-
# torch.Tensor(res)
125+
118126

119127
class VOCDetection(data.Dataset):
120128
"""VOC Detection Dataset Object
@@ -169,8 +177,7 @@ def __len__(self):
169177
return len(self.ids)
170178

171179
def show(self, index, subparts=False):
172-
'''Shows an image with its ground truth boxes overlaid
173-
optionally
180+
'''Shows an image with its ground truth boxes overlaid (subpart boxes optional)
174181
175182
Note: not using self.__getitem__(), as any transformations passed in
176183
could mess up this functionality.
@@ -179,18 +186,19 @@ def show(self, index, subparts=False):
179186
index (int): index of img to show
180187
subparts (bool, optional): whether or not to display subpart
181188
bboxes of ground truths
189+
(default: False)
182190
'''
183191
img_id = self.ids[index]
184192
target = ET.parse(self._annopath % img_id).getroot()
185193
img = Image.open(self._imgpath % img_id).convert('RGB')
186194
draw = ImageDraw.Draw(img)
187195
i = 0
188196
bndboxs = []
189-
classes = dict()
197+
classes = dict() # maps class name to a class number
190198
for obj in target.iter('object'):
191199
bbox = obj.find('bndbox')
192200
name = obj.find('name').text.lower().strip()
193-
if not name in classes:
201+
if name not in classes:
194202
classes[name] = i
195203
i += 1
196204
bndboxs.append((name, [int(bb.text) - 1 for bb in bbox]))
@@ -199,7 +207,7 @@ def show(self, index, subparts=False):
199207
name = part.find('name').text.lower().strip()
200208
bbox = part.find('bndbox')
201209
bndboxs.append((name, [int(bb.text) - 1 for bb in bbox]))
202-
if not name in classes:
210+
if name not in classes:
203211
classes[name] = i
204212
i += 1
205213
for name, bndbox in bndboxs:
@@ -209,14 +217,27 @@ def show(self, index, subparts=False):
209217
img.show()
210218
return img
211219

220+
212221
def detection_collate(batch):
222+
"""Custom collate fn for dealing with batches of images that have a different
223+
number of associated object annotations (bounding boxes).
224+
225+
Arguments:
226+
batch: (tuple) A tuple of tensor images and lists of annotations
227+
228+
Return:
229+
A tuple containing:
230+
1) (tensor) batch of images stacked on their 0 dim
231+
2) (list of tensors) annotations for a given image are stacked on 0 dim
232+
"""
213233
targets = []
214234
imgs = []
215-
for i,sample in enumerate(batch):
216-
for j, tup in enumerate(sample):
235+
for _, sample in enumerate(batch):
236+
for _, tup in enumerate(sample):
217237
if torch.is_tensor(tup):
218238
imgs.append(tup)
219239
elif isinstance(tup, type([])):
220-
targets.append([torch.Tensor(x) for x in tup])
240+
annos = [torch.Tensor(a) for a in tup]
241+
targets.append(torch.stack(annos, 0))
221242

222243
return (torch.stack(imgs, 0), targets)

0 commit comments

Comments
 (0)