From 993aa8db39eff5f95a5e1b3b094309aac4eaa1e1 Mon Sep 17 00:00:00 2001
From: zyan3
Date: Wed, 2 Oct 2019 21:50:15 -0700
Subject: [PATCH 1/4] Move sampler into TV core. Update UniformClipSampler

---
 test/test_datasets_samplers.py            | 87 +++++++++++++++++++
 test/test_datasets_video_utils.py         | 30 -------
 torchvision/datasets/samplers/__init__.py |  3 +
 .../datasets/samplers/clip_sampler.py     | 28 ++++--
 4 files changed, 110 insertions(+), 38 deletions(-)
 create mode 100644 test/test_datasets_samplers.py
 create mode 100644 torchvision/datasets/samplers/__init__.py
 rename references/video_classification/sampler.py => torchvision/datasets/samplers/clip_sampler.py (81%)

diff --git a/test/test_datasets_samplers.py b/test/test_datasets_samplers.py
new file mode 100644
index 00000000000..904b547375a
--- /dev/null
+++ b/test/test_datasets_samplers.py
@@ -0,0 +1,87 @@
+import contextlib
+import sys
+import os
+import torch
+import unittest
+
+from torchvision import io
+from torchvision.datasets.samplers import RandomClipSampler, UniformClipSampler
+from torchvision.datasets.video_utils import VideoClips, unfold
+from torchvision import get_video_backend
+
+from common_utils import get_tmp_dir
+
+
+@contextlib.contextmanager
+def get_list_of_videos(num_videos=5, sizes=None, fps=None):
+    with get_tmp_dir() as tmp_dir:
+        names = []
+        for i in range(num_videos):
+            if sizes is None:
+                size = 5 * (i + 1)
+            else:
+                size = sizes[i]
+            if fps is None:
+                f = 5
+            else:
+                f = fps[i]
+            data = torch.randint(0, 255, (size, 300, 400, 3), dtype=torch.uint8)
+            name = os.path.join(tmp_dir, "{}.mp4".format(i))
+            names.append(name)
+            io.write_video(name, data, fps=f)
+
+        yield names
+
+
+class Tester(unittest.TestCase):
+    def test_random_clip_sampler(self):
+        with get_list_of_videos(num_videos=3, sizes=[25, 25, 25]) as video_list:
+            video_clips = VideoClips(video_list, 5, 5)
+            sampler = RandomClipSampler(video_clips, 3)
+            self.assertEqual(len(sampler), 3 * 3)
+            indices = torch.tensor(list(iter(sampler)))
+            videos = indices // 5
+            v_idxs, count = torch.unique(videos, return_counts=True)
+            self.assertTrue(v_idxs.equal(torch.tensor([0, 1, 2])))
+            self.assertTrue(count.equal(torch.tensor([3, 3, 3])))
+
+    def test_random_clip_sampler_unequal(self):
+        with get_list_of_videos(num_videos=3, sizes=[10, 25, 25]) as video_list:
+            video_clips = VideoClips(video_list, 5, 5)
+            sampler = RandomClipSampler(video_clips, 3)
+            self.assertEqual(len(sampler), 2 + 3 + 3)
+            indices = list(iter(sampler))
+            self.assertIn(0, indices)
+            self.assertIn(1, indices)
+            # remove elements of the first video, to simplify testing
+            indices.remove(0)
+            indices.remove(1)
+            indices = torch.tensor(indices) - 2
+            videos = indices // 5
+            v_idxs, count = torch.unique(videos, return_counts=True)
+            self.assertTrue(v_idxs.equal(torch.tensor([0, 1])))
+            self.assertTrue(count.equal(torch.tensor([3, 3])))
+
+    def test_uniform_clip_sampler(self):
+        with get_list_of_videos(num_videos=3, sizes=[25, 25, 25]) as video_list:
+            video_clips = VideoClips(video_list, 5, 5)
+            sampler = UniformClipSampler(video_clips, 3)
+            self.assertEqual(len(sampler), 3 * 3)
+            indices = torch.tensor(list(iter(sampler)))
+            videos = indices // 5
+            v_idxs, count = torch.unique(videos, return_counts=True)
+            self.assertTrue(v_idxs.equal(torch.tensor([0, 1, 2])))
+            self.assertTrue(count.equal(torch.tensor([3, 3, 3])))
+            self.assertTrue(indices.equal(torch.tensor([0, 2, 4, 5, 7, 9, 10, 12, 14])))
+
+    def test_uniform_clip_sampler_insufficient_clips(self):
+        with get_list_of_videos(num_videos=3, sizes=[10, 25, 25]) as video_list:
+            video_clips = VideoClips(video_list, 5, 5)
+            sampler = UniformClipSampler(video_clips, 3)
+            self.assertEqual(len(sampler), 3 * 3)
+            indices = torch.tensor(list(iter(sampler)))
+            self.assertTrue(indices.equal(torch.tensor([0, 1, 1, 2, 4, 6, 7, 9, 11])))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_datasets_video_utils.py b/test/test_datasets_video_utils.py
index a9cb7ab50ef..ccca068d367 100644
--- a/test/test_datasets_video_utils.py
+++ b/test/test_datasets_video_utils.py
@@ -83,36 +83,6 @@ def test_video_clips(self):
                 self.assertEqual(video_idx, v_idx)
                 self.assertEqual(clip_idx, c_idx)
 
-    @unittest.skip("Moved to reference scripts for now")
-    def test_video_sampler(self):
-        with get_list_of_videos(num_videos=3, sizes=[25, 25, 25]) as video_list:
-            video_clips = VideoClips(video_list, 5, 5)
-            sampler = RandomClipSampler(video_clips, 3)  # noqa: F821
-            self.assertEqual(len(sampler), 3 * 3)
-            indices = torch.tensor(list(iter(sampler)))
-            videos = indices // 5
-            v_idxs, count = torch.unique(videos, return_counts=True)
-            self.assertTrue(v_idxs.equal(torch.tensor([0, 1, 2])))
-            self.assertTrue(count.equal(torch.tensor([3, 3, 3])))
-
-    @unittest.skip("Moved to reference scripts for now")
-    def test_video_sampler_unequal(self):
-        with get_list_of_videos(num_videos=3, sizes=[10, 25, 25]) as video_list:
-            video_clips = VideoClips(video_list, 5, 5)
-            sampler = RandomClipSampler(video_clips, 3)  # noqa: F821
-            self.assertEqual(len(sampler), 2 + 3 + 3)
-            indices = list(iter(sampler))
-            self.assertIn(0, indices)
-            self.assertIn(1, indices)
-            # remove elements of the first video, to simplify testing
-            indices.remove(0)
-            indices.remove(1)
-            indices = torch.tensor(indices) - 2
-            videos = indices // 5
-            v_idxs, count = torch.unique(videos, return_counts=True)
-            self.assertTrue(v_idxs.equal(torch.tensor([0, 1])))
-            self.assertTrue(count.equal(torch.tensor([3, 3])))
-
     @unittest.skipIf(not io.video._av_available(), "this test requires av")
     @unittest.skipIf('win' in sys.platform, 'temporarily disabled on Windows')
     def test_video_clips_custom_fps(self):
diff --git a/torchvision/datasets/samplers/__init__.py b/torchvision/datasets/samplers/__init__.py
new file mode 100644
index 00000000000..870322d39b4
--- /dev/null
+++ b/torchvision/datasets/samplers/__init__.py
@@ -0,0 +1,3 @@
+from .clip_sampler import DistributedSampler, UniformClipSampler, RandomClipSampler
+
+__all__ = ('DistributedSampler', 'UniformClipSampler', 'RandomClipSampler')
diff --git a/references/video_classification/sampler.py b/torchvision/datasets/samplers/clip_sampler.py
similarity index 81%
rename from references/video_classification/sampler.py
rename to torchvision/datasets/samplers/clip_sampler.py
index b92dad013c6..07731bb5141 100644
--- a/references/video_classification/sampler.py
+++ b/torchvision/datasets/samplers/clip_sampler.py
@@ -60,33 +60,45 @@ def set_epoch(self, epoch):
 
 class UniformClipSampler(torch.utils.data.Sampler):
     """
-    Samples at most `max_video_clips_per_video` clips for each video, equally spaced
+    Sample `num_clips_per_video` clips for each video, equally spaced.
+    When the number of unique clips in the video is fewer than `num_clips_per_video`,
+    the clips are repeated until `num_clips_per_video` clips are collected.
+
     Arguments:
         video_clips (VideoClips): video clips to sample from
-        max_clips_per_video (int): maximum number of clips to be sampled per video
+        num_clips_per_video (int): number of clips to be sampled per video
     """
-    def __init__(self, video_clips, max_clips_per_video):
+    def __init__(self, video_clips, num_clips_per_video):
         if not isinstance(video_clips, torchvision.datasets.video_utils.VideoClips):
             raise TypeError("Expected video_clips to be an instance of VideoClips, "
                             "got {}".format(type(video_clips)))
         self.video_clips = video_clips
-        self.max_clips_per_video = max_clips_per_video
+        self.num_clips_per_video = num_clips_per_video
 
     def __iter__(self):
        idxs = []
        s = 0
-        # select at most max_clips_per_video for each video, uniformly spaced
+        # select num_clips_per_video for each video, uniformly spaced
        for c in self.video_clips.clips:
            length = len(c)
-            step = max(length // self.max_clips_per_video, 1)
-            sampled = torch.arange(length)[::step] + s
+            if length == 0:
+                # corner case where video decoding fails
+                continue
+
+            sampled = (
+                torch.linspace(s, s + length - 1, steps=self.num_clips_per_video)
+                .round()
+                .to(torch.int64)
+            )
            s += length
            idxs.append(sampled)
        idxs = torch.cat(idxs).tolist()
        return iter(idxs)
 
     def __len__(self):
-        return sum(min(len(c), self.max_clips_per_video) for c in self.video_clips.clips)
+        return sum(
+            self.num_clips_per_video for c in self.video_clips.clips if len(c) > 0
+        )
 
 
 class RandomClipSampler(torch.utils.data.Sampler):

From 3e7a33eddf67daec1c0bd2f086c359626862b085 Mon Sep 17 00:00:00 2001
From: Francisco Massa
Date: Thu, 3 Oct 2019 16:17:27 +0200
Subject: [PATCH 2/4] Fix reference training script

---
 references/video_classification/train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/references/video_classification/train.py b/references/video_classification/train.py
index 192babf62dc..74852c2f721 100644
--- a/references/video_classification/train.py
+++ b/references/video_classification/train.py
@@ -11,9 +11,10 @@
 import torchvision
 import torchvision.datasets.video_utils
 from torchvision import transforms
+from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
 
 import utils
-from sampler import DistributedSampler, UniformClipSampler, RandomClipSampler
+
 from scheduler import WarmupMultiStepLR
 import transforms as T
 

From a77a78e088a6b2b5dc77ab670c0d7907a3b681fc Mon Sep 17 00:00:00 2001
From: Francisco Massa
Date: Thu, 3 Oct 2019 18:03:19 +0200
Subject: [PATCH 3/4] Skip test if pyav not available

---
 test/test_datasets_samplers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_datasets_samplers.py b/test/test_datasets_samplers.py
index 904b547375a..752af54afa3 100644
--- a/test/test_datasets_samplers.py
+++ b/test/test_datasets_samplers.py
@@ -33,6 +33,7 @@ def get_list_of_videos(num_videos=5, sizes=None, fps=None):
         yield names
 
 
+@unittest.skipIf(not io.video._av_available(), "this test requires av")
 class Tester(unittest.TestCase):
     def test_random_clip_sampler(self):
         with get_list_of_videos(num_videos=3, sizes=[25, 25, 25]) as video_list:

From 8116255fa0ec072cf10d0be826a5ec78fc70b1c9 Mon Sep 17 00:00:00 2001
From: zyan3
Date: Thu, 3 Oct 2019 14:45:06 -0700
Subject: [PATCH 4/4] Change interpolation from round() to floor() as
 round(0.5) behaves differently between py2 and py3

---
 test/test_datasets_samplers.py               | 2 +-
torchvision/datasets/samplers/clip_sampler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_datasets_samplers.py b/test/test_datasets_samplers.py index 752af54afa3..f99c63e65d3 100644 --- a/test/test_datasets_samplers.py +++ b/test/test_datasets_samplers.py @@ -81,7 +81,7 @@ def test_uniform_clip_sampler_insufficient_clips(self): sampler = UniformClipSampler(video_clips, 3) self.assertEqual(len(sampler), 3 * 3) indices = torch.tensor(list(iter(sampler))) - self.assertTrue(indices.equal(torch.tensor([0, 1, 1, 2, 4, 6, 7, 9, 11]))) + self.assertTrue(indices.equal(torch.tensor([0, 0, 1, 2, 4, 6, 7, 9, 11]))) if __name__ == '__main__': diff --git a/torchvision/datasets/samplers/clip_sampler.py b/torchvision/datasets/samplers/clip_sampler.py index 07731bb5141..3d4c788fc61 100644 --- a/torchvision/datasets/samplers/clip_sampler.py +++ b/torchvision/datasets/samplers/clip_sampler.py @@ -87,7 +87,7 @@ def __iter__(self): sampled = ( torch.linspace(s, s + length - 1, steps=self.num_clips_per_video) - .round() + .floor() .to(torch.int64) ) s += length
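
Editor's note: the snippet below is not part of the patch series. It is a minimal sketch that mirrors the index arithmetic `UniformClipSampler.__iter__` performs after patch 4 (`linspace` followed by `floor()`), using assumed per-video clip counts instead of a real `VideoClips` object, so it runs without video files or PyAV. The helper name `uniform_clip_indices` is illustrative, and the clip counts `[2, 5, 5]` are chosen to match `test_uniform_clip_sampler_insufficient_clips` above.

```python
import torch


def uniform_clip_indices(clip_lengths, num_clips_per_video):
    # Mirrors UniformClipSampler.__iter__: for each video, pick
    # num_clips_per_video equally spaced indices out of the flat,
    # concatenated clip list, repeating clips when a video has fewer
    # unique clips than requested.
    idxs = []
    s = 0  # flat index of the current video's first clip
    for length in clip_lengths:
        if length == 0:
            # corner case where video decoding fails: skip the video
            continue
        sampled = (
            torch.linspace(s, s + length - 1, steps=num_clips_per_video)
            # floor() rather than round(): round(0.5) behaves differently
            # between py2 and py3, as noted in patch 4
            .floor()
            .to(torch.int64)
        )
        s += length
        idxs.append(sampled)
    return torch.cat(idxs).tolist()


# Videos with 2, 5, and 5 clips, sampling 3 clips per video:
print(uniform_clip_indices([2, 5, 5], 3))
# -> [0, 0, 1, 2, 4, 6, 7, 9, 11], the expected indices in
#    test_uniform_clip_sampler_insufficient_clips after patch 4
```

One observation on the design: because `linspace(s, s + length - 1, steps=n)` has integer endpoints, the first and last clip of each video are always included, and `floor()` only affects the interior points, which is why only the second expected index changes (1 to 0) between patch 1 and patch 4.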