
Commit cc21e13

NicolasHug authored and facebook-github-bot committed
[fbsync] Adding video accuracy for video_classification reference script (#6241)
Summary:

* Add ensembled video accuracy to the video_classification reference script
* Change the argument parser function to match the classification reference
* Fix typo: type -> dtype
* Use a custom Kinetics dataset
* Fix the dataset so it does not return start_pts
* Rename the dataset class and move video_idx to the end of the returned tuple
* Format with ufmt
* Use functional softmax; update the weights meta and use it to overwrite eval parameters
* Fix typo
* Document the eval parameters for now
* Change the video ResNet meta to use frame_rate=15 and reword the docs

Reviewed By: jdsgomes

Differential Revision: D37993423

fbshipit-source-id: e6ad9fa13c7916d541fb7bc9582650ba9c92b8e0
1 parent: 759dab3; commit: cc21e13

File tree: 5 files changed, +61 −24 lines changed

references/video_classification/datasets.py (new file)

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from typing import Tuple
+
+import torchvision
+from torch import Tensor
+
+
+class KineticsWithVideoId(torchvision.datasets.Kinetics):
+    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int, int]:
+        video, audio, info, video_idx = self.video_clips.get_clip(idx)
+        label = self.samples[video_idx][1]
+
+        if self.transform is not None:
+            video = self.transform(video)
+
+        return video, audio, label, video_idx
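
For context, a minimal usage sketch of the new dataset class; the root path and constructor arguments below are illustrative assumptions, not values taken from this commit:

# Hedged usage sketch -- "/data/kinetics400" and the constructor
# arguments are assumptions for illustration only.
from datasets import KineticsWithVideoId

dataset = KineticsWithVideoId(
    "/data/kinetics400",   # assumed dataset root
    frames_per_clip=16,
    num_classes="400",
    split="val",
)
# Each sample carries the index of its source video, so per-clip
# predictions can be grouped back into whole videos at eval time.
video, audio, label, video_idx = dataset[0]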

references/video_classification/train.py

Lines changed: 28 additions & 12 deletions
@@ -3,6 +3,7 @@
 import time
 import warnings

+import datasets
 import presets
 import torch
 import torch.utils.data

@@ -11,7 +12,7 @@
 import utils
 from torch import nn
 from torch.utils.data.dataloader import default_collate
-from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
+from torchvision.datasets.samplers import DistributedSampler, RandomClipSampler, UniformClipSampler


 def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq, scaler=None):

@@ -21,7 +22,7 @@ def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, devi
     metric_logger.add_meter("clips/s", utils.SmoothedValue(window_size=10, fmt="{value:.3f}"))

     header = f"Epoch: [{epoch}]"
-    for video, target in metric_logger.log_every(data_loader, print_freq, header):
+    for video, target, _ in metric_logger.log_every(data_loader, print_freq, header):
         start_time = time.time()
         video, target = video.to(device), target.to(device)
         with torch.cuda.amp.autocast(enabled=scaler is not None):

@@ -52,13 +53,25 @@ def evaluate(model, criterion, data_loader, device):
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = "Test:"
     num_processed_samples = 0
+    # Group and aggregate the per-clip outputs of each video
+    num_videos = len(data_loader.dataset.samples)
+    num_classes = len(data_loader.dataset.classes)
+    agg_preds = torch.zeros((num_videos, num_classes), dtype=torch.float32, device=device)
+    agg_targets = torch.zeros((num_videos), dtype=torch.int32, device=device)
     with torch.inference_mode():
-        for video, target in metric_logger.log_every(data_loader, 100, header):
+        for video, target, video_idx in metric_logger.log_every(data_loader, 100, header):
             video = video.to(device, non_blocking=True)
             target = target.to(device, non_blocking=True)
             output = model(video)
             loss = criterion(output, target)

+            # Use softmax to convert the outputs into prediction probabilities
+            preds = torch.softmax(output, dim=1)
+            for b in range(video.size(0)):
+                idx = video_idx[b].item()
+                agg_preds[idx] += preds[b].detach()
+                agg_targets[idx] = target[b].detach().item()
+
             acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
             # FIXME need to take into account that the datasets
             # could have been padded in distributed setup
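
The loop above ensembles clip-level predictions into a video-level score. A self-contained sketch of the same aggregation idea, using fabricated tensors rather than the reference script's data loader:

# Standalone sketch of clip-to-video ensembling; the tensors are made
# up for illustration and are not data from this commit.
import torch

num_videos, num_classes = 3, 5
agg_preds = torch.zeros((num_videos, num_classes))

clip_logits = torch.randn(4, num_classes)   # four clips
video_idx = torch.tensor([0, 0, 1, 2])      # the video each clip came from

probs = torch.softmax(clip_logits, dim=1)   # per-clip probabilities
for b in range(clip_logits.size(0)):
    agg_preds[video_idx[b]] += probs[b]     # sum probability mass per video

video_pred = agg_preds.argmax(dim=1)        # video-level predicted class
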
@@ -95,6 +108,11 @@ def evaluate(model, criterion, data_loader, device):
             top1=metric_logger.acc1, top5=metric_logger.acc5
         )
     )
+    # Reduce agg_preds and agg_targets from all GPUs and report the result
+    agg_preds = utils.reduce_across_processes(agg_preds)
+    agg_targets = utils.reduce_across_processes(agg_targets, op=torch.distributed.ReduceOp.MAX)
+    agg_acc1, agg_acc5 = utils.accuracy(agg_preds, agg_targets, topk=(1, 5))
+    print(" * Video Acc@1 {acc1:.3f} Video Acc@5 {acc5:.3f}".format(acc1=agg_acc1, acc5=agg_acc5))
     return metric_logger.acc1.global_avg
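
Why two different reduce ops: each rank only fills the slots for the clips it processed, so summing agg_preds merges the probability mass across GPUs, while an element-wise max on agg_targets recovers each video's label from whichever rank wrote it (unvisited slots stay zero and Kinetics labels are non-negative). A toy illustration with fabricated per-rank buffers:

# Toy illustration of the MAX-reduce on targets; rank_a/rank_b are
# made-up per-process buffers, not code from the reference script.
import torch

rank_a = torch.tensor([3, 0, 0])        # rank A saw video 0 (label 3)
rank_b = torch.tensor([0, 7, 0])        # rank B saw video 1 (label 7)
merged = torch.maximum(rank_a, rank_b)  # tensor([3, 7, 0])
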
@@ -110,7 +128,7 @@ def _get_cache_path(filepath, args):

 def collate_fn(batch):
     # remove audio from the batch
-    batch = [(d[0], d[2]) for d in batch]
+    batch = [(d[0], d[2], d[3]) for d in batch]
     return default_collate(batch)

@@ -146,7 +164,7 @@ def main(args):
     else:
         if args.distributed:
             print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
-        dataset = torchvision.datasets.Kinetics(
+        dataset = datasets.KineticsWithVideoId(
             args.data_path,
             frames_per_clip=args.clip_len,
             num_classes=args.kinetics_version,

@@ -183,7 +201,7 @@ def main(args):
     else:
         if args.distributed:
             print("It is recommended to pre-compute the dataset cache on a single-gpu first, as it will be faster")
-        dataset_test = torchvision.datasets.Kinetics(
+        dataset_test = datasets.KineticsWithVideoId(
            args.data_path,
            frames_per_clip=args.clip_len,
            num_classes=args.kinetics_version,

@@ -313,10 +331,10 @@ def main(args):
     print(f"Training time {total_time_str}")


-def parse_args():
+def get_args_parser(add_help=True):
     import argparse

-    parser = argparse.ArgumentParser(description="PyTorch Video Classification Training")
+    parser = argparse.ArgumentParser(description="PyTorch Video Classification Training", add_help=add_help)

     parser.add_argument("--data-path", default="/datasets01_101/kinetics/070618/", type=str, help="dataset path")
     parser.add_argument(
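
Returning the parser instead of the parsed args mirrors the classification reference and lets other tooling reuse the CLI definition. A sketch of two plausible consumers; the parent-parser composition is an assumed pattern, not something this commit adds:

# Sketch: reusing the returned parser; the composition example is an
# assumption for illustration.
import argparse
from train import get_args_parser

args = get_args_parser().parse_args()  # same behavior as the old parse_args()

# Compose into another CLI; add_help=False avoids a duplicate -h flag.
child = argparse.ArgumentParser(parents=[get_args_parser(add_help=False)])
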
@@ -387,11 +405,9 @@ def parse_args():
     # Mixed precision training parameters
     parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")

-    args = parser.parse_args()
-
-    return args
+    return parser


 if __name__ == "__main__":
-    args = parse_args()
+    args = get_args_parser().parse_args()
     main(args)

references/video_classification/utils.py

Lines changed: 2 additions & 2 deletions
@@ -253,12 +253,12 @@ def init_distributed_mode(args):
     setup_for_distributed(args.rank == 0)


-def reduce_across_processes(val):
+def reduce_across_processes(val, op=dist.ReduceOp.SUM):
     if not is_dist_avail_and_initialized():
         # nothing to sync, but we still convert to tensor for consistency with the distributed case.
         return torch.tensor(val)

     t = torch.tensor(val, device="cuda")
     dist.barrier()
-    dist.all_reduce(t)
+    dist.all_reduce(t, op=op)
     return t
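
A short sketch of the extended helper's contract: with no process group initialized it falls back to returning torch.tensor(val), and in the distributed case the new op argument selects how values are merged. The buffers below are fabricated for illustration:

# Assumes the reference script's utils module; in single-process runs
# the helper just returns torch.tensor(val).
import torch
import torch.distributed as dist
import utils

preds = torch.zeros(10, 400)                 # per-rank probability buffer
labels = torch.zeros(10, dtype=torch.int32)  # per-rank label buffer

total_preds = utils.reduce_across_processes(preds)  # SUM is the default op
merged_labels = utils.reduce_across_processes(labels, op=dist.ReduceOp.MAX)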

torchvision/models/video/mvit.py

Lines changed: 6 additions & 3 deletions
@@ -445,12 +445,15 @@ class MViT_V1_B_Weights(WeightsEnum):
             "min_temporal_size": 16,
             "categories": _KINETICS400_CATEGORIES,
             "recipe": "https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md",
-            "_docs": """These weights support 16-frame clip inputs and were ported from the paper.""",
+            "_docs": (
+                "The weights were ported from the paper. The accuracies are estimated on video-level "
+                "with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`."
+            ),
             "num_params": 36610672,
             "_metrics": {
                 "Kinetics-400": {
-                    "acc@1": 78.47,
-                    "acc@5": 93.65,
+                    "acc@1": 78.477,
+                    "acc@5": 93.582,
                 }
             },
         },

torchvision/models/video/resnet.py

Lines changed: 10 additions & 7 deletions
@@ -312,7 +312,10 @@ def _video_resnet(
     "min_size": (1, 1),
     "categories": _KINETICS400_CATEGORIES,
     "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification",
-    "_docs": """These weights reproduce closely the accuracy of the paper for 16-frame clip inputs.""",
+    "_docs": (
+        "The weights reproduce closely the accuracy of the paper. The accuracies are estimated on video-level "
+        "with parameters `frame_rate=15`, `clips_per_video=5`, and `clip_len=16`."
+    ),
 }


@@ -325,8 +328,8 @@ class R3D_18_Weights(WeightsEnum):
         "num_params": 33371472,
         "_metrics": {
             "Kinetics-400": {
-                "acc@1": 52.75,
-                "acc@5": 75.45,
+                "acc@1": 63.200,
+                "acc@5": 83.479,
             }
         },
     },
@@ -343,8 +346,8 @@ class MC3_18_Weights(WeightsEnum):
         "num_params": 11695440,
         "_metrics": {
             "Kinetics-400": {
-                "acc@1": 53.90,
-                "acc@5": 76.29,
+                "acc@1": 63.960,
+                "acc@5": 84.130,
             }
         },
     },
@@ -361,8 +364,8 @@ class R2Plus1D_18_Weights(WeightsEnum):
         "num_params": 31505325,
         "_metrics": {
             "Kinetics-400": {
-                "acc@1": 57.50,
-                "acc@5": 78.81,
+                "acc@1": 67.463,
+                "acc@5": 86.175,
             }
         },
     },
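
The refreshed numbers surface through each weights enum's meta dictionary, so downstream code can read the video-level metrics and the eval parameters documented above. A quick sketch using torchvision's multi-weights API:

# Sketch: inspecting the updated video-level metrics via the weights enum.
from torchvision.models.video import r3d_18, R3D_18_Weights

weights = R3D_18_Weights.KINETICS400_V1
print(weights.meta["_metrics"]["Kinetics-400"]["acc@1"])  # 63.2 after this change

model = r3d_18(weights=weights)  # downloads and loads the pretrained weights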
