pytorch
diff --git a/‎docs/source/models.rst
Lines changed: 1 addition & 0 deletions b/‎docs/source/models.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/models/swin_transformer.rst
Lines changed: 1 addition & 1 deletion b/‎docs/source/models/swin_transformer.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/models/video_swin_transformer.rst
Lines changed: 27 additions & 0 deletions b/‎docs/source/models/video_swin_transformer.rst
Lines changed: 27 additions & 0 deletions
diff --git a/‎test/expect/ModelTester.test_swin3d_b_expect.pkl
1.05 KB b/‎test/expect/ModelTester.test_swin3d_b_expect.pkl
1.05 KB
diff --git a/‎test/expect/ModelTester.test_swin3d_s_expect.pkl
1.05 KB b/‎test/expect/ModelTester.test_swin3d_s_expect.pkl
1.05 KB
diff --git a/‎test/expect/ModelTester.test_swin3d_t_expect.pkl
1.05 KB b/‎test/expect/ModelTester.test_swin3d_t_expect.pkl
1.05 KB
diff --git a/‎torchvision/models/swin_transformer.py
Lines changed: 3 additions & 1 deletion b/‎torchvision/models/swin_transformer.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎torchvision/models/video/__init__.py
Lines changed: 1 addition & 0 deletions b/‎torchvision/models/video/__init__.py
Lines changed: 1 addition & 0 deletions
@@ -518,6 +518,7 @@ pre-trained weights:
    models/video_mvit
    models/video_resnet
    models/video_s3d
+   models/video_swin_transformer
 
 |
 
 
@@ -15,7 +15,7 @@ Model builders
 --------------
 
 The following model builders can be used to instantiate a SwinTransformer model (original and V2) with and without pre-trained weights.
-All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` 
+All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer``
 base class. Please refer to the `source code
 <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_ for
 more details about this class.
 
@@ -0,0 +1,27 @@
+Video SwinTransformer
+=====================
+
+.. currentmodule:: torchvision.models.video
+
+The Video SwinTransformer model is based on the `Video Swin Transformer <https://arxiv.org/abs/2106.13230>`__ paper.
+
+.. betastatus:: video module
+
+
+Model builders
+--------------
+
+The following model builders can be used to instantiate a Video SwinTransformer model, with or
+without pre-trained weights. All the model builders internally rely on the
+``torchvision.models.video.swin_transformer.SwinTransformer3d`` base class. Please refer to the `source
+code
+<https://github.com/pytorch/vision/blob/main/torchvision/models/video/swin_transformer.py>`_ for
+more details about this class.
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    swin3d_t
+    swin3d_s
+    swin3d_b
@@ -494,6 +494,8 @@ def __init__(
         )
 
     def forward(self, x: Tensor):
+        # This is where V2 differs from V1: V2 applies norm after the attention,
+        # whereas V1 applies norm before the attention.
         x = x + self.stochastic_depth(self.norm1(self.attn(x)))
         x = x + self.stochastic_depth(self.norm2(self.mlp(x)))
         return x
@@ -587,7 +589,7 @@ def __init__(
 
         num_features = embed_dim * 2 ** (len(depths) - 1)
         self.norm = norm_layer(num_features)
-        self.permute = Permute([0, 3, 1, 2])
+        self.permute = Permute([0, 3, 1, 2])  # B H W C -> B C H W
         self.avgpool = nn.AdaptiveAvgPool2d(1)
         self.flatten = nn.Flatten(1)
         self.head = nn.Linear(num_features, num_classes)
 
@@ -1,3 +1,4 @@
 from .mvit import *
 from .resnet import *
 from .s3d import *
+from .swin_transformer import *
Original file line number	Diff line number	Diff line change
`@@ -518,6 +518,7 @@ pre-trained weights:`
`518`	`518`	`models/video_mvit`
`519`	`519`	`models/video_resnet`
`520`	`520`	`models/video_s3d`
	`521`	`+ models/video_swin_transformer`
`521`	`522`
`522`	`523`	`\|`
`523`	`524`