
Commit 8a13f93

deprecate dropout
1 parent e2b338b commit 8a13f93

File tree

1 file changed: +1, -37 lines


torchvision/models/swin_transformer.py

Lines changed: 1 addition & 37 deletions
@@ -121,8 +121,6 @@ def shifted_window_attention(
     window_size: List[int],
     num_heads: int,
     shift_size: List[int],
-    attention_dropout: float = 0.0,
-    dropout: float = 0.0,
     qkv_bias: Optional[Tensor] = None,
     proj_bias: Optional[Tensor] = None,
     logit_scale: Optional[torch.Tensor] = None,
@@ -138,8 +136,6 @@ def shifted_window_attention(
         window_size (List[int]): Window size.
         num_heads (int): Number of attention heads.
         shift_size (List[int]): Shift size for shifted window attention.
-        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
-        dropout (float): Dropout ratio of output. Default: 0.0.
         qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
         proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
         logit_scale (Tensor[out_dim], optional): Logit scale of cosine attention for Swin Transformer V2. Default: None.
@@ -206,11 +202,9 @@ def shifted_window_attention(
     attn = attn.view(-1, num_heads, x.size(1), x.size(1))
 
     attn = F.softmax(attn, dim=-1)
-    attn = F.dropout(attn, p=attention_dropout)
 
     x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), C)
     x = F.linear(x, proj_weight, proj_bias)
-    x = F.dropout(x, p=dropout)
 
     # reverse windows
     x = x.view(B, pad_H // window_size[0], pad_W // window_size[1], window_size[0], window_size[1], C)
@@ -241,17 +235,13 @@ def __init__(
         num_heads: int,
         qkv_bias: bool = True,
         proj_bias: bool = True,
-        attention_dropout: float = 0.0,
-        dropout: float = 0.0,
     ):
         super().__init__()
         if len(window_size) != 2 or len(shift_size) != 2:
             raise ValueError("window_size and shift_size must be of length 2")
         self.window_size = window_size
         self.shift_size = shift_size
         self.num_heads = num_heads
-        self.attention_dropout = attention_dropout
-        self.dropout = dropout
 
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.proj = nn.Linear(dim, dim, bias=proj_bias)
@@ -301,8 +291,6 @@ def forward(self, x: Tensor):
             self.window_size,
             self.num_heads,
             shift_size=self.shift_size,
-            attention_dropout=self.attention_dropout,
-            dropout=self.dropout,
             qkv_bias=self.qkv.bias,
             proj_bias=self.proj.bias,
         )
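
Aside (not part of the diff): with `attention_dropout`/`dropout` removed from `ShiftedWindowAttention`, a caller who still wants dropout on the attention output can re-add it by composing the module with `nn.Dropout`. A minimal sketch under that assumption; the wrapper class name, tensor shapes, and `p=0.1` below are illustrative, not from this commit:

import torch
import torch.nn as nn
from torchvision.models.swin_transformer import ShiftedWindowAttention

class AttentionWithOutputDropout(nn.Module):  # hypothetical helper, not in the diff
    def __init__(self, dim: int, window_size, shift_size, num_heads: int, p: float = 0.1):
        super().__init__()
        # Post-commit constructor: no attention_dropout/dropout keywords.
        self.attn = ShiftedWindowAttention(dim, window_size, shift_size, num_heads)
        self.drop = nn.Dropout(p)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Dropout applied by the caller, outside the attention module.
        return self.drop(self.attn(x))

x = torch.randn(1, 56, 56, 96)  # (B, H, W, C) layout used by the Swin blocks
y = AttentionWithOutputDropout(96, [7, 7], [0, 0], num_heads=3)(x)
print(y.shape)  # torch.Size([1, 56, 56, 96])
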
@@ -321,8 +309,6 @@ def __init__(
         num_heads: int,
         qkv_bias: bool = True,
         proj_bias: bool = True,
-        attention_dropout: float = 0.0,
-        dropout: float = 0.0,
         pretrained_window_size: Optional[List[int]] = None,
     ):
         if pretrained_window_size is None:
@@ -335,8 +321,6 @@ def __init__(
             num_heads,
             qkv_bias=qkv_bias,
             proj_bias=proj_bias,
-            attention_dropout=attention_dropout,
-            dropout=dropout,
         )
 
         self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
@@ -391,8 +375,6 @@ def forward(self, x: Tensor):
             self.window_size,
             self.num_heads,
             shift_size=self.shift_size,
-            attention_dropout=self.attention_dropout,
-            dropout=self.dropout,
             qkv_bias=self.qkv.bias,
             proj_bias=self.proj.bias,
             logit_scale=self.logit_scale,
@@ -408,8 +390,6 @@ class SwinTransformerBlock(nn.Module):
         window_size (List[int]): Window size.
         shift_size (List[int]): Shift size for shifted window attention.
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        dropout (float): Dropout rate. Default: 0.0.
-        attention_dropout (float): Attention dropout rate. Default: 0.0.
         stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
         norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
         attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttention
@@ -422,8 +402,6 @@ def __init__(
         window_size: List[int],
         shift_size: List[int],
         mlp_ratio: float = 4.0,
-        dropout: float = 0.0,
-        attention_dropout: float = 0.0,
         stochastic_depth_prob: float = 0.0,
         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
         attn_layer: Callable[..., nn.Module] = ShiftedWindowAttention,
@@ -438,13 +416,11 @@ def __init__(
             window_size,
             shift_size,
             num_heads,
-            attention_dropout=attention_dropout,
-            dropout=dropout,
             **kwargs,
         )
         self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
         self.norm2 = norm_layer(dim)
-        self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
+        self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None)
 
         for m in self.mlp.modules():
             if isinstance(m, nn.Linear):
@@ -467,8 +443,6 @@ class SwinTransformerBlockV2(SwinTransformerBlock):
         window_size (List[int]): Window size.
         shift_size (List[int]): Shift size for shifted window attention.
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        dropout (float): Dropout rate. Default: 0.0.
-        attention_dropout (float): Attention dropout rate. Default: 0.0.
         stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
         norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
         attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttentionV2.
@@ -482,8 +456,6 @@ def __init__(
         window_size: List[int],
         shift_size: List[int],
         mlp_ratio: float = 4.0,
-        dropout: float = 0.0,
-        attention_dropout: float = 0.0,
         stochastic_depth_prob: float = 0.0,
         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
         attn_layer: Callable[..., nn.Module] = ShiftedWindowAttentionV2,
@@ -495,8 +467,6 @@ def __init__(
             window_size,
             shift_size,
             mlp_ratio=mlp_ratio,
-            dropout=dropout,
-            attention_dropout=attention_dropout,
             stochastic_depth_prob=stochastic_depth_prob,
             norm_layer=norm_layer,
             attn_layer=attn_layer,
@@ -520,8 +490,6 @@ class SwinTransformer(nn.Module):
         num_heads (List(int)): Number of attention heads in different layers.
         window_size (List[int]): Window size.
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        dropout (float): Dropout rate. Default: 0.0.
-        attention_dropout (float): Attention dropout rate. Default: 0.0.
         stochastic_depth_prob (float): Stochastic depth rate. Default: 0.1.
         num_classes (int): Number of classes for classification head. Default: 1000.
         block (nn.Module, optional): SwinTransformer Block. Default: None.
@@ -537,8 +505,6 @@ def __init__(
         num_heads: List[int],
         window_size: List[int],
         mlp_ratio: float = 4.0,
-        dropout: float = 0.0,
-        attention_dropout: float = 0.0,
         stochastic_depth_prob: float = 0.1,
         num_classes: int = 1000,
         block: Callable[..., nn.Module] = SwinTransformerBlock,
@@ -584,8 +550,6 @@ def __init__(
                        window_size=window_size,
                        shift_size=[0 if i_layer % 2 == 0 else w // 2 for w in window_size],
                        mlp_ratio=mlp_ratio,
-                        dropout=dropout,
-                        attention_dropout=attention_dropout,
                        stochastic_depth_prob=sd_prob,
                        norm_layer=norm_layer,
                        **kwargs,
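
After this commit none of the constructors in this file accept `dropout` or `attention_dropout`, so stochastic depth is the remaining built-in regularization knob. A minimal usage sketch under that assumption; the concrete hyperparameters below (patch_size, embed_dim, depths, ...) are illustrative swin_t-style values, not taken from this diff:

import torch
from torchvision.models.swin_transformer import SwinTransformer

# Build the model without the removed dropout keywords.
model = SwinTransformer(
    patch_size=[4, 4],
    embed_dim=96,
    depths=[2, 2, 6, 2],
    num_heads=[3, 6, 12, 24],
    window_size=[7, 7],
    stochastic_depth_prob=0.2,  # remaining regularization knob
    num_classes=1000,
)
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])
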
