@@ -121,8 +121,6 @@ def shifted_window_attention(
     window_size: List[int],
     num_heads: int,
     shift_size: List[int],
-    attention_dropout: float = 0.0,
-    dropout: float = 0.0,
     qkv_bias: Optional[Tensor] = None,
     proj_bias: Optional[Tensor] = None,
     logit_scale: Optional[torch.Tensor] = None,
@@ -138,8 +136,6 @@ def shifted_window_attention(
         window_size (List[int]): Window size.
         num_heads (int): Number of attention heads.
         shift_size (List[int]): Shift size for shifted window attention.
-        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
-        dropout (float): Dropout ratio of output. Default: 0.0.
         qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
         proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
         logit_scale (Tensor[out_dim], optional): Logit scale of cosine attention for Swin Transformer V2. Default: None.
@@ -206,11 +202,9 @@ def shifted_window_attention(
     attn = attn.view(-1, num_heads, x.size(1), x.size(1))
 
     attn = F.softmax(attn, dim=-1)
-    attn = F.dropout(attn, p=attention_dropout)
 
     x = attn.matmul(v).transpose(1, 2).reshape(x.size(0), x.size(1), C)
     x = F.linear(x, proj_weight, proj_bias)
-    x = F.dropout(x, p=dropout)
 
     # reverse windows
     x = x.view(B, pad_H // window_size[0], pad_W // window_size[1], window_size[0], window_size[1], C)
@@ -241,17 +235,13 @@ def __init__(
         num_heads: int,
         qkv_bias: bool = True,
         proj_bias: bool = True,
-        attention_dropout: float = 0.0,
-        dropout: float = 0.0,
     ):
         super().__init__()
         if len(window_size) != 2 or len(shift_size) != 2:
             raise ValueError("window_size and shift_size must be of length 2")
         self.window_size = window_size
         self.shift_size = shift_size
         self.num_heads = num_heads
-        self.attention_dropout = attention_dropout
-        self.dropout = dropout
 
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.proj = nn.Linear(dim, dim, bias=proj_bias)
@@ -301,8 +291,6 @@ def forward(self, x: Tensor):
             self.window_size,
             self.num_heads,
             shift_size=self.shift_size,
-            attention_dropout=self.attention_dropout,
-            dropout=self.dropout,
             qkv_bias=self.qkv.bias,
             proj_bias=self.proj.bias,
         )
@@ -321,8 +309,6 @@ def __init__(
         num_heads: int,
         qkv_bias: bool = True,
         proj_bias: bool = True,
-        attention_dropout: float = 0.0,
-        dropout: float = 0.0,
         pretrained_window_size: Optional[List[int]] = None,
     ):
         if pretrained_window_size is None:
@@ -335,8 +321,6 @@ def __init__(
             num_heads,
             qkv_bias=qkv_bias,
             proj_bias=proj_bias,
-            attention_dropout=attention_dropout,
-            dropout=dropout,
         )
 
         self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
@@ -391,8 +375,6 @@ def forward(self, x: Tensor):
             self.window_size,
             self.num_heads,
             shift_size=self.shift_size,
-            attention_dropout=self.attention_dropout,
-            dropout=self.dropout,
             qkv_bias=self.qkv.bias,
             proj_bias=self.proj.bias,
             logit_scale=self.logit_scale,
@@ -408,8 +390,6 @@ class SwinTransformerBlock(nn.Module):
         window_size (List[int]): Window size.
         shift_size (List[int]): Shift size for shifted window attention.
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        dropout (float): Dropout rate. Default: 0.0.
-        attention_dropout (float): Attention dropout rate. Default: 0.0.
         stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
         norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
         attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttention
@@ -422,8 +402,6 @@ def __init__(
         window_size: List[int],
         shift_size: List[int],
         mlp_ratio: float = 4.0,
-        dropout: float = 0.0,
-        attention_dropout: float = 0.0,
         stochastic_depth_prob: float = 0.0,
         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
         attn_layer: Callable[..., nn.Module] = ShiftedWindowAttention,
@@ -438,13 +416,11 @@ def __init__(
             window_size,
             shift_size,
             num_heads,
-            attention_dropout=attention_dropout,
-            dropout=dropout,
             **kwargs,
         )
         self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
         self.norm2 = norm_layer(dim)
-        self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
+        self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None)
 
         for m in self.mlp.modules():
             if isinstance(m, nn.Linear):
@@ -467,8 +443,6 @@ class SwinTransformerBlockV2(SwinTransformerBlock):
         window_size (List[int]): Window size.
         shift_size (List[int]): Shift size for shifted window attention.
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        dropout (float): Dropout rate. Default: 0.0.
-        attention_dropout (float): Attention dropout rate. Default: 0.0.
         stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
         norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
         attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttentionV2.
@@ -482,8 +456,6 @@ def __init__(
         window_size: List[int],
         shift_size: List[int],
         mlp_ratio: float = 4.0,
-        dropout: float = 0.0,
-        attention_dropout: float = 0.0,
         stochastic_depth_prob: float = 0.0,
         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
         attn_layer: Callable[..., nn.Module] = ShiftedWindowAttentionV2,
@@ -495,8 +467,6 @@ def __init__(
             window_size,
             shift_size,
             mlp_ratio=mlp_ratio,
-            dropout=dropout,
-            attention_dropout=attention_dropout,
             stochastic_depth_prob=stochastic_depth_prob,
             norm_layer=norm_layer,
             attn_layer=attn_layer,
@@ -520,8 +490,6 @@ class SwinTransformer(nn.Module):
         num_heads (List(int)): Number of attention heads in different layers.
         window_size (List[int]): Window size.
         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
-        dropout (float): Dropout rate. Default: 0.0.
-        attention_dropout (float): Attention dropout rate. Default: 0.0.
         stochastic_depth_prob (float): Stochastic depth rate. Default: 0.1.
         num_classes (int): Number of classes for classification head. Default: 1000.
         block (nn.Module, optional): SwinTransformer Block. Default: None.
@@ -537,8 +505,6 @@ def __init__(
         num_heads: List[int],
         window_size: List[int],
         mlp_ratio: float = 4.0,
-        dropout: float = 0.0,
-        attention_dropout: float = 0.0,
         stochastic_depth_prob: float = 0.1,
         num_classes: int = 1000,
         block: Callable[..., nn.Module] = SwinTransformerBlock,
@@ -584,8 +550,6 @@ def __init__(
                         window_size=window_size,
                         shift_size=[0 if i_layer % 2 == 0 else w // 2 for w in window_size],
                         mlp_ratio=mlp_ratio,
-                        dropout=dropout,
-                        attention_dropout=attention_dropout,
                         stochastic_depth_prob=sd_prob,
                         norm_layer=norm_layer,
                         **kwargs,
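For reference, below is a minimal, self-contained sketch of what the windowed attention core computes after this change. The helper name `window_attention_core` and the shapes used are illustrative only and not part of the patch; window partitioning, the cyclic shift, attention masking, and the relative position bias of the real `shifted_window_attention` are omitted. The point is the hunk at old lines 206-216: softmax now feeds the value matmul and output projection directly, with no `F.dropout` calls in between.

```python
import torch
import torch.nn.functional as F


def window_attention_core(x, qkv_weight, proj_weight, num_heads):
    # x: (num_windows * B, window_area, C) -- tokens already partitioned into windows
    B_, N, C = x.shape
    head_dim = C // num_heads

    # joint QKV projection (bias-free here for brevity), split into heads
    qkv = F.linear(x, qkv_weight).reshape(B_, N, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]

    # scaled dot-product attention within each window
    attn = (q * head_dim**-0.5).matmul(k.transpose(-2, -1))
    attn = F.softmax(attn, dim=-1)  # previously followed by F.dropout(attn, p=attention_dropout)

    x = attn.matmul(v).transpose(1, 2).reshape(B_, N, C)
    return F.linear(x, proj_weight)  # previously followed by F.dropout(x, p=dropout)


# Example: 4 windows of 7x7 = 49 tokens, embedding dim 96, 3 heads
out = window_attention_core(torch.randn(4, 49, 96), torch.randn(3 * 96, 96), torch.randn(96, 96), num_heads=3)
assert out.shape == (4, 49, 96)
```

Note that callers which previously passed `dropout=` or `attention_dropout=` to `SwinTransformer`, the block classes, or the attention modules would need to drop those keyword arguments once this patch lands.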