@@ -88,6 +88,9 @@ def _cfg(url='', **kwargs):
         url='https://storage.googleapis.com/vit_models/augreg/'
             'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz',
         input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_base_patch8_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz'),
     'vit_large_patch32_224': _cfg(
         url='',  # no official model weights for this combo, only for in21k
     ),
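
Note on the new entry: the checkpoint URL is split over two adjacent string literals, which the Python parser joins into one string, so the config still holds a single augreg path. A minimal standalone illustration using the filename added in this hunk:

# Adjacent string literals are concatenated at parse time, so the 'url' value
# is one continuous string despite being wrapped across two source lines.
url = ('https://storage.googleapis.com/vit_models/augreg/'
       'B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz')
assert url.startswith('https://') and url.endswith('res_224.npz')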
@@ -118,6 +121,9 @@ def _cfg(url='', **kwargs):
     'vit_base_patch16_224_in21k': _cfg(
         url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
         num_classes=21843),
+    'vit_base_patch8_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
+        num_classes=21843),
     'vit_large_patch32_224_in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
         num_classes=21843),
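
Both config hunks follow the existing default_cfgs pattern: each key must match the name of a @register_model entrypoint defined further down the file, and _cfg merges the per-model overrides (url, num_classes, input_size, crop_pct, ...) into shared defaults. A rough sketch of that merging behaviour, using a hypothetical stand-in helper (the real _cfg in this file has more default keys):

def _cfg_sketch(url='', **kwargs):
    # Stand-in for the file's _cfg helper: shared defaults first,
    # per-model overrides win via the **kwargs merge.
    return dict(
        url=url, num_classes=1000, input_size=(3, 224, 224), crop_pct=0.9,
        **kwargs)

cfg = _cfg_sketch(
    url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
    num_classes=21843)
assert cfg['num_classes'] == 21843          # override applied
assert cfg['input_size'] == (3, 224, 224)   # default retained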
@@ -640,6 +646,16 @@ def vit_base_patch16_384(pretrained=False, **kwargs):
     return model
 
 
+@register_model
+def vit_base_patch8_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+    """
+    model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    model = _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
 @register_model
 def vit_large_patch32_224(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
@@ -756,6 +772,18 @@ def vit_base_patch16_224_in21k(pretrained=False, **kwargs):
     return model
 
 
+@register_model
+def vit_base_patch8_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Base model (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    """
+    model_kwargs = dict(
+        patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    model = _create_vision_transformer('vit_base_patch8_224_in21k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
 @register_model
 def vit_large_patch32_224_in21k(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).