models_convit.py
# Copyright (c) 2022 Alpha-VL
# References:
# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
# DeiT: https://github.com/facebookresearch/deit
# MAE: https://github.com/facebookresearch/mae
# ConvMAE: https://github.com/Alpha-VL/ConvMAE
# --------------------------------------------------------
from functools import partial
import torch
import torch.nn as nn
import vision_transformer


class ConvViT(vision_transformer.ConvViT):
    """Vision Transformer with support for global average pooling."""

    def __init__(self, global_pool=False, **kwargs):
        super(ConvViT, self).__init__(**kwargs)
        self.global_pool = global_pool
        if self.global_pool:
            norm_layer = kwargs['norm_layer']
            embed_dim = kwargs['embed_dim']
            # Normalize the pooled feature of the last stage; embed_dim is a
            # list with one entry per stage.
            self.fc_norm = norm_layer(embed_dim[-1])
            del self.norm  # remove the original norm

    def forward_features(self, x):
        # Stage 1: convolutional patch embedding and conv blocks.
        x = self.patch_embed1(x)
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        # Stage 2: downsample, then the second conv stage.
        x = self.patch_embed2(x)
        for blk in self.blocks2:
            x = blk(x)
        # Stage 3: embed once more, flatten (B, C, H, W) -> (B, N, C), and
        # project the tokens before the transformer blocks.
        x = self.patch_embed3(x)
        x = x.flatten(2).permute(0, 2, 1)
        x = self.patch_embed4(x)
        x = x + self.pos_embed
        for blk in self.blocks3:
            x = blk(x)
        if self.global_pool:
            x = x.mean(dim=1)  # global average pool; this model has no cls token
            outcome = self.fc_norm(x)
        else:
            x = self.norm(x)
            outcome = x[:, 0]
        return outcome


def convit_base_patch16(**kwargs):
    model = ConvViT(
        img_size=[224, 56, 28], patch_size=[4, 2, 2], embed_dim=[256, 384, 768],
        depth=[2, 2, 11], num_heads=12, mlp_ratio=[8, 8, 4], qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def convit_large_patch16(**kwargs):
    model = ConvViT(
        img_size=[224, 56, 28], patch_size=[4, 2, 2], embed_dim=[384, 768, 1024],
        depth=[2, 2, 23], num_heads=16, mlp_ratio=[8, 8, 4], qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model
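

# --------------------------------------------------------
# Minimal usage sketch (illustrative addition, not part of the original file).
# It assumes the companion vision_transformer.py from the ConvMAE repo is
# importable and that its ConvViT accepts the keyword arguments used above.
if __name__ == "__main__":
    model = convit_base_patch16(global_pool=True)
    images = torch.randn(2, 3, 224, 224)  # dummy batch of two 224x224 RGB images
    with torch.no_grad():
        feats = model.forward_features(images)
    # With global_pool=True, each image is reduced to a single feature of size
    # embed_dim[-1] = 768, so feats should have shape (2, 768).
    print(feats.shape)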