import paddle
import paddle.nn as nn

paddle.set_device('cpu')


class Patch_embedding(nn.Layer):
    """Split the image into patches, project them to embed_dim, and prepend the
    class token and the distillation token before adding position embeddings."""
    def __init__(self, embed_dim, in_channels, patch_size, image_size):
        super().__init__()
        # number of patches, e.g. (224 // 16) ** 2 = 196
        n_patches = (image_size // patch_size) * (image_size // patch_size)
        self.class_token = paddle.create_parameter(
            shape=[1, 1, embed_dim], dtype='float32',
            default_initializer=nn.initializer.Constant(0))
        self.distill_token = paddle.create_parameter(
            shape=[1, 1, embed_dim], dtype='float32',
            default_initializer=nn.initializer.TruncatedNormal(std=.02))
        # +2 positions for the class token and the distillation token
        self.position_embedding = paddle.create_parameter(
            shape=[1, n_patches + 2, embed_dim], dtype='float32',
            default_initializer=nn.initializer.TruncatedNormal(std=.04))
        # non-overlapping patch projection implemented as a strided convolution
        self.patch_embedding = nn.Conv2D(in_channels, embed_dim,
                                         kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        # broadcast the learnable tokens across the batch
        class_tokens = self.class_token.expand([x.shape[0], -1, -1])
        distill_tokens = self.distill_token.expand([x.shape[0], -1, -1])
        x = self.patch_embedding(x)     # [N, embed_dim, H/patch, W/patch]
        x = x.flatten(2)                # [N, embed_dim, n_patches]
        x = x.transpose([0, 2, 1])      # [N, n_patches, embed_dim]
        x = paddle.concat([class_tokens, distill_tokens, x], axis=1)
        x = x + self.position_embedding
        return x
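
# Shape walk-through for the settings used in main() (patch_size=16, image_size=224,
# embed_dim=768): an input of [N, 3, 224, 224] becomes [N, 768, 14, 14] after the
# patch convolution, [N, 196, 768] after flatten/transpose, and [N, 198, 768] once
# the class token and the distillation token are prepended.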
class Encode(nn.Layer):
    """One Transformer encoder block: pre-norm multi-head attention and a
    pre-norm MLP, each wrapped in a residual connection."""
    def __init__(self, embed_dim, num_heads=4):
        super().__init__()
        self.attn = Attention(embed_dim, num_heads)
        self.attn_norm = nn.LayerNorm(embed_dim)
        self.mlp = Mlp(embed_dim)
        self.mlp_norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # attention sub-layer with residual connection
        h = x
        x = self.attn_norm(x)
        x = self.attn(x)
        x = h + x
        # MLP sub-layer with residual connection
        h = x
        x = self.mlp_norm(x)
        x = self.mlp(x)
        x = h + x
        return x
class Encodes(nn.Layer):
    """Stack of `depth` independent encoder blocks (each with its own weights)."""
    def __init__(self, embed_dim, depth=3):
        super().__init__()
        self.layers = nn.LayerList([Encode(embed_dim) for _ in range(depth)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        # return the encoded class token and distillation token
        return x[:, 0], x[:, 1]
class Attention(nn.Layer):
    """Multi-head self-attention."""
    def __init__(self, embed_dim, num_heads, qkv_bias=None, qk_scale=None):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.all_head_dim = self.head_dim * num_heads
        # a single projection produces q, k and v in one pass
        self.qkv = nn.Linear(embed_dim, self.all_head_dim * 3,
                             bias_attr=False if qkv_bias is None else qkv_bias)
        self.scale = self.head_dim ** -0.5 if qk_scale is None else qk_scale
        self.softmax = nn.Softmax(-1)
        self.proj = nn.Linear(self.all_head_dim, embed_dim)

    def transpose_head_dim(self, x):
        # [N, tokens, all_head_dim] -> [N, num_heads, tokens, head_dim]
        new_shape = x.shape[:-1] + [self.num_heads, self.head_dim]
        x = x.reshape(new_shape)
        x = x.transpose([0, 2, 1, 3])
        return x

    def forward(self, x):
        qkv = self.qkv(x).chunk(3, -1)
        q, k, v = map(self.transpose_head_dim, qkv)
        attn = paddle.matmul(q, k, transpose_y=True)  # [N, num_heads, tokens, tokens]
        attn = attn * self.scale                      # scaled dot-product attention
        attn = self.softmax(attn)
        out = paddle.matmul(attn, v)                  # [N, num_heads, tokens, head_dim]
        out = out.transpose([0, 2, 1, 3])
        out = out.flatten(2)                          # [N, tokens, all_head_dim]
        out = self.proj(out)
        return out
class Mlp(nn.Layer):
    """Feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout."""
    def __init__(self, embed_dim, mlp_ratio=4.0, dropout=0.):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, int(mlp_ratio * embed_dim))
        self.fc2 = nn.Linear(int(mlp_ratio * embed_dim), embed_dim)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x
class DeiT(nn.Layer):
    """DeiT: a ViT-style backbone with an extra distillation token and a second
    classification head attached to it."""
    def __init__(self, embed_dim, in_channels, patch_size, image_size, num_classes):
        super().__init__()
        self.patch_embedding = Patch_embedding(embed_dim, in_channels, patch_size, image_size)
        self.encodes = Encodes(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)          # classifier on the class token
        self.distill_head = nn.Linear(embed_dim, num_classes)  # classifier on the distillation token

    def forward(self, x):
        x = self.patch_embedding(x)
        x, x_distill = self.encodes(x)
        x = self.head(x)
        x_distill = self.distill_head(x_distill)
        if self.training:
            # during training both outputs are returned so the distillation
            # head can be supervised by a teacher network
            return x, x_distill
        else:
            # at inference, average the predictions of the two heads
            return (x + x_distill) / 2
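
# A minimal sketch of how the two training-time outputs could be combined with a
# teacher network for hard-label distillation. The `teacher` model, the data
# arguments and the equal 0.5/0.5 weighting are illustrative assumptions, not
# part of the model definition above.
def distillation_loss_sketch(model, teacher, images, labels):
    logits_cls, logits_distill = model(images)            # model must be in train() mode
    with paddle.no_grad():
        teacher_labels = teacher(images).argmax(axis=-1)  # hard pseudo-labels from the teacher
    ce = nn.CrossEntropyLoss()
    loss_cls = ce(logits_cls, labels)                     # class head vs. ground truth
    loss_distill = ce(logits_distill, teacher_labels)     # distillation head vs. teacher
    return 0.5 * loss_cls + 0.5 * loss_distill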

def main():
    model = DeiT(768, 3, 16, 224, 1000)
    print(model)
    paddle.summary(model, (4, 3, 224, 224))


if __name__ == '__main__':
    main()