From 5802e68c3724c75e77da9ac524fc4aa510f4561e Mon Sep 17 00:00:00 2001 From: Adam Tupper Date: Fri, 8 Sep 2023 16:10:11 -0400 Subject: [PATCH] [Update] Simplify the dependencies (#157) * Remove sub-dependencies from requirements. Specify minimum versions of core dependencies. * Update the change log. --------- Co-authored-by: Adam Tupper --- CHANGE_LOG.md | 1 + requirements.txt | 69 ++------- semilearn/nets/vit/vit.py | 285 +++++++++++++++++++++++++++----------- 3 files changed, 215 insertions(+), 140 deletions(-) diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index ad22f311c..c1e12606b 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -1,5 +1,6 @@ # Change Log +* Simplified and improved the flexibility of the dependencies. * Fixes hard-coded repository path for Aim experiment tracking. ## 23/07/15/2023 Update diff --git a/requirements.txt b/requirements.txt index f656c0935..56cc61764 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,61 +1,18 @@ -absl-py==1.1.0 -cachetools==5.2.0 -certifi==2021.5.30 -charset-normalizer==2.1.0 -cycler==0.11.0 -filelock==3.7.1 -fonttools==4.33.3 -google-auth==2.9.0 -google-auth-oauthlib==0.4.6 -grpcio==1.53.0 -huggingface-hub==0.14.1 -idna==3.3 -imageio==2.19.3 -importlib-metadata==4.12.0 -joblib==1.2.0 -kiwisolver==1.4.3 -Markdown==3.3.7 -matplotlib==3.5.2 -networkx==2.6.3 -numpy -oauthlib==3.2.1 -olefile==0.46 -packaging==21.3 -Pillow==9.0.0 -progress==1.6 -protobuf==3.19.5 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pyparsing==3.0.9 -python-dateutil==2.8.2 -PyWavelets==1.3.0 -PyYAML==6.0 -regex==2022.6.2 -requests==2.31.0 -requests-oauthlib==1.3.1 -rsa==4.8 -ruamel.yaml==0.17.21 -ruamel.yaml.clib==0.2.6 -scikit-image==0.19.3 -scikit-learn==1.0.2 -scipy==1.10.0 -six==1.16.0 -tensorboard==2.9.1 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -threadpoolctl==3.1.0 -tifffile==2021.11.2 -timm==0.5.4 -tokenizers==0.12.1 +matplotlib>=3.5.2 +numpy>=1.24.4 +Pillow>=9.0.0 +progress>=1.6 +ruamel.yaml>=0.17.21 +ruamel.yaml.clib>=0.2.6 +scikit-image>=0.19.3 +scikit-learn>=1.0.2 +scipy>=1.10.0 +tensorboard>=2.9.1 +timm>=0.5.4 torch>=1.12.0 torchaudio>=0.12.0 torchvision>=0.13.0 -tqdm==4.64.0 -transformers==4.30.0 -typing-extensions==4.3.0 -urllib3==1.26.9 -Werkzeug==2.1.2 -zipp==3.8.0 -pynacl +tqdm>=4.64.0 +transformers>=4.30.0 wandb aim diff --git a/semilearn/nets/vit/vit.py b/semilearn/nets/vit/vit.py index 84b5aa327..4044f1f8b 100644 --- a/semilearn/nets/vit/vit.py +++ b/semilearn/nets/vit/vit.py @@ -1,24 +1,27 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
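Because requirements.txt above now uses minimum (>=) rather than exact (==) pins, an installed environment can legitimately drift ahead of the versions that were tested. The following is a minimal, hypothetical sketch (not part of this PR) of how one might confirm an environment still meets the new floors, assuming the `packaging` distribution is available:

# Hypothetical helper, not part of this PR: check installed versions against
# a subset of the new ">=" floors from requirements.txt.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

FLOORS = {
    "numpy": "1.24.4",
    "scikit-learn": "1.0.2",
    "timm": "0.5.4",
    "torch": "1.12.0",
    "transformers": "4.30.0",
}

def unmet_floors(floors=FLOORS):
    """Return (name, installed, required) for every package below its floor."""
    unmet = []
    for name, floor in floors.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            unmet.append((name, None, floor))
            continue
        if Version(installed) < Version(floor):
            unmet.append((name, installed, floor))
    return unmet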
-import math from functools import partial import torch import torch.nn as nn -import torch.nn.functional as F import torch.utils.checkpoint - -from timm.models.layers import DropPath, trunc_normal_ -from timm.models.layers.helpers import to_2tuple - from semilearn.nets.utils import load_checkpoint +from timm.models.layers import DropPath, to_2tuple class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + """2D Image to Patch Embedding""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) @@ -28,13 +31,12 @@ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_ self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = flatten - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() def forward(self, x): - B, C, H, W = x.shape - # assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") - # assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") x = self.proj(x) if self.flatten: x = x.flatten(2).transpose(1, 2) # BCHW -> BNC @@ -43,9 +45,16 @@ def forward(self, x): class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -67,12 +76,12 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0): super().__init__() - assert dim % num_heads == 0, 'dim should be divisible by num_heads' + assert dim % num_heads == 0, "dim should be divisible by num_heads" self.num_heads = num_heads head_dim = dim // num_heads - self.scale = head_dim ** -0.5 + self.scale = head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) @@ -81,8 +90,12 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.) 
def forward(self, x): B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) @@ -105,22 +118,47 @@ def forward(self, x): class Block(nn.Module): - def __init__( - self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., init_values=None, - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + init_values=None, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): super().__init__() self.norm1 = norm_layer(dim) - self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) - self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than + # dropout here + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
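The reformatted Attention.forward above keeps the usual multi-head shape handling. A small sketch of the shapes involved, assuming this module's Attention is in scope and dim is divisible by num_heads (values chosen for illustration):

import torch

attn = Attention(dim=192, num_heads=3, qkv_bias=True)
x = torch.randn(4, 256, 192)         # B, N, C
out = attn(x)
assert out.shape == (4, 256, 192)    # attention is shape-preserving

# Internally, qkv is reshaped to (3, B, num_heads, N, C // num_heads), so each
# of q, k, v has shape (4, 3, 256, 64) with head_dim = 192 // 3 = 64.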
else nn.Identity() + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.ls2 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() def forward(self, x): x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -128,31 +166,48 @@ def forward(self, x): return x - class VisionTransformer(nn.Module): - """ Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - - https://arxiv.org/abs/2010.11929 + """Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image + Recognition at Scale` (https://arxiv.org/abs/2010.11929) """ def __init__( - self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token', - embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., init_values=None, - embed_layer=PatchEmbed, norm_layer=None, act_layer=None, block_fn=Block): + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool="token", + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + init_values=None, + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + block_fn=Block, + ): """ Args: img_size (int, tuple): input image size patch_size (int, tuple): patch size in_chans (int): number of input channels num_classes (int): number of classes for classification head - global_pool (str): type of global pooling for final sequence (default: 'token') + global_pool (str): type of global pooling for final sequence (default: + 'token') embed_dim (int): embedding dimension depth (int): depth of transformer num_heads (int): number of attention heads mlp_ratio (int): ratio of mlp hidden dim to embedding dim qkv_bias (bool): enable bias for qkv if True - representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + representation_size (Optional[int]): enable and set representation layer + (pre-logits) to this value if set drop_rate (float): dropout rate attn_drop_rate (float): attention dropout rate drop_path_rate (float): stochastic depth rate @@ -163,39 +218,61 @@ def __init__( act_layer: (nn.Module): MLP activation layer """ super().__init__() - assert global_pool in ('', 'avg', 'token') + assert global_pool in ("", "avg", "token") norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) act_layer = act_layer or nn.GELU self.num_classes = num_classes self.global_pool = global_pool - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models self.num_tokens = 1 self.grad_checkpointing = False self.patch_embed = embed_layer( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) num_patches = self.patch_embed.num_patches self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim) + ) self.pos_drop = nn.Dropout(p=drop_rate) - dpr = [x.item() for x in 
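Block.forward above keeps the standard pre-norm residual form: an attention branch followed by an MLP branch, each wrapped in optional LayerScale and DropPath. A shape-only sketch with illustrative hyperparameters, assuming this module's Block is in scope:

import torch

block = Block(dim=192, num_heads=3, mlp_ratio=4.0, qkv_bias=True, drop_path=0.1)
x = torch.randn(4, 256, 192)
y = block(x)               # x + drop_path1(ls1(attn(norm1(x)))), then the MLP branch
assert y.shape == x.shape  # blocks are shape-preserving, so they can be stacked to any depth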
torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - self.blocks = nn.Sequential(*[ - block_fn( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, init_values=init_values, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer) - for i in range(depth)]) - use_fc_norm = self.global_pool == 'avg' + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + init_values=init_values, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + use_fc_norm = self.global_pool == "avg" self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() # Classifier Head self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() self.num_features = self.embed_dim - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) def extract(self, x): x = self.patch_embed(x) @@ -205,7 +282,6 @@ def extract(self, x): x = self.norm(x) return x - def forward(self, x, only_fc=False, only_feat=False, **kwargs): """ Args: @@ -213,35 +289,48 @@ def forward(self, x, only_fc=False, only_feat=False, **kwargs): only_fc: only use classifier, input should be features before classifier only_feat: only return pooled features """ - + if only_fc: return self.head(x) - + x = self.extract(x) if self.global_pool: - x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + x = x[:, 1:].mean(dim=1) if self.global_pool == "avg" else x[:, 0] x = self.fc_norm(x) if only_feat: return x output = self.head(x) - result_dict = {'logits':output, 'feat':x} + result_dict = {"logits": output, "feat": x} return result_dict def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def group_matcher(self, coarse=False, prefix=''): + return {"pos_embed", "cls_token"} + + def group_matcher(self, coarse=False, prefix=""): return dict( - stem=r'^{}cls_token|{}pos_embed|{}patch_embed'.format(prefix, prefix, prefix), # stem and embed - blocks=[(r'^{}blocks\.(\d+)'.format(prefix), None), (r'^{}norm'.format(prefix), (99999,))] + stem=r"^{}cls_token|{}pos_embed|{}patch_embed".format( + prefix, prefix, prefix + ), # stem and embed + blocks=[ + (r"^{}blocks\.(\d+)".format(prefix), None), + (r"^{}norm".format(prefix), (99999,)), + ], ) + def vit_tiny_patch2_32(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Tiny (Vit-Ti/2) - """ - model_kwargs = dict(img_size=32, patch_size=2, embed_dim=192, depth=12, num_heads=3, drop_path_rate=0.1, **kwargs) + """ViT-Tiny (Vit-Ti/2)""" + model_kwargs = dict( + img_size=32, + patch_size=2, + embed_dim=192, + depth=12, + num_heads=3, + drop_path_rate=0.1, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: model = load_checkpoint(model, pretrained_path) @@ -250,9 +339,16 @@ def vit_tiny_patch2_32(pretrained=False, pretrained_path=None, **kwargs): def vit_small_patch2_32(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Small (ViT-S/2) - """ - model_kwargs = dict(img_size=32, patch_size=2, embed_dim=384, depth=12, num_heads=6, drop_path_rate=0.2, **kwargs) + """ViT-Small (ViT-S/2)""" + model_kwargs = dict( + img_size=32, + patch_size=2, + 
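The only_fc / only_feat flags split VisionTransformer.forward at the pooled-feature boundary, so callers can reuse the backbone features and the classifier head separately. A minimal sketch using the vit_tiny_patch2_32 factory above (eval mode so DropPath and Dropout are disabled; the class count 10 is arbitrary):

import torch

model = vit_tiny_patch2_32(pretrained=False, num_classes=10).eval()
x = torch.randn(4, 3, 32, 32)

with torch.no_grad():
    out = model(x)                      # dict with 'logits' and 'feat'
    feat = model(x, only_feat=True)     # pooled features before the head
    logits = model(feat, only_fc=True)  # classifier head only

assert out["logits"].shape == (4, 10)
assert out["feat"].shape == (4, 192)    # embed_dim of the tiny variant
assert torch.allclose(logits, out["logits"])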
embed_dim=384, + depth=12, + num_heads=6, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: model = load_checkpoint(model, pretrained_path) @@ -260,32 +356,53 @@ def vit_small_patch2_32(pretrained=False, pretrained_path=None, **kwargs): def vit_small_patch16_224(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Small (ViT-S/16) - """ - model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, drop_path_rate=0.2, **kwargs) + """ViT-Small (ViT-S/16)""" + model_kwargs = dict( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: - model = load_checkpoint(model, pretrained_path) + model = load_checkpoint(model, pretrained_path) return model def vit_base_patch16_96(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. # noqa: E501 """ - model_kwargs = dict(img_size=96, patch_size=16, embed_dim=768, depth=12, num_heads=12, drop_path_rate=0.2, **kwargs) + model_kwargs = dict( + img_size=96, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: - model = load_checkpoint(model, pretrained_path) + model = load_checkpoint(model, pretrained_path) return model def vit_base_patch16_224(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. # noqa: E501 """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, drop_path_rate=0.2, **kwargs) + model_kwargs = dict( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: - model = load_checkpoint(model, pretrained_path) - return model \ No newline at end of file + model = load_checkpoint(model, pretrained_path) + return model
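All of the factory functions above follow the same pattern: build VisionTransformer with fixed architecture kwargs, then optionally load weights through load_checkpoint. A hypothetical usage sketch (the checkpoint path below is a placeholder, not a file shipped with the repository):

# Placeholder path; supply a real checkpoint trained for this architecture.
model = vit_base_patch16_224(
    pretrained=True,
    pretrained_path="/path/to/vit_base_patch16_224.pth",
    num_classes=100,
)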