From 5802e68c3724c75e77da9ac524fc4aa510f4561e Mon Sep 17 00:00:00 2001 From: Adam Tupper Date: Fri, 8 Sep 2023 16:10:11 -0400 Subject: [PATCH] [Update] Simplify the dependencies (#157) * Remove sub-dependencies from requirements. Specify minimum versions of core dependencies. * Update the change log. --------- Co-authored-by: Adam Tupper --- CHANGE_LOG.md | 1 + requirements.txt | 69 ++------- semilearn/nets/vit/vit.py | 285 +++++++++++++++++++++++++++----------- 3 files changed, 215 insertions(+), 140 deletions(-) diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index ad22f311c..c1e12606b 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -1,5 +1,6 @@ # Change Log +* Simplified and improved the flexibility of the dependencies. * Fixes hard-coded repository path for Aim experiment tracking. ## 23/07/15/2023 Update diff --git a/requirements.txt b/requirements.txt index f656c0935..56cc61764 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,61 +1,18 @@ -absl-py==1.1.0 -cachetools==5.2.0 -certifi==2021.5.30 -charset-normalizer==2.1.0 -cycler==0.11.0 -filelock==3.7.1 -fonttools==4.33.3 -google-auth==2.9.0 -google-auth-oauthlib==0.4.6 -grpcio==1.53.0 -huggingface-hub==0.14.1 -idna==3.3 -imageio==2.19.3 -importlib-metadata==4.12.0 -joblib==1.2.0 -kiwisolver==1.4.3 -Markdown==3.3.7 -matplotlib==3.5.2 -networkx==2.6.3 -numpy -oauthlib==3.2.1 -olefile==0.46 -packaging==21.3 -Pillow==9.0.0 -progress==1.6 -protobuf==3.19.5 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pyparsing==3.0.9 -python-dateutil==2.8.2 -PyWavelets==1.3.0 -PyYAML==6.0 -regex==2022.6.2 -requests==2.31.0 -requests-oauthlib==1.3.1 -rsa==4.8 -ruamel.yaml==0.17.21 -ruamel.yaml.clib==0.2.6 -scikit-image==0.19.3 -scikit-learn==1.0.2 -scipy==1.10.0 -six==1.16.0 -tensorboard==2.9.1 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -threadpoolctl==3.1.0 -tifffile==2021.11.2 -timm==0.5.4 -tokenizers==0.12.1 +matplotlib>=3.5.2 +numpy>=1.24.4 +Pillow>=9.0.0 +progress>=1.6 +ruamel.yaml>=0.17.21 +ruamel.yaml.clib>=0.2.6 +scikit-image>=0.19.3 +scikit-learn>=1.0.2 +scipy>=1.10.0 +tensorboard>=2.9.1 +timm>=0.5.4 torch>=1.12.0 torchaudio>=0.12.0 torchvision>=0.13.0 -tqdm==4.64.0 -transformers==4.30.0 -typing-extensions==4.3.0 -urllib3==1.26.9 -Werkzeug==2.1.2 -zipp==3.8.0 -pynacl +tqdm>=4.64.0 +transformers>=4.30.0 wandb aim diff --git a/semilearn/nets/vit/vit.py b/semilearn/nets/vit/vit.py index 84b5aa327..4044f1f8b 100644 --- a/semilearn/nets/vit/vit.py +++ b/semilearn/nets/vit/vit.py @@ -1,24 +1,27 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
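Because requirements.txt above now uses minimum (>=) rather than exact (==) pins, an installed environment can legitimately drift ahead of the versions that were tested. The following is a minimal, hypothetical sketch (not part of this PR) of how one might confirm an environment still meets the new floors, assuming the `packaging` distribution is available:

# Hypothetical helper, not part of this PR: check installed versions against
# a subset of the new ">=" floors from requirements.txt.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

FLOORS = {
    "numpy": "1.24.4",
    "scikit-learn": "1.0.2",
    "timm": "0.5.4",
    "torch": "1.12.0",
    "transformers": "4.30.0",
}

def unmet_floors(floors=FLOORS):
    """Return (name, installed, required) for every package below its floor."""
    unmet = []
    for name, floor in floors.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            unmet.append((name, None, floor))
            continue
        if Version(installed) < Version(floor):
            unmet.append((name, installed, floor))
    return unmet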
-import math from functools import partial import torch import torch.nn as nn -import torch.nn.functional as F import torch.utils.checkpoint - -from timm.models.layers import DropPath, trunc_normal_ -from timm.models.layers.helpers import to_2tuple - from semilearn.nets.utils import load_checkpoint +from timm.models.layers import DropPath, to_2tuple class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + """2D Image to Patch Embedding""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) @@ -28,13 +31,12 @@ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_ self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = flatten - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() def forward(self, x): - B, C, H, W = x.shape - # assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") - # assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") x = self.proj(x) if self.flatten: x = x.flatten(2).transpose(1, 2) # BCHW -> BNC @@ -43,9 +45,16 @@ def forward(self, x): class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -67,12 +76,12 @@ def forward(self, x): class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0): super().__init__() - assert dim % num_heads == 0, 'dim should be divisible by num_heads' + assert dim % num_heads == 0, "dim should be divisible by num_heads" self.num_heads = num_heads head_dim = dim // num_heads - self.scale = head_dim ** -0.5 + self.scale = head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) @@ -81,8 +90,12 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.) 
def forward(self, x): B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) @@ -105,22 +118,47 @@ def forward(self, x): class Block(nn.Module): - def __init__( - self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., init_values=None, - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + init_values=None, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): super().__init__() self.norm1 = norm_layer(dim) - self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) - self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than + # dropout here + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
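The reformatted Attention.forward above keeps the usual multi-head shape handling. A small sketch of the shapes involved, assuming this module's Attention is in scope and dim is divisible by num_heads (values chosen for illustration):

import torch

attn = Attention(dim=192, num_heads=3, qkv_bias=True)
x = torch.randn(4, 256, 192)         # B, N, C
out = attn(x)
assert out.shape == (4, 256, 192)    # attention is shape-preserving

# Internally, qkv is reshaped to (3, B, num_heads, N, C // num_heads), so each
# of q, k, v has shape (4, 3, 256, 64) with head_dim = 192 // 3 = 64.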
else nn.Identity() + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.ls2 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() def forward(self, x): x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -128,31 +166,48 @@ def forward(self, x): return x - class VisionTransformer(nn.Module): - """ Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - - https://arxiv.org/abs/2010.11929 + """Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image + Recognition at Scale` (https://arxiv.org/abs/2010.11929) """ def __init__( - self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token', - embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., init_values=None, - embed_layer=PatchEmbed, norm_layer=None, act_layer=None, block_fn=Block): + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool="token", + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + init_values=None, + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + block_fn=Block, + ): """ Args: img_size (int, tuple): input image size patch_size (int, tuple): patch size in_chans (int): number of input channels num_classes (int): number of classes for classification head - global_pool (str): type of global pooling for final sequence (default: 'token') + global_pool (str): type of global pooling for final sequence (default: + 'token') embed_dim (int): embedding dimension depth (int): depth of transformer num_heads (int): number of attention heads mlp_ratio (int): ratio of mlp hidden dim to embedding dim qkv_bias (bool): enable bias for qkv if True - representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + representation_size (Optional[int]): enable and set representation layer + (pre-logits) to this value if set drop_rate (float): dropout rate attn_drop_rate (float): attention dropout rate drop_path_rate (float): stochastic depth rate @@ -163,39 +218,61 @@ def __init__( act_layer: (nn.Module): MLP activation layer """ super().__init__() - assert global_pool in ('', 'avg', 'token') + assert global_pool in ("", "avg", "token") norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) act_layer = act_layer or nn.GELU self.num_classes = num_classes self.global_pool = global_pool - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models self.num_tokens = 1 self.grad_checkpointing = False self.patch_embed = embed_layer( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) num_patches = self.patch_embed.num_patches self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim) + ) self.pos_drop = nn.Dropout(p=drop_rate) - dpr = [x.item() for x in 
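Block.forward above keeps the standard pre-norm residual form: an attention branch followed by an MLP branch, each wrapped in optional LayerScale and DropPath. A shape-only sketch with illustrative hyperparameters, assuming this module's Block is in scope:

import torch

block = Block(dim=192, num_heads=3, mlp_ratio=4.0, qkv_bias=True, drop_path=0.1)
x = torch.randn(4, 256, 192)
y = block(x)               # x + drop_path1(ls1(attn(norm1(x)))), then the MLP branch
assert y.shape == x.shape  # blocks are shape-preserving, so they can be stacked to any depth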
torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - self.blocks = nn.Sequential(*[ - block_fn( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, init_values=init_values, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer) - for i in range(depth)]) - use_fc_norm = self.global_pool == 'avg' + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + init_values=init_values, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + use_fc_norm = self.global_pool == "avg" self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() # Classifier Head self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() self.num_features = self.embed_dim - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) def extract(self, x): x = self.patch_embed(x) @@ -205,7 +282,6 @@ def extract(self, x): x = self.norm(x) return x - def forward(self, x, only_fc=False, only_feat=False, **kwargs): """ Args: @@ -213,35 +289,48 @@ def forward(self, x, only_fc=False, only_feat=False, **kwargs): only_fc: only use classifier, input should be features before classifier only_feat: only return pooled features """ - + if only_fc: return self.head(x) - + x = self.extract(x) if self.global_pool: - x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + x = x[:, 1:].mean(dim=1) if self.global_pool == "avg" else x[:, 0] x = self.fc_norm(x) if only_feat: return x output = self.head(x) - result_dict = {'logits':output, 'feat':x} + result_dict = {"logits": output, "feat": x} return result_dict def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def group_matcher(self, coarse=False, prefix=''): + return {"pos_embed", "cls_token"} + + def group_matcher(self, coarse=False, prefix=""): return dict( - stem=r'^{}cls_token|{}pos_embed|{}patch_embed'.format(prefix, prefix, prefix), # stem and embed - blocks=[(r'^{}blocks\.(\d+)'.format(prefix), None), (r'^{}norm'.format(prefix), (99999,))] + stem=r"^{}cls_token|{}pos_embed|{}patch_embed".format( + prefix, prefix, prefix + ), # stem and embed + blocks=[ + (r"^{}blocks\.(\d+)".format(prefix), None), + (r"^{}norm".format(prefix), (99999,)), + ], ) + def vit_tiny_patch2_32(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Tiny (Vit-Ti/2) - """ - model_kwargs = dict(img_size=32, patch_size=2, embed_dim=192, depth=12, num_heads=3, drop_path_rate=0.1, **kwargs) + """ViT-Tiny (Vit-Ti/2)""" + model_kwargs = dict( + img_size=32, + patch_size=2, + embed_dim=192, + depth=12, + num_heads=3, + drop_path_rate=0.1, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: model = load_checkpoint(model, pretrained_path) @@ -250,9 +339,16 @@ def vit_tiny_patch2_32(pretrained=False, pretrained_path=None, **kwargs): def vit_small_patch2_32(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Small (ViT-S/2) - """ - model_kwargs = dict(img_size=32, patch_size=2, embed_dim=384, depth=12, num_heads=6, drop_path_rate=0.2, **kwargs) + """ViT-Small (ViT-S/2)""" + model_kwargs = dict( + img_size=32, + patch_size=2, + 
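The only_fc / only_feat flags split VisionTransformer.forward at the pooled-feature boundary, so callers can reuse the backbone features and the classifier head separately. A minimal sketch using the vit_tiny_patch2_32 factory above (eval mode so DropPath and Dropout are disabled; the class count 10 is arbitrary):

import torch

model = vit_tiny_patch2_32(pretrained=False, num_classes=10).eval()
x = torch.randn(4, 3, 32, 32)

with torch.no_grad():
    out = model(x)                      # dict with 'logits' and 'feat'
    feat = model(x, only_feat=True)     # pooled features before the head
    logits = model(feat, only_fc=True)  # classifier head only

assert out["logits"].shape == (4, 10)
assert out["feat"].shape == (4, 192)    # embed_dim of the tiny variant
assert torch.allclose(logits, out["logits"])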
embed_dim=384, + depth=12, + num_heads=6, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: model = load_checkpoint(model, pretrained_path) @@ -260,32 +356,53 @@ def vit_small_patch2_32(pretrained=False, pretrained_path=None, **kwargs): def vit_small_patch16_224(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Small (ViT-S/16) - """ - model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, drop_path_rate=0.2, **kwargs) + """ViT-Small (ViT-S/16)""" + model_kwargs = dict( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: - model = load_checkpoint(model, pretrained_path) + model = load_checkpoint(model, pretrained_path) return model def vit_base_patch16_96(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. # noqa: E501 """ - model_kwargs = dict(img_size=96, patch_size=16, embed_dim=768, depth=12, num_heads=12, drop_path_rate=0.2, **kwargs) + model_kwargs = dict( + img_size=96, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: - model = load_checkpoint(model, pretrained_path) + model = load_checkpoint(model, pretrained_path) return model def vit_base_patch16_224(pretrained=False, pretrained_path=None, **kwargs): - """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. # noqa: E501 """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, drop_path_rate=0.2, **kwargs) + model_kwargs = dict( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + drop_path_rate=0.2, + **kwargs + ) model = VisionTransformer(**model_kwargs) if pretrained: - model = load_checkpoint(model, pretrained_path) - return model \ No newline at end of file + model = load_checkpoint(model, pretrained_path) + return model
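All of the factory functions above follow the same pattern: build VisionTransformer with fixed architecture kwargs, then optionally load weights through load_checkpoint. A hypothetical usage sketch (the checkpoint path below is a placeholder, not a file shipped with the repository):

# Placeholder path; supply a real checkpoint trained for this architecture.
model = vit_base_patch16_224(
    pretrained=True,
    pretrained_path="/path/to/vit_base_patch16_224.pth",
    num_classes=100,
)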