facebookresearch · willyfh · Feb 17, 2024
diff --git a/torchmultimodal/models/clip/image_encoder.py b/torchmultimodal/models/clip/image_encoder.py
@@ -235,7 +235,8 @@ class ResNetForCLIP(nn.Module):
     - The final pooling layer is a QKV attention instead of an average pool.
 
     Args:
-        layers (Tuple[int]):
+        layers (Tuple[int]): number of residual blocks in each stage
+            of the ResNet architecture
         output_dim (int): dimension of output tensor
         heads (int): number of heads in the attention pooling layer
         input_resolution (int): resolution of image input to encoder

diff --git a/torchmultimodal/models/clip/text_encoder.py b/torchmultimodal/models/clip/text_encoder.py
@@ -21,7 +21,7 @@ class CLIPTextEncoder(nn.Module):
 
     Args:
         embedding_dim (int): Embedding dimension for text and image encoders projections.
-        context_length (int): Maximum sequence length for Transforer.
+        context_length (int): Maximum sequence length for Transformer.
         vocab_size (int): Vocab size.
         width (int): Embedding dimension for Transformer encoder.
         dim_feedforward (int): Dimension of the feedfoward networks.