From eacb4ed948db3d59dda689efd884cfb5d7ce0d31 Mon Sep 17 00:00:00 2001 From: Sepehr Sameni Date: Tue, 18 Jul 2023 15:24:50 -0700 Subject: [PATCH 01/11] fix typo in description of args (#562) --- src/training/params.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/training/params.py b/src/training/params.py index 31c841791..c33312f80 100644 --- a/src/training/params.py +++ b/src/training/params.py @@ -370,13 +370,13 @@ def parse_args(args): "--lock-text-unlocked-layers", type=int, default=0, - help="Leave last n image tower layer groups unlocked.", + help="Leave last n text tower layer groups unlocked.", ) parser.add_argument( "--lock-text-freeze-layer-norm", default=False, action='store_true', - help="Freeze BatchNorm running stats in image tower for any locked layers.", + help="Freeze BatchNorm running stats in text tower for any locked layers.", ) parser.add_argument( "--log-every-n-steps", From 24ddefb37fc4892f6a0c975b732226fe8a9a8613 Mon Sep 17 00:00:00 2001 From: Giovanni Puccetti Date: Wed, 19 Jul 2023 00:25:32 +0200 Subject: [PATCH 02/11] change clip loss (#561) --- src/open_clip/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_clip/loss.py b/src/open_clip/loss.py index 3a8bfb901..0dd048935 100644 --- a/src/open_clip/loss.py +++ b/src/open_clip/loss.py @@ -159,7 +159,7 @@ def __init__( def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False): - clip_loss = 0 + clip_loss = torch.tensor(0) if self.clip_loss_weight: clip_loss = super().forward(image_features, text_features, logit_scale) From 67e5e5ec8741281eb9b30f640c26f91c666308b7 Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Thu, 27 Jul 2023 14:15:49 -0700 Subject: [PATCH 03/11] Fix typo: "close openclip" -> "clone openclip" (#582) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e4ac9955..aa3c975b5 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ You can then install openclip for training with `pip install 'open_clip_torch[tr #### Development -If you want to make changes to contribute code, you can close openclip then run `make install` in openclip folder (after creating a virtualenv) +If you want to make changes to contribute code, you can clone openclip then run `make install` in openclip folder (after creating a virtualenv) Install pip PyTorch as per https://pytorch.org/get-started/locally/ From 2f55cd939695c341f3785ae45299cb53fba10a65 Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Tue, 8 Aug 2023 16:44:29 -0700 Subject: [PATCH 04/11] Fix malformed `--train-data` argument example (#592) Unbalanced quotation marks --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aa3c975b5..f7d18739c 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ In the paper, they also finetuned without the patch dropout at the end. You can #### Multiple data sources OpenCLIP supports using multiple data sources, by separating different data paths with `::`. -For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::/data/LAION-400M/{00000..41455}.tar"`. +For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::'/data/LAION-400M/{00000..41455}.tar'`. Using `--dataset-resampled` is recommended for these cases. 
By default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source. From f190703d847b7b234dbeb8265d7a69d7e7e4e996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Ilharco=20Magalh=C3=A3es?= Date: Tue, 8 Aug 2023 17:24:14 -0700 Subject: [PATCH 05/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f7d18739c..0d14bd079 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ In the paper, they also finetuned without the patch dropout at the end. You can #### Multiple data sources OpenCLIP supports using multiple data sources, by separating different data paths with `::`. -For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::'/data/LAION-400M/{00000..41455}.tar'`. +For instance, to train on CC12M and on LAION, one might use `--train-data "/data/cc12m/cc12m-train-{0000..2175}.tar::/data/LAION-400M/{00000..41455}.tar"`. Using `--dataset-resampled` is recommended for these cases. By default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source. From ab1ae01731885ef2436a40e3c8c147347f18153e Mon Sep 17 00:00:00 2001 From: Chris Wendler Date: Sat, 26 Aug 2023 18:45:31 +0200 Subject: [PATCH 06/11] Update README.md (#607) make --dataset-resampled flag visually more prominent --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d14bd079..c25c07db2 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ numerical results as the naïve method. #### Epochs -For larger datasets (eg Laion2B), we recommend setting --train-num-samples to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with --dataset-resampled to do sampling with replacement. This allows having frequent checkpoints to evaluate more often. +For larger datasets (eg Laion2B), we recommend setting `--train-num-samples` to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with `--dataset-resampled` to do sampling with replacement. This allows having frequent checkpoints to evaluate more often. 
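For a concrete sense of the fractional-epoch arithmetic recommended above, here is a minimal sketch; the dataset size below is an assumed placeholder (simply 16 times the quoted `--train-num-samples` value), not an authoritative LAION-2B count:

```python
# A minimal sketch of the fractional-"epoch" setup described above.
# ASSUMPTION: dataset_size is a placeholder; use your dataset's real sample count.
dataset_size = 2_170_337_248            # assumed (= 16 * 135_646_078), for illustration
epoch_fraction = 16                     # checkpoint/evaluate 16 times per full pass

train_num_samples = dataset_size // epoch_fraction
print(train_num_samples)                # 135646078 -> pass as --train-num-samples

# With --dataset-resampled, each "epoch" draws train_num_samples samples with
# replacement, so anything that happens once per epoch (checkpointing, evaluation)
# happens 16x more often than it would with a full pass over the data.
```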
#### Patch Dropout From 8556945f09cd149cbef69f8394308d8b41dca596 Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Mon, 28 Aug 2023 12:31:04 -0700 Subject: [PATCH 07/11] Fix `text.transformer.embeddings.position_ids` key error (#595) * fix create_model & test * better fix: explained & strict --- src/open_clip/factory.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/open_clip/factory.py b/src/open_clip/factory.py index ac8596eab..72a4e4d18 100644 --- a/src/open_clip/factory.py +++ b/src/open_clip/factory.py @@ -100,6 +100,10 @@ def load_checkpoint(model, checkpoint_path, strict=True): # detect old format and make compatible with new format if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'): state_dict = convert_to_custom_text_state_dict(state_dict) + # Certain text transformers no longer expect position_ids after transformers==4.31 + position_id_key = 'text.transformer.embeddings.position_ids' + if position_id_key in state_dict and not hasattr(model, position_id_key): + del state_dict[position_id_key] resize_pos_embed(state_dict, model) incompatible_keys = model.load_state_dict(state_dict, strict=strict) return incompatible_keys From 579b6a9a703f2d1761031ffd8d8e1e012920d7da Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Mon, 28 Aug 2023 12:44:25 -0700 Subject: [PATCH 08/11] Fix typos: "currenlty" -> "currently", "your" -> "you're" (#583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Gabriel Ilharco Magalhães --- src/training/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/main.py b/src/training/main.py index 2929d0121..4f2172808 100644 --- a/src/training/main.py +++ b/src/training/main.py @@ -232,7 +232,7 @@ def main(args): output_dict=True, ) if args.distill: - # FIXME: currenlty assumes the model your distilling from has the same tokenizer & transforms. + # FIXME: currently assumes the model you're distilling from has the same tokenizer & transforms. dist_model, _, _ = create_model_and_transforms( args.distill_model, args.distill_pretrained, From c22a8ecaf95ace2e1ac785e3384689c03754bd40 Mon Sep 17 00:00:00 2001 From: Zi-Yuan Hu <43438692+HenryHZY@users.noreply.github.com> Date: Tue, 29 Aug 2023 03:59:33 +0800 Subject: [PATCH 09/11] Update README.md (#593) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update open_clip.list_pretrained() Co-authored-by: Gabriel Ilharco Magalhães --- README.md | 167 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 87 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index c25c07db2..9da04585d 100644 --- a/README.md +++ b/README.md @@ -591,86 +591,93 @@ Future trained models will use nn.GELU. 
```python >>> import open_clip >>> open_clip.list_pretrained() -[('RN50', 'openai'), -('RN50', 'yfcc15m'), -('RN50', 'cc12m'), -('RN50-quickgelu', 'openai'), -('RN50-quickgelu', 'yfcc15m'), -('RN50-quickgelu', 'cc12m'), -('RN101', 'openai'), -('RN101', 'yfcc15m'), -('RN101-quickgelu', 'openai'), -('RN101-quickgelu', 'yfcc15m'), -('RN50x4', 'openai'), -('RN50x16', 'openai'), -('RN50x64', 'openai'), -('ViT-B-32', 'openai'), -('ViT-B-32', 'laion400m_e31'), -('ViT-B-32', 'laion400m_e32'), -('ViT-B-32', 'laion2b_e16'), -('ViT-B-32', 'laion2b_s34b_b79k'), -('ViT-B-32', 'datacomp_m_s128m_b4k'), -('ViT-B-32', 'commonpool_m_clip_s128m_b4k'), -('ViT-B-32', 'commonpool_m_laion_s128m_b4k'), -('ViT-B-32', 'commonpool_m_image_s128m_b4k'), -('ViT-B-32', 'commonpool_m_text_s128m_b4k'), -('ViT-B-32', 'commonpool_m_basic_s128m_b4k'), -('ViT-B-32', 'commonpool_m_s128m_b4k'), -('ViT-B-32', 'datacomp_s_s13m_b4k'), -('ViT-B-32', 'commonpool_s_clip_s13m_b4k'), -('ViT-B-32', 'commonpool_s_laion_s13m_b4k'), -('ViT-B-32', 'commonpool_s_image_s13m_b4k'), -('ViT-B-32', 'commonpool_s_text_s13m_b4k'), -('ViT-B-32', 'commonpool_s_basic_s13m_b4k'), -('ViT-B-32', 'commonpool_s_s13m_b4k'), -('ViT-B-32-quickgelu', 'openai'), -('ViT-B-32-quickgelu', 'laion400m_e31'), -('ViT-B-32-quickgelu', 'laion400m_e32'), -('ViT-B-16', 'openai'), -('ViT-B-16', 'laion400m_e31'), -('ViT-B-16', 'laion400m_e32'), -('ViT-B-16', 'laion2b_s34b_b88k'), -('ViT-B-16', 'datacomp_l_s1b_b8k'), -('ViT-B-16', 'commonpool_l_clip_s1b_b8k'), -('ViT-B-16', 'commonpool_l_laion_s1b_b8k'), -('ViT-B-16', 'commonpool_l_image_s1b_b8k'), -('ViT-B-16', 'commonpool_l_text_s1b_b8k'), -('ViT-B-16', 'commonpool_l_basic_s1b_b8k'), -('ViT-B-16', 'commonpool_l_s1b_b8k'), -('ViT-B-16-plus-240', 'laion400m_e31'), -('ViT-B-16-plus-240', 'laion400m_e32'), -('ViT-L-14', 'openai'), -('ViT-L-14', 'laion400m_e31'), -('ViT-L-14', 'laion400m_e32'), -('ViT-L-14', 'laion2b_s32b_b82k'), -('ViT-L-14', 'datacomp_xl_s13b_b90k'), -('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'), -('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'), -('ViT-L-14', 'commonpool_xl_s13b_b90k'), -('ViT-L-14-336', 'openai'), -('ViT-H-14', 'laion2b_s32b_b79k'), -('ViT-g-14', 'laion2b_s12b_b42k'), -('ViT-g-14', 'laion2b_s34b_b88k'), -('ViT-bigG-14', 'laion2b_s39b_b160k'), -('roberta-ViT-B-32', 'laion2b_s12b_b32k'), -('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'), -('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'), -('convnext_base', 'laion400m_s13b_b51k'), -('convnext_base_w', 'laion2b_s13b_b82k'), -('convnext_base_w', 'laion2b_s13b_b82k_augreg'), -('convnext_base_w', 'laion_aesthetic_s13b_b82k'), -('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'), -('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'), -('convnext_large_d', 'laion2b_s26b_b102k_augreg'), -('convnext_large_d_320', 'laion2b_s29b_b131k_ft'), -('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'), -('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'), -('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'), -('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'), -('coca_ViT-B-32', 'laion2b_s13b_b90k'), -('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'), -('coca_ViT-L-14', 'laion2b_s13b_b90k'), -('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k') +[('RN50', 'openai'), +('RN50', 'yfcc15m'), +('RN50', 'cc12m'), +('RN50-quickgelu', 'openai'), +('RN50-quickgelu', 'yfcc15m'), +('RN50-quickgelu', 'cc12m'), +('RN101', 'openai'), +('RN101', 'yfcc15m'), +('RN101-quickgelu', 'openai'), +('RN101-quickgelu', 'yfcc15m'), +('RN50x4', 'openai'), 
+('RN50x16', 'openai'), +('RN50x64', 'openai'), +('ViT-B-32', 'openai'), +('ViT-B-32', 'laion400m_e31'), +('ViT-B-32', 'laion400m_e32'), +('ViT-B-32', 'laion2b_e16'), +('ViT-B-32', 'laion2b_s34b_b79k'), +('ViT-B-32', 'datacomp_m_s128m_b4k'), +('ViT-B-32', 'commonpool_m_clip_s128m_b4k'), +('ViT-B-32', 'commonpool_m_laion_s128m_b4k'), +('ViT-B-32', 'commonpool_m_image_s128m_b4k'), +('ViT-B-32', 'commonpool_m_text_s128m_b4k'), +('ViT-B-32', 'commonpool_m_basic_s128m_b4k'), +('ViT-B-32', 'commonpool_m_s128m_b4k'), +('ViT-B-32', 'datacomp_s_s13m_b4k'), +('ViT-B-32', 'commonpool_s_clip_s13m_b4k'), +('ViT-B-32', 'commonpool_s_laion_s13m_b4k'), +('ViT-B-32', 'commonpool_s_image_s13m_b4k'), +('ViT-B-32', 'commonpool_s_text_s13m_b4k'), +('ViT-B-32', 'commonpool_s_basic_s13m_b4k'), +('ViT-B-32', 'commonpool_s_s13m_b4k'), +('ViT-B-32-quickgelu', 'openai'), +('ViT-B-32-quickgelu', 'laion400m_e31'), +('ViT-B-32-quickgelu', 'laion400m_e32'), +('ViT-B-16', 'openai'), +('ViT-B-16', 'laion400m_e31'), +('ViT-B-16', 'laion400m_e32'), +('ViT-B-16', 'laion2b_s34b_b88k'), +('ViT-B-16', 'datacomp_l_s1b_b8k'), +('ViT-B-16', 'commonpool_l_clip_s1b_b8k'), +('ViT-B-16', 'commonpool_l_laion_s1b_b8k'), +('ViT-B-16', 'commonpool_l_image_s1b_b8k'), +('ViT-B-16', 'commonpool_l_text_s1b_b8k'), +('ViT-B-16', 'commonpool_l_basic_s1b_b8k'), +('ViT-B-16', 'commonpool_l_s1b_b8k'), +('ViT-B-16-plus-240', 'laion400m_e31'), +('ViT-B-16-plus-240', 'laion400m_e32'), +('ViT-L-14', 'openai'), +('ViT-L-14', 'laion400m_e31'), +('ViT-L-14', 'laion400m_e32'), +('ViT-L-14', 'laion2b_s32b_b82k'), +('ViT-L-14', 'datacomp_xl_s13b_b90k'), +('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'), +('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'), +('ViT-L-14', 'commonpool_xl_s13b_b90k'), +('ViT-L-14-336', 'openai'), +('ViT-H-14', 'laion2b_s32b_b79k'), +('ViT-g-14', 'laion2b_s12b_b42k'), +('ViT-g-14', 'laion2b_s34b_b88k'), +('ViT-bigG-14', 'laion2b_s39b_b160k'), +('roberta-ViT-B-32', 'laion2b_s12b_b32k'), +('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'), +('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'), +('convnext_base', 'laion400m_s13b_b51k'), +('convnext_base_w', 'laion2b_s13b_b82k'), +('convnext_base_w', 'laion2b_s13b_b82k_augreg'), +('convnext_base_w', 'laion_aesthetic_s13b_b82k'), +('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'), +('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'), +('convnext_large_d', 'laion2b_s26b_b102k_augreg'), +('convnext_large_d_320', 'laion2b_s29b_b131k_ft'), +('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'), +('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'), +('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'), +('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'), +('coca_ViT-B-32', 'laion2b_s13b_b90k'), +('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'), +('coca_ViT-L-14', 'laion2b_s13b_b90k'), +('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k'), +('EVA01-g-14', 'laion400m_s11b_b41k'), +('EVA01-g-14-plus', 'merged2b_s11b_b114k'), +('EVA02-B-16', 'merged2b_s8b_b131k'), +('EVA02-L-14', 'merged2b_s4b_b131k'), +('EVA02-L-14-336', 'merged2b_s6b_b61k'), +('EVA02-E-14', 'laion2b_s4b_b115k'), +('EVA02-E-14-plus', 'laion2b_s9b_b144k') ] >>> model, train_transform, eval_transform = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k') From 5f7892b672b21e6853d0f6c11b18dda9bcf36c8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Ilharco=20Magalh=C3=A3es?= Date: Wed, 30 Aug 2023 13:07:22 -0700 Subject: [PATCH 10/11] Update README.md --- README.md | 10 
++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9da04585d..4635b39b9 100644 --- a/README.md +++ b/README.md @@ -813,6 +813,16 @@ If you found this repository useful, please consider citing: } ``` +```bibtex +@inproceedings{cherti2023reproducible, + title={Reproducible scaling laws for contrastive language-image learning}, + author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2818--2829}, + year={2023} +} +``` + ```bibtex @inproceedings{Radford2021LearningTV, title={Learning Transferable Visual Models From Natural Language Supervision}, From 79a20ee9e13d612b44bebe1581aa83cf467c281b Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Tue, 5 Sep 2023 12:44:05 -0700 Subject: [PATCH 11/11] `doest not` -> `does not` (#621) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4635b39b9..4922e47b8 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ python -m training.main \ ``` Note: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set! -You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it doest not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh). +You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh). ### Multi-GPU and Beyond
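As a companion to the `--imagenet-val` note in the hunk above, here is a minimal sanity-check sketch (the path is a placeholder, not taken from the patches) for confirming that the validation folder already uses the class-subfolder layout; if it only contains loose images, the linked `valprep.sh` is typically run from inside that folder first.

```python
from pathlib import Path

# ASSUMPTION: placeholder path; point this at your own ImageNet validation folder.
val_dir = Path("/data/imagenet/val")

if not val_dir.is_dir():
    raise SystemExit(f"{val_dir} not found; adjust the placeholder path above.")

class_dirs = [p for p in val_dir.iterdir() if p.is_dir()]
loose_images = list(val_dir.glob("*.JPEG"))

if class_dirs:
    print(f"{len(class_dirs)} class subfolders found; "
          f"the folder can be passed via --imagenet-val {val_dir}")
else:
    print(f"No subfolders ({len(loose_images)} loose images); "
          "run the linked valprep.sh inside this folder first.")
```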