From eacb4ed948db3d59dda689efd884cfb5d7ce0d31 Mon Sep 17 00:00:00 2001 From: Sepehr Sameni Date: Tue, 18 Jul 2023 15:24:50 -0700 Subject: [PATCH 01/11] fix typo in description of args (#562) --- src/training/params.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/training/params.py b/src/training/params.py index 31c841791..c33312f80 100644 --- a/src/training/params.py +++ b/src/training/params.py @@ -370,13 +370,13 @@ def parse_args(args): "--lock-text-unlocked-layers", type=int, default=0, - help="Leave last n image tower layer groups unlocked.", + help="Leave last n text tower layer groups unlocked.", ) parser.add_argument( "--lock-text-freeze-layer-norm", default=False, action='store_true', - help="Freeze BatchNorm running stats in image tower for any locked layers.", + help="Freeze BatchNorm running stats in text tower for any locked layers.", ) parser.add_argument( "--log-every-n-steps", From 24ddefb37fc4892f6a0c975b732226fe8a9a8613 Mon Sep 17 00:00:00 2001 From: Giovanni Puccetti Date: Wed, 19 Jul 2023 00:25:32 +0200 Subject: [PATCH 02/11] change clip loss (#561) --- src/open_clip/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_clip/loss.py b/src/open_clip/loss.py index 3a8bfb901..0dd048935 100644 --- a/src/open_clip/loss.py +++ b/src/open_clip/loss.py @@ -159,7 +159,7 @@ def __init__( def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False): - clip_loss = 0 + clip_loss = torch.tensor(0) if self.clip_loss_weight: clip_loss = super().forward(image_features, text_features, logit_scale) From 67e5e5ec8741281eb9b30f640c26f91c666308b7 Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Thu, 27 Jul 2023 14:15:49 -0700 Subject: [PATCH 03/11] Fix typo: "close openclip" -> "clone openclip" (#582) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e4ac9955..aa3c975b5 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ You can then install openclip for training with `pip install 'open_clip_torch[tr #### Development -If you want to make changes to contribute code, you can close openclip then run `make install` in openclip folder (after creating a virtualenv) +If you want to make changes to contribute code, you can clone openclip then run `make install` in openclip folder (after creating a virtualenv) Install pip PyTorch as per https://pytorch.org/get-started/locally/ From 2f55cd939695c341f3785ae45299cb53fba10a65 Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Tue, 8 Aug 2023 16:44:29 -0700 Subject: [PATCH 04/11] Fix malformed `--train-data` argument example (#592) Unbalanced quotation marks --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aa3c975b5..f7d18739c 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ In the paper, they also finetuned without the patch dropout at the end. You can #### Multiple data sources OpenCLIP supports using multiple data sources, by separating different data paths with `::`. -For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::/data/LAION-400M/{00000..41455}.tar"`. +For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::'/data/LAION-400M/{00000..41455}.tar'`. Using `--dataset-resampled` is recommended for these cases. 
By default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source. From f190703d847b7b234dbeb8265d7a69d7e7e4e996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Ilharco=20Magalh=C3=A3es?= Date: Tue, 8 Aug 2023 17:24:14 -0700 Subject: [PATCH 05/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f7d18739c..0d14bd079 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ In the paper, they also finetuned without the patch dropout at the end. You can #### Multiple data sources OpenCLIP supports using multiple data sources, by separating different data paths with `::`. -For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::'/data/LAION-400M/{00000..41455}.tar'`. +For instance, to train on CC12M and on LAION, one might use `--train-data "/data/cc12m/cc12m-train-{0000..2175}.tar::/data/LAION-400M/{00000..41455}.tar"`. Using `--dataset-resampled` is recommended for these cases. By default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source. From ab1ae01731885ef2436a40e3c8c147347f18153e Mon Sep 17 00:00:00 2001 From: Chris Wendler Date: Sat, 26 Aug 2023 18:45:31 +0200 Subject: [PATCH 06/11] Update README.md (#607) make --dataset-resampled flag visually more prominent --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d14bd079..c25c07db2 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ numerical results as the naïve method. #### Epochs -For larger datasets (eg Laion2B), we recommend setting --train-num-samples to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with --dataset-resampled to do sampling with replacement. This allows having frequent checkpoints to evaluate more often. +For larger datasets (eg Laion2B), we recommend setting `--train-num-samples` to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with `--dataset-resampled` to do sampling with replacement. This allows having frequent checkpoints to evaluate more often. 
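For a concrete sense of the fractional-epoch arithmetic recommended above, here is a minimal sketch; the dataset size below is an assumed placeholder (simply 16 times the quoted `--train-num-samples` value), not an authoritative LAION-2B count:

```python
# A minimal sketch of the fractional-"epoch" setup described above.
# ASSUMPTION: dataset_size is a placeholder; use your dataset's real sample count.
dataset_size = 2_170_337_248            # assumed (= 16 * 135_646_078), for illustration
epoch_fraction = 16                     # checkpoint/evaluate 16 times per full pass

train_num_samples = dataset_size // epoch_fraction
print(train_num_samples)                # 135646078 -> pass as --train-num-samples

# With --dataset-resampled, each "epoch" draws train_num_samples samples with
# replacement, so anything that happens once per epoch (checkpointing, evaluation)
# happens 16x more often than it would with a full pass over the data.
```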
#### Patch Dropout From 8556945f09cd149cbef69f8394308d8b41dca596 Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Mon, 28 Aug 2023 12:31:04 -0700 Subject: [PATCH 07/11] Fix `text.transformer.embeddings.position_ids` key error (#595) * fix create_model & test * better fix: explained & strict --- src/open_clip/factory.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/open_clip/factory.py b/src/open_clip/factory.py index ac8596eab..72a4e4d18 100644 --- a/src/open_clip/factory.py +++ b/src/open_clip/factory.py @@ -100,6 +100,10 @@ def load_checkpoint(model, checkpoint_path, strict=True): # detect old format and make compatible with new format if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'): state_dict = convert_to_custom_text_state_dict(state_dict) + # Certain text transformers no longer expect position_ids after transformers==4.31 + position_id_key = 'text.transformer.embeddings.position_ids' + if position_id_key in state_dict and not hasattr(model, position_id_key): + del state_dict[position_id_key] resize_pos_embed(state_dict, model) incompatible_keys = model.load_state_dict(state_dict, strict=strict) return incompatible_keys From 579b6a9a703f2d1761031ffd8d8e1e012920d7da Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Mon, 28 Aug 2023 12:44:25 -0700 Subject: [PATCH 08/11] Fix typos: "currenlty" -> "currently", "your" -> "you're" (#583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Gabriel Ilharco Magalhães --- src/training/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/main.py b/src/training/main.py index 2929d0121..4f2172808 100644 --- a/src/training/main.py +++ b/src/training/main.py @@ -232,7 +232,7 @@ def main(args): output_dict=True, ) if args.distill: - # FIXME: currenlty assumes the model your distilling from has the same tokenizer & transforms. + # FIXME: currently assumes the model you're distilling from has the same tokenizer & transforms. dist_model, _, _ = create_model_and_transforms( args.distill_model, args.distill_pretrained, From c22a8ecaf95ace2e1ac785e3384689c03754bd40 Mon Sep 17 00:00:00 2001 From: Zi-Yuan Hu <43438692+HenryHZY@users.noreply.github.com> Date: Tue, 29 Aug 2023 03:59:33 +0800 Subject: [PATCH 09/11] Update README.md (#593) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update open_clip.list_pretrained() Co-authored-by: Gabriel Ilharco Magalhães --- README.md | 167 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 87 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index c25c07db2..9da04585d 100644 --- a/README.md +++ b/README.md @@ -591,86 +591,93 @@ Future trained models will use nn.GELU. 
```python >>> import open_clip >>> open_clip.list_pretrained() -[('RN50', 'openai'), -('RN50', 'yfcc15m'), -('RN50', 'cc12m'), -('RN50-quickgelu', 'openai'), -('RN50-quickgelu', 'yfcc15m'), -('RN50-quickgelu', 'cc12m'), -('RN101', 'openai'), -('RN101', 'yfcc15m'), -('RN101-quickgelu', 'openai'), -('RN101-quickgelu', 'yfcc15m'), -('RN50x4', 'openai'), -('RN50x16', 'openai'), -('RN50x64', 'openai'), -('ViT-B-32', 'openai'), -('ViT-B-32', 'laion400m_e31'), -('ViT-B-32', 'laion400m_e32'), -('ViT-B-32', 'laion2b_e16'), -('ViT-B-32', 'laion2b_s34b_b79k'), -('ViT-B-32', 'datacomp_m_s128m_b4k'), -('ViT-B-32', 'commonpool_m_clip_s128m_b4k'), -('ViT-B-32', 'commonpool_m_laion_s128m_b4k'), -('ViT-B-32', 'commonpool_m_image_s128m_b4k'), -('ViT-B-32', 'commonpool_m_text_s128m_b4k'), -('ViT-B-32', 'commonpool_m_basic_s128m_b4k'), -('ViT-B-32', 'commonpool_m_s128m_b4k'), -('ViT-B-32', 'datacomp_s_s13m_b4k'), -('ViT-B-32', 'commonpool_s_clip_s13m_b4k'), -('ViT-B-32', 'commonpool_s_laion_s13m_b4k'), -('ViT-B-32', 'commonpool_s_image_s13m_b4k'), -('ViT-B-32', 'commonpool_s_text_s13m_b4k'), -('ViT-B-32', 'commonpool_s_basic_s13m_b4k'), -('ViT-B-32', 'commonpool_s_s13m_b4k'), -('ViT-B-32-quickgelu', 'openai'), -('ViT-B-32-quickgelu', 'laion400m_e31'), -('ViT-B-32-quickgelu', 'laion400m_e32'), -('ViT-B-16', 'openai'), -('ViT-B-16', 'laion400m_e31'), -('ViT-B-16', 'laion400m_e32'), -('ViT-B-16', 'laion2b_s34b_b88k'), -('ViT-B-16', 'datacomp_l_s1b_b8k'), -('ViT-B-16', 'commonpool_l_clip_s1b_b8k'), -('ViT-B-16', 'commonpool_l_laion_s1b_b8k'), -('ViT-B-16', 'commonpool_l_image_s1b_b8k'), -('ViT-B-16', 'commonpool_l_text_s1b_b8k'), -('ViT-B-16', 'commonpool_l_basic_s1b_b8k'), -('ViT-B-16', 'commonpool_l_s1b_b8k'), -('ViT-B-16-plus-240', 'laion400m_e31'), -('ViT-B-16-plus-240', 'laion400m_e32'), -('ViT-L-14', 'openai'), -('ViT-L-14', 'laion400m_e31'), -('ViT-L-14', 'laion400m_e32'), -('ViT-L-14', 'laion2b_s32b_b82k'), -('ViT-L-14', 'datacomp_xl_s13b_b90k'), -('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'), -('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'), -('ViT-L-14', 'commonpool_xl_s13b_b90k'), -('ViT-L-14-336', 'openai'), -('ViT-H-14', 'laion2b_s32b_b79k'), -('ViT-g-14', 'laion2b_s12b_b42k'), -('ViT-g-14', 'laion2b_s34b_b88k'), -('ViT-bigG-14', 'laion2b_s39b_b160k'), -('roberta-ViT-B-32', 'laion2b_s12b_b32k'), -('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'), -('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'), -('convnext_base', 'laion400m_s13b_b51k'), -('convnext_base_w', 'laion2b_s13b_b82k'), -('convnext_base_w', 'laion2b_s13b_b82k_augreg'), -('convnext_base_w', 'laion_aesthetic_s13b_b82k'), -('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'), -('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'), -('convnext_large_d', 'laion2b_s26b_b102k_augreg'), -('convnext_large_d_320', 'laion2b_s29b_b131k_ft'), -('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'), -('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'), -('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'), -('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'), -('coca_ViT-B-32', 'laion2b_s13b_b90k'), -('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'), -('coca_ViT-L-14', 'laion2b_s13b_b90k'), -('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k') +[('RN50', 'openai'), +('RN50', 'yfcc15m'), +('RN50', 'cc12m'), +('RN50-quickgelu', 'openai'), +('RN50-quickgelu', 'yfcc15m'), +('RN50-quickgelu', 'cc12m'), +('RN101', 'openai'), +('RN101', 'yfcc15m'), +('RN101-quickgelu', 'openai'), +('RN101-quickgelu', 'yfcc15m'), +('RN50x4', 'openai'), 
+('RN50x16', 'openai'), +('RN50x64', 'openai'), +('ViT-B-32', 'openai'), +('ViT-B-32', 'laion400m_e31'), +('ViT-B-32', 'laion400m_e32'), +('ViT-B-32', 'laion2b_e16'), +('ViT-B-32', 'laion2b_s34b_b79k'), +('ViT-B-32', 'datacomp_m_s128m_b4k'), +('ViT-B-32', 'commonpool_m_clip_s128m_b4k'), +('ViT-B-32', 'commonpool_m_laion_s128m_b4k'), +('ViT-B-32', 'commonpool_m_image_s128m_b4k'), +('ViT-B-32', 'commonpool_m_text_s128m_b4k'), +('ViT-B-32', 'commonpool_m_basic_s128m_b4k'), +('ViT-B-32', 'commonpool_m_s128m_b4k'), +('ViT-B-32', 'datacomp_s_s13m_b4k'), +('ViT-B-32', 'commonpool_s_clip_s13m_b4k'), +('ViT-B-32', 'commonpool_s_laion_s13m_b4k'), +('ViT-B-32', 'commonpool_s_image_s13m_b4k'), +('ViT-B-32', 'commonpool_s_text_s13m_b4k'), +('ViT-B-32', 'commonpool_s_basic_s13m_b4k'), +('ViT-B-32', 'commonpool_s_s13m_b4k'), +('ViT-B-32-quickgelu', 'openai'), +('ViT-B-32-quickgelu', 'laion400m_e31'), +('ViT-B-32-quickgelu', 'laion400m_e32'), +('ViT-B-16', 'openai'), +('ViT-B-16', 'laion400m_e31'), +('ViT-B-16', 'laion400m_e32'), +('ViT-B-16', 'laion2b_s34b_b88k'), +('ViT-B-16', 'datacomp_l_s1b_b8k'), +('ViT-B-16', 'commonpool_l_clip_s1b_b8k'), +('ViT-B-16', 'commonpool_l_laion_s1b_b8k'), +('ViT-B-16', 'commonpool_l_image_s1b_b8k'), +('ViT-B-16', 'commonpool_l_text_s1b_b8k'), +('ViT-B-16', 'commonpool_l_basic_s1b_b8k'), +('ViT-B-16', 'commonpool_l_s1b_b8k'), +('ViT-B-16-plus-240', 'laion400m_e31'), +('ViT-B-16-plus-240', 'laion400m_e32'), +('ViT-L-14', 'openai'), +('ViT-L-14', 'laion400m_e31'), +('ViT-L-14', 'laion400m_e32'), +('ViT-L-14', 'laion2b_s32b_b82k'), +('ViT-L-14', 'datacomp_xl_s13b_b90k'), +('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'), +('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'), +('ViT-L-14', 'commonpool_xl_s13b_b90k'), +('ViT-L-14-336', 'openai'), +('ViT-H-14', 'laion2b_s32b_b79k'), +('ViT-g-14', 'laion2b_s12b_b42k'), +('ViT-g-14', 'laion2b_s34b_b88k'), +('ViT-bigG-14', 'laion2b_s39b_b160k'), +('roberta-ViT-B-32', 'laion2b_s12b_b32k'), +('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'), +('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'), +('convnext_base', 'laion400m_s13b_b51k'), +('convnext_base_w', 'laion2b_s13b_b82k'), +('convnext_base_w', 'laion2b_s13b_b82k_augreg'), +('convnext_base_w', 'laion_aesthetic_s13b_b82k'), +('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'), +('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'), +('convnext_large_d', 'laion2b_s26b_b102k_augreg'), +('convnext_large_d_320', 'laion2b_s29b_b131k_ft'), +('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'), +('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'), +('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'), +('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'), +('coca_ViT-B-32', 'laion2b_s13b_b90k'), +('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'), +('coca_ViT-L-14', 'laion2b_s13b_b90k'), +('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k'), +('EVA01-g-14', 'laion400m_s11b_b41k'), +('EVA01-g-14-plus', 'merged2b_s11b_b114k'), +('EVA02-B-16', 'merged2b_s8b_b131k'), +('EVA02-L-14', 'merged2b_s4b_b131k'), +('EVA02-L-14-336', 'merged2b_s6b_b61k'), +('EVA02-E-14', 'laion2b_s4b_b115k'), +('EVA02-E-14-plus', 'laion2b_s9b_b144k') ] >>> model, train_transform, eval_transform = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k') From 5f7892b672b21e6853d0f6c11b18dda9bcf36c8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Ilharco=20Magalh=C3=A3es?= Date: Wed, 30 Aug 2023 13:07:22 -0700 Subject: [PATCH 10/11] Update README.md --- README.md | 10 
++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9da04585d..4635b39b9 100644 --- a/README.md +++ b/README.md @@ -813,6 +813,16 @@ If you found this repository useful, please consider citing: } ``` +```bibtex +@inproceedings{cherti2023reproducible, + title={Reproducible scaling laws for contrastive language-image learning}, + author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2818--2829}, + year={2023} +} +``` + ```bibtex @inproceedings{Radford2021LearningTV, title={Learning Transferable Visual Models From Natural Language Supervision}, From 79a20ee9e13d612b44bebe1581aa83cf467c281b Mon Sep 17 00:00:00 2001 From: Jason Chou Date: Tue, 5 Sep 2023 12:44:05 -0700 Subject: [PATCH 11/11] `doest not` -> `does not` (#621) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4635b39b9..4922e47b8 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ python -m training.main \ ``` Note: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set! -You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it doest not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh). +You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh). ### Multi-GPU and Beyond
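As a companion to the `--imagenet-val` note in the hunk above, here is a minimal sanity-check sketch (the path is a placeholder, not taken from the patches) for confirming that the validation folder already uses the class-subfolder layout; if it only contains loose images, the linked `valprep.sh` is typically run from inside that folder first.

```python
from pathlib import Path

# ASSUMPTION: placeholder path; point this at your own ImageNet validation folder.
val_dir = Path("/data/imagenet/val")

if not val_dir.is_dir():
    raise SystemExit(f"{val_dir} not found; adjust the placeholder path above.")

class_dirs = [p for p in val_dir.iterdir() if p.is_dir()]
loose_images = list(val_dir.glob("*.JPEG"))

if class_dirs:
    print(f"{len(class_dirs)} class subfolders found; "
          f"the folder can be passed via --imagenet-val {val_dir}")
else:
    print(f"No subfolders ({len(loose_images)} loose images); "
          "run the linked valprep.sh inside this folder first.")
```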