diff --git a/README.md b/README.md
index 2e4ac9955..4922e47b8 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ You can then install openclip for training with `pip install 'open_clip_torch[tr
 
 #### Development
 
-If you want to make changes to contribute code, you can close openclip then run `make install` in openclip folder (after creating a virtualenv)
+If you want to make changes to contribute code, you can clone openclip then run `make install` in openclip folder (after creating a virtualenv)
 
 Install pip PyTorch as per https://pytorch.org/get-started/locally/
 
@@ -168,7 +168,7 @@ python -m training.main \
 ```
 
 Note: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set!
-You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it doest not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh).
+You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh).
 
 ### Multi-GPU and Beyond
 
@@ -183,7 +183,7 @@ numerical results as the naïve method.
 
 #### Epochs
 
-For larger datasets (eg Laion2B), we recommend setting --train-num-samples to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with --dataset-resampled to do sampling with replacement. This allows having frequent checkpoints to evaluate more often.
+For larger datasets (eg Laion2B), we recommend setting `--train-num-samples` to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with `--dataset-resampled` to do sampling with replacement. This allows having frequent checkpoints to evaluate more often.
 
 #### Patch Dropout
 
@@ -196,7 +196,7 @@ In the paper, they also finetuned without the patch dropout at the end. You can
 #### Multiple data sources
 
 OpenCLIP supports using multiple data sources, by separating different data paths with `::`.
-For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::/data/LAION-400M/{00000..41455}.tar"`.
+For instance, to train on CC12M and on LAION, one might use `--train-data "/data/cc12m/cc12m-train-{0000..2175}.tar::/data/LAION-400M/{00000..41455}.tar"`.
 Using `--dataset-resampled` is recommended for these cases.
 
 By default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source.
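The two training-related hunks above describe CLI conventions rather than code, so here is a small illustrative sketch (plain Python string handling, not part of this diff) of how the documented `--train-data` and `--train-num-samples` values fit together: several webdataset sources are joined into one string with `::`, and for very large datasets the per-"epoch" sample count is set to a fraction of the full dataset alongside `--dataset-resampled`. The dataset size below is an assumption back-derived from the README's 1/16 example.

```python
# Illustrative only: compose the CLI values described in the README hunks above.
cc12m_shards = "/data/cc12m/cc12m-train-{0000..2175}.tar"  # shard pattern from the README
laion_shards = "/data/LAION-400M/{00000..41455}.tar"       # shard pattern from the README

# Multiple data sources are passed as a single --train-data string joined by "::".
train_data = "::".join([cc12m_shards, laion_shards])

# For very large datasets, train on a fraction of an "epoch" between checkpoints,
# together with --dataset-resampled for sampling with replacement.
assumed_laion2b_samples = 2_170_337_248            # assumed total; 1/16 reproduces the README figure
train_num_samples = assumed_laion2b_samples // 16  # == 135646078

args = [
    "--train-data", train_data,
    "--dataset-resampled",
    "--train-num-samples", str(train_num_samples),
]
print(" ".join(args))
```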
@@ -591,86 +591,93 @@ Future trained models will use nn.GELU.
 ```python
 >>> import open_clip
 >>> open_clip.list_pretrained()
-[('RN50', 'openai'),
-('RN50', 'yfcc15m'),
-('RN50', 'cc12m'),
-('RN50-quickgelu', 'openai'),
-('RN50-quickgelu', 'yfcc15m'),
-('RN50-quickgelu', 'cc12m'),
-('RN101', 'openai'),
-('RN101', 'yfcc15m'),
-('RN101-quickgelu', 'openai'),
-('RN101-quickgelu', 'yfcc15m'),
-('RN50x4', 'openai'),
-('RN50x16', 'openai'),
-('RN50x64', 'openai'),
-('ViT-B-32', 'openai'),
-('ViT-B-32', 'laion400m_e31'),
-('ViT-B-32', 'laion400m_e32'),
-('ViT-B-32', 'laion2b_e16'),
-('ViT-B-32', 'laion2b_s34b_b79k'),
-('ViT-B-32', 'datacomp_m_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_s128m_b4k'),
-('ViT-B-32', 'datacomp_s_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_s13m_b4k'),
-('ViT-B-32-quickgelu', 'openai'),
-('ViT-B-32-quickgelu', 'laion400m_e31'),
-('ViT-B-32-quickgelu', 'laion400m_e32'),
-('ViT-B-16', 'openai'),
-('ViT-B-16', 'laion400m_e31'),
-('ViT-B-16', 'laion400m_e32'),
-('ViT-B-16', 'laion2b_s34b_b88k'),
-('ViT-B-16', 'datacomp_l_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_clip_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_laion_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_image_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_text_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_basic_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_s1b_b8k'),
-('ViT-B-16-plus-240', 'laion400m_e31'),
-('ViT-B-16-plus-240', 'laion400m_e32'),
-('ViT-L-14', 'openai'),
-('ViT-L-14', 'laion400m_e31'),
-('ViT-L-14', 'laion400m_e32'),
-('ViT-L-14', 'laion2b_s32b_b82k'),
-('ViT-L-14', 'datacomp_xl_s13b_b90k'),
-('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'),
-('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'),
-('ViT-L-14', 'commonpool_xl_s13b_b90k'),
-('ViT-L-14-336', 'openai'),
-('ViT-H-14', 'laion2b_s32b_b79k'),
-('ViT-g-14', 'laion2b_s12b_b42k'),
-('ViT-g-14', 'laion2b_s34b_b88k'),
-('ViT-bigG-14', 'laion2b_s39b_b160k'),
-('roberta-ViT-B-32', 'laion2b_s12b_b32k'),
-('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'),
-('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'),
-('convnext_base', 'laion400m_s13b_b51k'),
-('convnext_base_w', 'laion2b_s13b_b82k'),
-('convnext_base_w', 'laion2b_s13b_b82k_augreg'),
-('convnext_base_w', 'laion_aesthetic_s13b_b82k'),
-('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'),
-('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'),
-('convnext_large_d', 'laion2b_s26b_b102k_augreg'),
-('convnext_large_d_320', 'laion2b_s29b_b131k_ft'),
-('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'),
-('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'),
-('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'),
-('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'),
-('coca_ViT-B-32', 'laion2b_s13b_b90k'),
-('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'),
-('coca_ViT-L-14', 'laion2b_s13b_b90k'),
-('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k')
+[('RN50', 'openai'),
+('RN50', 'yfcc15m'),
+('RN50', 'cc12m'),
+('RN50-quickgelu', 'openai'),
+('RN50-quickgelu', 'yfcc15m'),
+('RN50-quickgelu', 'cc12m'),
+('RN101', 'openai'),
+('RN101', 'yfcc15m'),
+('RN101-quickgelu', 'openai'),
+('RN101-quickgelu', 'yfcc15m'),
+('RN50x4', 'openai'),
+('RN50x16', 'openai'),
+('RN50x64', 'openai'),
+('ViT-B-32', 'openai'),
+('ViT-B-32', 'laion400m_e31'),
+('ViT-B-32', 'laion400m_e32'),
+('ViT-B-32', 'laion2b_e16'),
+('ViT-B-32', 'laion2b_s34b_b79k'),
+('ViT-B-32', 'datacomp_m_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_s128m_b4k'),
+('ViT-B-32', 'datacomp_s_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_s13m_b4k'),
+('ViT-B-32-quickgelu', 'openai'),
+('ViT-B-32-quickgelu', 'laion400m_e31'),
+('ViT-B-32-quickgelu', 'laion400m_e32'),
+('ViT-B-16', 'openai'),
+('ViT-B-16', 'laion400m_e31'),
+('ViT-B-16', 'laion400m_e32'),
+('ViT-B-16', 'laion2b_s34b_b88k'),
+('ViT-B-16', 'datacomp_l_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_clip_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_laion_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_image_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_text_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_basic_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_s1b_b8k'),
+('ViT-B-16-plus-240', 'laion400m_e31'),
+('ViT-B-16-plus-240', 'laion400m_e32'),
+('ViT-L-14', 'openai'),
+('ViT-L-14', 'laion400m_e31'),
+('ViT-L-14', 'laion400m_e32'),
+('ViT-L-14', 'laion2b_s32b_b82k'),
+('ViT-L-14', 'datacomp_xl_s13b_b90k'),
+('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'),
+('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'),
+('ViT-L-14', 'commonpool_xl_s13b_b90k'),
+('ViT-L-14-336', 'openai'),
+('ViT-H-14', 'laion2b_s32b_b79k'),
+('ViT-g-14', 'laion2b_s12b_b42k'),
+('ViT-g-14', 'laion2b_s34b_b88k'),
+('ViT-bigG-14', 'laion2b_s39b_b160k'),
+('roberta-ViT-B-32', 'laion2b_s12b_b32k'),
+('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'),
+('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'),
+('convnext_base', 'laion400m_s13b_b51k'),
+('convnext_base_w', 'laion2b_s13b_b82k'),
+('convnext_base_w', 'laion2b_s13b_b82k_augreg'),
+('convnext_base_w', 'laion_aesthetic_s13b_b82k'),
+('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'),
+('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'),
+('convnext_large_d', 'laion2b_s26b_b102k_augreg'),
+('convnext_large_d_320', 'laion2b_s29b_b131k_ft'),
+('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'),
+('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'),
+('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'),
+('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'),
+('coca_ViT-B-32', 'laion2b_s13b_b90k'),
+('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'),
+('coca_ViT-L-14', 'laion2b_s13b_b90k'),
+('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k'),
+('EVA01-g-14', 'laion400m_s11b_b41k'),
+('EVA01-g-14-plus', 'merged2b_s11b_b114k'),
+('EVA02-B-16', 'merged2b_s8b_b131k'),
+('EVA02-L-14', 'merged2b_s4b_b131k'),
+('EVA02-L-14-336', 'merged2b_s6b_b61k'),
+('EVA02-E-14', 'laion2b_s4b_b115k'),
+('EVA02-E-14-plus', 'laion2b_s9b_b144k')
 ]
 >>> model, train_transform, eval_transform = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
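Since the hunk above only extends the pretrained listing, a brief usage sketch (not part of this diff) shows how one of the new entries would be tried. The `EVA02-B-16` / `merged2b_s8b_b131k` pair is taken from the added list, the blank image and prompt strings are placeholders, and the EVA configs wrap a timm vision tower, so `timm` needs to be installed.

```python
# Sketch: load one of the newly listed EVA checkpoints via the usual open_clip API.
import torch
from PIL import Image
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms(
    'EVA02-B-16', pretrained='merged2b_s8b_b131k')
tokenizer = open_clip.get_tokenizer('EVA02-B-16')
model.eval()

image = preprocess(Image.new("RGB", (224, 224), "white")).unsqueeze(0)  # placeholder image
text = tokenizer(["a diagram", "a dog", "a cat"])                       # placeholder prompts

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(text_probs)
```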
@@ -806,6 +813,16 @@ If you found this repository useful, please consider citing:
 }
 ```
 
+```bibtex
+@inproceedings{cherti2023reproducible,
+  title={Reproducible scaling laws for contrastive language-image learning},
+  author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={2818--2829},
+  year={2023}
+}
+```
+
 ```bibtex
 @inproceedings{Radford2021LearningTV,
   title={Learning Transferable Visual Models From Natural Language Supervision},
diff --git a/src/open_clip/factory.py b/src/open_clip/factory.py
index 9c141256b..8eb507576 100644
--- a/src/open_clip/factory.py
+++ b/src/open_clip/factory.py
@@ -102,6 +102,10 @@ def load_checkpoint(model, checkpoint_path, strict=True):
     # detect old format and make compatible with new format
     if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
         state_dict = convert_to_custom_text_state_dict(state_dict)
+    # Certain text transformers no longer expect position_ids after transformers==4.31
+    position_id_key = 'text.transformer.embeddings.position_ids'
+    if position_id_key in state_dict and not hasattr(model, position_id_key):
+        del state_dict[position_id_key]
     resize_pos_embed(state_dict, model)
     incompatible_keys = model.load_state_dict(state_dict, strict=strict)
     return incompatible_keys
diff --git a/src/open_clip/loss.py b/src/open_clip/loss.py
index 9a4377cb6..638763597 100644
--- a/src/open_clip/loss.py
+++ b/src/open_clip/loss.py
@@ -159,8 +159,11 @@ def __init__(
 
     def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
-        clip_loss = super().forward(image_features, text_features, logit_scale)
-        clip_loss = self.clip_loss_weight * clip_loss
+        clip_loss = torch.tensor(0)
+
+        if self.clip_loss_weight:
+            clip_loss = super().forward(image_features, text_features, logit_scale)
+            clip_loss = self.clip_loss_weight * clip_loss
 
         caption_loss = self.caption_loss(
             logits.permute(0, 2, 1),
diff --git a/src/training/main.py b/src/training/main.py
index 2929d0121..4f2172808 100644
--- a/src/training/main.py
+++ b/src/training/main.py
@@ -232,7 +232,7 @@ def main(args):
         output_dict=True,
     )
     if args.distill:
-        # FIXME: currenlty assumes the model your distilling from has the same tokenizer & transforms.
+        # FIXME: currently assumes the model you're distilling from has the same tokenizer & transforms.
         dist_model, _, _ = create_model_and_transforms(
             args.distill_model,
             args.distill_pretrained,
diff --git a/src/training/params.py b/src/training/params.py
index 31c841791..c33312f80 100644
--- a/src/training/params.py
+++ b/src/training/params.py
@@ -370,13 +370,13 @@ def parse_args(args):
         "--lock-text-unlocked-layers",
         type=int,
         default=0,
-        help="Leave last n image tower layer groups unlocked.",
+        help="Leave last n text tower layer groups unlocked.",
     )
     parser.add_argument(
         "--lock-text-freeze-layer-norm",
         default=False,
         action='store_true',
-        help="Freeze BatchNorm running stats in image tower for any locked layers.",
+        help="Freeze BatchNorm running stats in text tower for any locked layers.",
     )
     parser.add_argument(
         "--log-every-n-steps",
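A self-contained sketch of why the `load_checkpoint` change above is needed; the tiny module and key names here are made up for illustration, and only the `text.transformer.embeddings.position_ids` key and the transformers==4.31 note come from the diff itself. Checkpoints saved while the HF text tower still registered `position_ids` as a persistent buffer carry a key that newer model instances no longer have, so strict loading fails unless the stale key is dropped first.

```python
# Toy illustration of the stale-buffer problem handled in factory.load_checkpoint.
import torch
import torch.nn as nn


class TinyTextTower(nn.Module):
    """Stand-in for a text tower that no longer registers a position_ids buffer."""

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(16, 8)


model = TinyTextTower()

# Pretend this state dict came from an older checkpoint that still stored position_ids.
old_state_dict = dict(model.state_dict())
old_state_dict["position_ids"] = torch.arange(16)

try:
    model.load_state_dict(old_state_dict, strict=True)
except RuntimeError as err:
    print("strict load fails on the unexpected key:", err)

# As in the factory.py hunk: drop the key the current model no longer expects.
old_state_dict.pop("position_ids", None)
model.load_state_dict(old_state_dict, strict=True)
print("strict load succeeds once the stale buffer is removed")
```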