diff --git a/README.md b/README.md
index 2e4ac9955..4922e47b8 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ You can then install openclip for training with `pip install 'open_clip_torch[tr
 
 #### Development
 
-If you want to make changes to contribute code, you can close openclip then run `make install` in openclip folder (after creating a virtualenv)
+If you want to make changes to contribute code, you can clone openclip then run `make install` in openclip folder (after creating a virtualenv)
 
 Install pip PyTorch as per https://pytorch.org/get-started/locally/
 
@@ -168,7 +168,7 @@ python -m training.main \
 ```
 
 Note: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set!
-You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it doest not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh).
+You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh).
 
 ### Multi-GPU and Beyond
 
@@ -183,7 +183,7 @@ numerical results as the naïve method.
 
 #### Epochs
 
-For larger datasets (eg Laion2B), we recommend setting --train-num-samples to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with --dataset-resampled to do sampling with replacement. This allows having frequent checkpoints to evaluate more often.
+For larger datasets (eg Laion2B), we recommend setting `--train-num-samples` to a lower value than the full epoch, for example `--train-num-samples 135646078` to 1/16 of an epoch in conjunction with `--dataset-resampled` to do sampling with replacement. This allows having frequent checkpoints to evaluate more often.
 
 #### Patch Dropout
 
@@ -196,7 +196,7 @@ In the paper, they also finetuned without the patch dropout at the end. You can
 #### Multiple data sources
 
 OpenCLIP supports using multiple data sources, by separating different data paths with `::`.
-For instance, to train on CC12M and on LAION, one might use `--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar'::/data/LAION-400M/{00000..41455}.tar"`.
+For instance, to train on CC12M and on LAION, one might use `--train-data "/data/cc12m/cc12m-train-{0000..2175}.tar::/data/LAION-400M/{00000..41455}.tar"`.
 Using `--dataset-resampled` is recommended for these cases.
 
 By default, on expectation the amount of times the model will see a sample from each source is proportional to the size of the source.
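The two training-related hunks above describe CLI conventions rather than code, so here is a small illustrative sketch (plain Python string handling, not part of this diff) of how the documented `--train-data` and `--train-num-samples` values fit together: several webdataset sources are joined into one string with `::`, and for very large datasets the per-"epoch" sample count is set to a fraction of the full dataset alongside `--dataset-resampled`. The dataset size below is an assumption back-derived from the README's 1/16 example.

```python
# Illustrative only: compose the CLI values described in the README hunks above.
cc12m_shards = "/data/cc12m/cc12m-train-{0000..2175}.tar"  # shard pattern from the README
laion_shards = "/data/LAION-400M/{00000..41455}.tar"       # shard pattern from the README

# Multiple data sources are passed as a single --train-data string joined by "::".
train_data = "::".join([cc12m_shards, laion_shards])

# For very large datasets, train on a fraction of an "epoch" between checkpoints,
# together with --dataset-resampled for sampling with replacement.
assumed_laion2b_samples = 2_170_337_248            # assumed total; 1/16 reproduces the README figure
train_num_samples = assumed_laion2b_samples // 16  # == 135646078

args = [
    "--train-data", train_data,
    "--dataset-resampled",
    "--train-num-samples", str(train_num_samples),
]
print(" ".join(args))
```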
@@ -591,86 +591,93 @@ Future trained models will use nn.GELU.
 ```python
 >>> import open_clip
 >>> open_clip.list_pretrained()
-[('RN50', 'openai'),
-('RN50', 'yfcc15m'),
-('RN50', 'cc12m'),
-('RN50-quickgelu', 'openai'),
-('RN50-quickgelu', 'yfcc15m'),
-('RN50-quickgelu', 'cc12m'),
-('RN101', 'openai'),
-('RN101', 'yfcc15m'),
-('RN101-quickgelu', 'openai'),
-('RN101-quickgelu', 'yfcc15m'),
-('RN50x4', 'openai'),
-('RN50x16', 'openai'),
-('RN50x64', 'openai'),
-('ViT-B-32', 'openai'),
-('ViT-B-32', 'laion400m_e31'),
-('ViT-B-32', 'laion400m_e32'),
-('ViT-B-32', 'laion2b_e16'),
-('ViT-B-32', 'laion2b_s34b_b79k'),
-('ViT-B-32', 'datacomp_m_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
-('ViT-B-32', 'commonpool_m_s128m_b4k'),
-('ViT-B-32', 'datacomp_s_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
-('ViT-B-32', 'commonpool_s_s13m_b4k'),
-('ViT-B-32-quickgelu', 'openai'),
-('ViT-B-32-quickgelu', 'laion400m_e31'),
-('ViT-B-32-quickgelu', 'laion400m_e32'),
-('ViT-B-16', 'openai'),
-('ViT-B-16', 'laion400m_e31'),
-('ViT-B-16', 'laion400m_e32'),
-('ViT-B-16', 'laion2b_s34b_b88k'),
-('ViT-B-16', 'datacomp_l_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_clip_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_laion_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_image_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_text_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_basic_s1b_b8k'),
-('ViT-B-16', 'commonpool_l_s1b_b8k'),
-('ViT-B-16-plus-240', 'laion400m_e31'),
-('ViT-B-16-plus-240', 'laion400m_e32'),
-('ViT-L-14', 'openai'),
-('ViT-L-14', 'laion400m_e31'),
-('ViT-L-14', 'laion400m_e32'),
-('ViT-L-14', 'laion2b_s32b_b82k'),
-('ViT-L-14', 'datacomp_xl_s13b_b90k'),
-('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'),
-('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'),
-('ViT-L-14', 'commonpool_xl_s13b_b90k'),
-('ViT-L-14-336', 'openai'),
-('ViT-H-14', 'laion2b_s32b_b79k'),
-('ViT-g-14', 'laion2b_s12b_b42k'),
-('ViT-g-14', 'laion2b_s34b_b88k'),
-('ViT-bigG-14', 'laion2b_s39b_b160k'),
-('roberta-ViT-B-32', 'laion2b_s12b_b32k'),
-('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'),
-('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'),
-('convnext_base', 'laion400m_s13b_b51k'),
-('convnext_base_w', 'laion2b_s13b_b82k'),
-('convnext_base_w', 'laion2b_s13b_b82k_augreg'),
-('convnext_base_w', 'laion_aesthetic_s13b_b82k'),
-('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'),
-('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'),
-('convnext_large_d', 'laion2b_s26b_b102k_augreg'),
-('convnext_large_d_320', 'laion2b_s29b_b131k_ft'),
-('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'),
-('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'),
-('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'),
-('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'),
-('coca_ViT-B-32', 'laion2b_s13b_b90k'),
-('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'),
-('coca_ViT-L-14', 'laion2b_s13b_b90k'),
-('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k')
+[('RN50', 'openai'),
+('RN50', 'yfcc15m'),
+('RN50', 'cc12m'),
+('RN50-quickgelu', 'openai'),
+('RN50-quickgelu', 'yfcc15m'),
+('RN50-quickgelu', 'cc12m'),
+('RN101', 'openai'),
+('RN101', 'yfcc15m'),
+('RN101-quickgelu', 'openai'),
+('RN101-quickgelu', 'yfcc15m'),
+('RN50x4', 'openai'),
+('RN50x16', 'openai'),
+('RN50x64', 'openai'),
+('ViT-B-32', 'openai'),
+('ViT-B-32', 'laion400m_e31'),
+('ViT-B-32', 'laion400m_e32'),
+('ViT-B-32', 'laion2b_e16'),
+('ViT-B-32', 'laion2b_s34b_b79k'),
+('ViT-B-32', 'datacomp_m_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
+('ViT-B-32', 'commonpool_m_s128m_b4k'),
+('ViT-B-32', 'datacomp_s_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
+('ViT-B-32', 'commonpool_s_s13m_b4k'),
+('ViT-B-32-quickgelu', 'openai'),
+('ViT-B-32-quickgelu', 'laion400m_e31'),
+('ViT-B-32-quickgelu', 'laion400m_e32'),
+('ViT-B-16', 'openai'),
+('ViT-B-16', 'laion400m_e31'),
+('ViT-B-16', 'laion400m_e32'),
+('ViT-B-16', 'laion2b_s34b_b88k'),
+('ViT-B-16', 'datacomp_l_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_clip_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_laion_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_image_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_text_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_basic_s1b_b8k'),
+('ViT-B-16', 'commonpool_l_s1b_b8k'),
+('ViT-B-16-plus-240', 'laion400m_e31'),
+('ViT-B-16-plus-240', 'laion400m_e32'),
+('ViT-L-14', 'openai'),
+('ViT-L-14', 'laion400m_e31'),
+('ViT-L-14', 'laion400m_e32'),
+('ViT-L-14', 'laion2b_s32b_b82k'),
+('ViT-L-14', 'datacomp_xl_s13b_b90k'),
+('ViT-L-14', 'commonpool_xl_clip_s13b_b90k'),
+('ViT-L-14', 'commonpool_xl_laion_s13b_b90k'),
+('ViT-L-14', 'commonpool_xl_s13b_b90k'),
+('ViT-L-14-336', 'openai'),
+('ViT-H-14', 'laion2b_s32b_b79k'),
+('ViT-g-14', 'laion2b_s12b_b42k'),
+('ViT-g-14', 'laion2b_s34b_b88k'),
+('ViT-bigG-14', 'laion2b_s39b_b160k'),
+('roberta-ViT-B-32', 'laion2b_s12b_b32k'),
+('xlm-roberta-base-ViT-B-32', 'laion5b_s13b_b90k'),
+('xlm-roberta-large-ViT-H-14', 'frozen_laion5b_s13b_b90k'),
+('convnext_base', 'laion400m_s13b_b51k'),
+('convnext_base_w', 'laion2b_s13b_b82k'),
+('convnext_base_w', 'laion2b_s13b_b82k_augreg'),
+('convnext_base_w', 'laion_aesthetic_s13b_b82k'),
+('convnext_base_w_320', 'laion_aesthetic_s13b_b82k'),
+('convnext_base_w_320', 'laion_aesthetic_s13b_b82k_augreg'),
+('convnext_large_d', 'laion2b_s26b_b102k_augreg'),
+('convnext_large_d_320', 'laion2b_s29b_b131k_ft'),
+('convnext_large_d_320', 'laion2b_s29b_b131k_ft_soup'),
+('convnext_xxlarge', 'laion2b_s34b_b82k_augreg'),
+('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_rewind'),
+('convnext_xxlarge', 'laion2b_s34b_b82k_augreg_soup'),
+('coca_ViT-B-32', 'laion2b_s13b_b90k'),
+('coca_ViT-B-32', 'mscoco_finetuned_laion2b_s13b_b90k'),
+('coca_ViT-L-14', 'laion2b_s13b_b90k'),
+('coca_ViT-L-14', 'mscoco_finetuned_laion2b_s13b_b90k'),
+('EVA01-g-14', 'laion400m_s11b_b41k'),
+('EVA01-g-14-plus', 'merged2b_s11b_b114k'),
+('EVA02-B-16', 'merged2b_s8b_b131k'),
+('EVA02-L-14', 'merged2b_s4b_b131k'),
+('EVA02-L-14-336', 'merged2b_s6b_b61k'),
+('EVA02-E-14', 'laion2b_s4b_b115k'),
+('EVA02-E-14-plus', 'laion2b_s9b_b144k')
 ]
 >>> model, train_transform, eval_transform = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
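Since the hunk above only extends the pretrained listing, a brief usage sketch (not part of this diff) shows how one of the new entries would be tried. The `EVA02-B-16` / `merged2b_s8b_b131k` pair is taken from the added list, the blank image and prompt strings are placeholders, and the EVA configs wrap a timm vision tower, so `timm` needs to be installed.

```python
# Sketch: load one of the newly listed EVA checkpoints via the usual open_clip API.
import torch
from PIL import Image
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms(
    'EVA02-B-16', pretrained='merged2b_s8b_b131k')
tokenizer = open_clip.get_tokenizer('EVA02-B-16')
model.eval()

image = preprocess(Image.new("RGB", (224, 224), "white")).unsqueeze(0)  # placeholder image
text = tokenizer(["a diagram", "a dog", "a cat"])                       # placeholder prompts

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(text_probs)
```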
@@ -806,6 +813,16 @@ If you found this repository useful, please consider citing:
 }
 ```
 
+```bibtex
+@inproceedings{cherti2023reproducible,
+  title={Reproducible scaling laws for contrastive language-image learning},
+  author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={2818--2829},
+  year={2023}
+}
+```
+
 ```bibtex
 @inproceedings{Radford2021LearningTV,
   title={Learning Transferable Visual Models From Natural Language Supervision},
diff --git a/src/open_clip/factory.py b/src/open_clip/factory.py
index 9c141256b..8eb507576 100644
--- a/src/open_clip/factory.py
+++ b/src/open_clip/factory.py
@@ -102,6 +102,10 @@ def load_checkpoint(model, checkpoint_path, strict=True):
     # detect old format and make compatible with new format
     if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
         state_dict = convert_to_custom_text_state_dict(state_dict)
+    # Certain text transformers no longer expect position_ids after transformers==4.31
+    position_id_key = 'text.transformer.embeddings.position_ids'
+    if position_id_key in state_dict and not hasattr(model, position_id_key):
+        del state_dict[position_id_key]
     resize_pos_embed(state_dict, model)
     incompatible_keys = model.load_state_dict(state_dict, strict=strict)
     return incompatible_keys
diff --git a/src/open_clip/loss.py b/src/open_clip/loss.py
index 9a4377cb6..638763597 100644
--- a/src/open_clip/loss.py
+++ b/src/open_clip/loss.py
@@ -159,8 +159,11 @@ def __init__(
 
     def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
-        clip_loss = super().forward(image_features, text_features, logit_scale)
-        clip_loss = self.clip_loss_weight * clip_loss
+        clip_loss = torch.tensor(0)
+
+        if self.clip_loss_weight:
+            clip_loss = super().forward(image_features, text_features, logit_scale)
+            clip_loss = self.clip_loss_weight * clip_loss
 
         caption_loss = self.caption_loss(
             logits.permute(0, 2, 1),
diff --git a/src/training/main.py b/src/training/main.py
index 2929d0121..4f2172808 100644
--- a/src/training/main.py
+++ b/src/training/main.py
@@ -232,7 +232,7 @@ def main(args):
         output_dict=True,
     )
     if args.distill:
-        # FIXME: currenlty assumes the model your distilling from has the same tokenizer & transforms.
+        # FIXME: currently assumes the model you're distilling from has the same tokenizer & transforms.
         dist_model, _, _ = create_model_and_transforms(
             args.distill_model,
             args.distill_pretrained,
diff --git a/src/training/params.py b/src/training/params.py
index 31c841791..c33312f80 100644
--- a/src/training/params.py
+++ b/src/training/params.py
@@ -370,13 +370,13 @@ def parse_args(args):
         "--lock-text-unlocked-layers",
         type=int,
         default=0,
-        help="Leave last n image tower layer groups unlocked.",
+        help="Leave last n text tower layer groups unlocked.",
     )
     parser.add_argument(
         "--lock-text-freeze-layer-norm",
         default=False,
         action='store_true',
-        help="Freeze BatchNorm running stats in image tower for any locked layers.",
+        help="Freeze BatchNorm running stats in text tower for any locked layers.",
     )
     parser.add_argument(
         "--log-every-n-steps",
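A self-contained sketch of why the `load_checkpoint` change above is needed; the tiny module and key names here are made up for illustration, and only the `text.transformer.embeddings.position_ids` key and the transformers==4.31 note come from the diff itself. Checkpoints saved while the HF text tower still registered `position_ids` as a persistent buffer carry a key that newer model instances no longer have, so strict loading fails unless the stale key is dropped first.

```python
# Toy illustration of the stale-buffer problem handled in factory.load_checkpoint.
import torch
import torch.nn as nn


class TinyTextTower(nn.Module):
    """Stand-in for a text tower that no longer registers a position_ids buffer."""

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(16, 8)


model = TinyTextTower()

# Pretend this state dict came from an older checkpoint that still stored position_ids.
old_state_dict = dict(model.state_dict())
old_state_dict["position_ids"] = torch.arange(16)

try:
    model.load_state_dict(old_state_dict, strict=True)
except RuntimeError as err:
    print("strict load fails on the unexpected key:", err)

# As in the factory.py hunk: drop the key the current model no longer expects.
old_state_dict.pop("position_ids", None)
model.load_state_dict(old_state_dict, strict=True)
print("strict load succeeds once the stale buffer is removed")
```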