old action log probs should be the true distribution in the kl div loss, addressing #43
lucidrains · Mar 22, 2023 · d4faf48
1 parent ad001d8
commit d4faf48
Show file tree

Hide file tree

Showing 2 changed files with 2 additions and 2 deletions.
diff --git a/palm_rlhf_pytorch/ppo.py b/palm_rlhf_pytorch/ppo.py
@@ -505,7 +505,7 @@ def learn(
                 kl_div_loss = 0.
 
                 if self.kl_div_loss_weight > 0:
-                    kl_div_loss = masked_kl_div(action_probs, old_action_probs, mask = action_masks) * self.kl_div_loss_weight
+                    kl_div_loss = masked_kl_div(old_action_probs, action_probs, mask = action_masks) * self.kl_div_loss_weight
 
                 # handle non-pooled values
 

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'PaLM-rlhf-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.2',
+  version = '0.1.4',
   license='MIT',
   description = 'PaLM + Reinforcement Learning with Human Feedback - Pytorch',
   author = 'Phil Wang',