diff --git a/NEWS.md b/NEWS.md
index de14313..a3ae20f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,7 +2,8 @@ contextual 0.9.8.4
 ==================
 
 * Minor documentation updates.
-* Fix for Exp3 bug (thanks, @leferrad !)
+* Fix for Exp3 bug (thanks, @leferrad)
+* Cleanup of propensity score related code (thanks again, @leferrad)
 * Updated tests.
 
 
@@ -11,7 +12,7 @@ contextual 0.9.8.3
 
 * Tested and confirmed to be R 4.0.0 proof.
 * Minor documentation updates.
-* Now correctly restores global seed on completing a simulation (thanks, @pstansell !)
+* Now correctly restores global seed on completing a simulation (thanks, @pstansell)
 
 
 contextual 0.9.8.2
diff --git a/R/bandit_offline_doubly_robust.R b/R/bandit_offline_doubly_robust.R
index abffa6b..488ef1b 100644
--- a/R/bandit_offline_doubly_robust.R
+++ b/R/bandit_offline_doubly_robust.R
@@ -9,15 +9,13 @@ OfflineDoublyRobustBandit <- R6::R6Class(
     n = NULL
   ),
   public = list(
-    inverted = NULL,
     threshold = NULL,
     class_name = "OfflineDoublyRobustBandit",
     initialize   = function(formula,
                             data, k = NULL, d = NULL,
                             unique = NULL, shared = NULL,
-                            inverted = FALSE, threshold = 0,
+                            threshold = 0,
                             randomize = TRUE) {
-      self$inverted  <- inverted
       self$threshold <- threshold
       super$initialize(formula,
                        data, k, d,
@@ -44,15 +42,11 @@ OfflineDoublyRobustBandit <- R6::R6Class(
       p <- private$p[index]
       indicator <- ind(private$z[index] == choice)
       if (indicator) {
-        p <- private$p[index]
-        if (self$inverted) p <- 1 / p
-        if (self$threshold > 0) {
-          if (isTRUE(self$inverted)) p <- 1 / p
-          p <- 1 / max(p,self$threshold)
-        } else {
-          if (!isTRUE(self$inverted)) p <- 1 / p
-        }
-        prop_reward <- (data_reward - model_reward) * p
+
+        p <- max(private$p[index], self$threshold) # when threshold 0 (default)
+                                                   # p = private$p[index]
+        w <- 1 / p
+        prop_reward <- (data_reward - model_reward) * w
       } else {
         prop_reward <- 0
       }
@@ -77,7 +71,7 @@ OfflineDoublyRobustBandit <- R6::R6Class(
 #'   bandit <- OfflineDoublyRobustBandit(formula,
 #'                                       data, k = NULL, d = NULL,
 #'                                       unique = NULL, shared = NULL,
-#'                                       inverted = FALSE, randomize = TRUE)
+#'                                       randomize = TRUE)
 #' }
 #'
 #' @section Arguments:
@@ -121,9 +115,6 @@ OfflineDoublyRobustBandit <- R6::R6Class(
 #'   \item{\code{shared}}{
 #'      integer vector; index of shared features (optional)
 #'   }
-#'   \item{\code{inverted}}{
-#'      logical; have the propensities been inverted (1/p) or not (p)?
-#'   }
 #'   \item{\code{threshold}}{
 #'      float (0,1); Lower threshold or Tau on propensity score values. Smaller Tau makes for less biased
 #'      estimates with more variance, and vice versa. For more information, see paper by Strehl at all (2010).
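
Editorial aside (not part of the patch): the change above drops the `inverted` switch, so OfflineDoublyRobustBandit now takes raw propensity scores and derives the weight as 1 / max(p, threshold). A minimal base-R sketch of that doubly robust correction term follows; the helper name `dr_correction` and its arguments are hypothetical and only mirror the new get_reward() logic.

# Illustrative sketch only -- not part of the patch. dr_correction() is a
# hypothetical helper mirroring the new get_reward() of OfflineDoublyRobustBandit.
dr_correction <- function(data_reward, model_reward, p, choice, logged_choice,
                          threshold = 0) {
  if (choice != logged_choice) return(0)   # indicator: only matching arms contribute
  p <- max(p, threshold)                   # clip the raw propensity from below
  w <- 1 / p                               # inverse propensity weight
  (data_reward - model_reward) * w         # correction added to the model estimate
}

# With threshold = 0.05, a logged propensity of 0.02 is clipped to 0.05:
dr_correction(data_reward = 1, model_reward = 0.4, p = 0.02,
              choice = 2, logged_choice = 2, threshold = 0.05)  # (1 - 0.4) / 0.05 = 12
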
diff --git a/R/bandit_offline_propensity_weighting.R b/R/bandit_offline_propensity_weighting.R
index 260ed60..2d6f8b0 100644
--- a/R/bandit_offline_propensity_weighting.R
+++ b/R/bandit_offline_propensity_weighting.R
@@ -10,7 +10,6 @@ OfflinePropensityWeightingBandit <- R6::R6Class(
   ),
   public = list(
     class_name = "OfflinePropensityWeightingBandit",
-    inverted = NULL,
     threshold = NULL,
     drop_value = NULL,
     stabilized = NULL,
@@ -18,10 +17,8 @@ OfflinePropensityWeightingBandit <- R6::R6Class(
                             data, k = NULL, d = NULL,
                             unique = NULL, shared = NULL,
                             randomize = TRUE, replacement = FALSE,
-                            jitter = FALSE, arm_multiply = FALSE,
-                            inverted = FALSE, threshold = 0,
+                            jitter = FALSE, arm_multiply = FALSE, threshold = 0,
                             stabilized = TRUE, drop_unequal_arm = TRUE) {
-      self$inverted   <- inverted
       self$threshold  <- threshold
       self$stabilized <- stabilized
       if(isTRUE(drop_unequal_arm)) {
@@ -47,20 +44,22 @@ OfflinePropensityWeightingBandit <- R6::R6Class(
     },
     get_reward = function(index, context, action) {
       if (private$z[[index]] == action$choice) {
-        p <- private$p[index]
-        if (self$threshold > 0) {
-          if (isTRUE(self$inverted)) p <- 1 / p
-          p <- 1 / max(p,self$threshold)
-        } else {
-          if (!isTRUE(self$inverted)) p <- 1 / p
-        }
+
+        p <- max(private$p[index], self$threshold) # when threshold 0 (default)
+                                                   # p = private$p[index]
+
+        w <- 1 / p
+
         if (self$stabilized) {
+
           inc(private$n) <- 1
-          inc(private$p_hat) <- (p - private$p_hat) / private$n
-          prop_reward <- as.double((private$y[index]*p)/private$p_hat)
+          inc(private$p_hat) <- (w - private$p_hat) / private$n
+          prop_reward <- as.double((private$y[index]*w)/private$p_hat)
+
         } else {
-          prop_reward <- as.double(private$y[index]*p)
+          prop_reward <- as.double(private$y[index]*w)
         }
+
         list(
           reward = prop_reward,
           optimal_reward = ifelse(private$or, as.double(private$S$optimal_reward[[index]]), NA),
@@ -94,8 +93,7 @@ OfflinePropensityWeightingBandit <- R6::R6Class(
 #'                                                  data, k = NULL, d = NULL,
 #'                                                  unique = NULL, shared = NULL,
 #'                                                  randomize = TRUE, replacement = TRUE,
-#'                                                  jitter = TRUE, arm_multiply = TRUE,
-#'                                                  inverted = FALSE)
+#'                                                  jitter = TRUE, arm_multiply = TRUE)
 #' }
 #'
 #' @section Arguments:
@@ -134,10 +132,7 @@ OfflinePropensityWeightingBandit <- R6::R6Class(
 #'   \item{\code{arm_multiply}}{
 #'      logical; multiply the horizon by the number of arms (optional, default: TRUE)
 #'   }
-#'   \item{\code{inverted}}{
-#'      logical; have the propensity scores been weighted (optional, default: FALSE)
-#'   }
- #'  \item{\code{threshold}}{
+#'   \item{\code{threshold}}{
 #'      float (0,1); Lower threshold or Tau on propensity score values. Smaller Tau makes for less biased
 #'      estimates with more variance, and vice versa. For more information, see paper by Strehl at all (2010).
 #'      Values between 0.01 and 0.05 are known to work well.
@@ -168,7 +163,7 @@ OfflinePropensityWeightingBandit <- R6::R6Class(
 #' \describe{
 #'
 #'   \item{\code{new(formula, data, k = NULL, d = NULL, unique = NULL, shared = NULL, randomize = TRUE,
-#'   replacement = TRUE, jitter = TRUE, arm_multiply = TRUE, inverted = FALSE)}}{
+#'   replacement = TRUE, jitter = TRUE, arm_multiply = TRUE)}}{
 #'     generates and instantializes a new \code{OfflinePropensityWeightingBandit} instance. }
 #'
 #'   \item{\code{get_context(t)}}{
diff --git a/demo/demo_simpsons_paradox_propensity.R b/demo/demo_simpsons_paradox_propensity.R
index 69a8b2b..8169656 100644
--- a/demo/demo_simpsons_paradox_propensity.R
+++ b/demo/demo_simpsons_paradox_propensity.R
@@ -214,13 +214,13 @@ print(paste("Movie:",round(sum(prop_dt[choice==2]$reward)/nrow(prop_dt[choice==2
 #                stop.method = "es.mean", verbose=FALSE)
 # b_dt$choice <- b_dt$choice + 1
 #
-# weights <- get.weights(ip, stop.method = "es.mean") # already inverted
+# weights <- get.weights(ip, stop.method = "es.mean")
 # b_dt$p <- weights
 #
 # f <- formula("reward ~ choice | X.1 + X.2 | p")
 #
 # bandit <- OfflinePropensityWeightingBandit$new(formula = f, data = b_dt,
-#                                                k = 2 , d = 2, inverted = TRUE)
+#                                                k = 2 , d = 2)
 # policy <- EpsilonGreedyPolicy$new(0.1)
 # agent <- Agent$new(policy, bandit, "prop")
 #
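
Editorial aside (not part of the patch): the OfflinePropensityWeightingBandit hunks above rework the stabilized branch of get_reward() around a clipped inverse propensity weight w = 1 / max(p, threshold); with the default threshold = 0 this reduces to 1 / p, matching the removed inverted = FALSE path. A minimal base-R sketch of that self-normalized (stabilized) estimate follows; the function name `stabilized_prop_rewards` is hypothetical and not part of the package API.

# Illustrative sketch only -- not part of the patch. Mirrors the stabilized
# branch of OfflinePropensityWeightingBandit$get_reward() in plain R.
stabilized_prop_rewards <- function(y, p, threshold = 0) {
  w_bar <- 0                                  # running mean of weights (p_hat in the class)
  n     <- 0                                  # number of matched observations so far
  out   <- numeric(length(y))
  for (i in seq_along(y)) {
    w      <- 1 / max(p[i], threshold)        # clipped inverse propensity weight
    n      <- n + 1
    w_bar  <- w_bar + (w - w_bar) / n         # incremental mean, as inc() does in the class
    out[i] <- (y[i] * w) / w_bar              # stabilized propensity-weighted reward
  }
  out
}

# Raw (non-inverted) propensities; the value below the threshold gets clipped:
stabilized_prop_rewards(y = c(1, 0, 1), p = c(0.5, 0.25, 0.02), threshold = 0.05)
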