Release v0.11.1
As submitted to CRAN:

* Update the apparel example data
* Prediction bootstrapping: Calculate confidence intervals using regular rather than "reversed-quantiles"
* Prediction bootstrapping: Re-fit model using exact original specification
* GGomNBD: Set limit in integration method to size of workspace
pschil authored Oct 13, 2024
2 parents af088c4 + 7b13db1 commit 38d23a3
Showing 106 changed files with 1,027 additions and 1,423 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: CLVTools
Title: Tools for Customer Lifetime Value Estimation
Version: 0.11.0
Date: 2024-08-15
Version: 0.11.1
Date: 2024-10-10
Authors@R: c(
person(given="Patrick", family="Bachmann", email = "pbachma@ethz.ch", role = c("cre","aut")),
person(given="Niels", family="Kuebler", email = "niels.kuebler@uzh.ch", role = "aut"),
@@ -154,7 +154,7 @@ Collate:
'pnbd_dyncov_createwalks.R'
'pnbd_dyncov_expectation.R'
'pnbd_dyncov_palive.R'
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
VignetteBuilder: knitr
Config/testthat/parallel: false
Config/testthat/edition: 3
12 changes: 12 additions & 0 deletions NEWS.md
@@ -1,3 +1,15 @@
# CLVTools 0.11.1

### NEW FEATURES
* Updated the apparel example data
* Prediction bootstrapping: Calculate confidence intervals using regular rather than "reversed-quantiles"

### BUG FIXES
* Prediction bootstrapping: Re-fit model using exact original specification
* GGomNBD: Set limit in integration method to size of workspace



# CLVTools 0.11.0

### NEW FEATURES
35 changes: 14 additions & 21 deletions R/clv_template_controlflow_predict.R
@@ -75,40 +75,33 @@ clv.controlflow.predict.add.uncertainty.estimates <- function(clv.fitted, dt.pre
message("Calculating confidence intervals...")
}

# Customers that are sampled multiple times are added to the boostrapping data with suffix "_BOOTSID_<i?"
# Customers that are sampled multiple times are added to the bootstrapping data with suffix "_BOOTSTRAP_ID_<i>"
# Remove this suffix again to get the original Id and calculate the quantiles across a single customer's multiple draws
# regex: "ends with _BOOTSTRAP_ID_<one or more digits>"
dt.boots[, Id := sub("_BOOTSTRAP_ID_[0-9]+$", "", Id)]

# quantiles for each predicted quantity
# select only the existing ones
# quantiles for each predicted quantity: select only the existing ones
cols.predictions <- c("PAlive", "CET", "DERT", "DECT", "predicted.mean.spending", "predicted.total.spending", "predicted.CLV")
cols.predictions <- cols.predictions[cols.predictions %in% colnames(dt.boots)]

# Long-format for easier handling of different prediction columns
dt.boots <- melt(dt.boots, id.vars="Id", measure.vars=cols.predictions, variable.name="variable", value.name="value")
dt.predictions.long <- melt(dt.predictions, id.vars="Id", measure.vars=cols.predictions, variable.name="variable", value.name="value")

# Calculate quantiles for each customer and prediction column
#
# Reversed quantiles
# [theta_star - q_upper(diff), theta_star - q_lower(diff)]
# where diff = theta_boot - theta_star
# Note that q_upper is used for the lower boundary and q_lower for the upper boundary while subtracting in both cases.
# Therefore quantile(probs=) is reversed.
ci.levels <- c((1-level)/2, 1-(1-level)/2)

# Calculate difference between bootstrapped and regular predictions
dt.boots[dt.predictions.long, value.star := i.value, on=c("Id", "variable")]
dt.boots[, value.diff := value - value.star]

levels <- c((1-level)/2, 1-(1-level)/2)
name.levels <- paste0(".CI.", levels*100) # outside table to avoid doing it for each customer
# create names outside table to avoid doing it for each customer
# only post-fix which is then appended to the content of col `variable`
ci.post.fixes <- paste0(".CI.", ci.levels*100)

# Calculate quantiles for each customer and prediction column, using
# ordinary quantiles
dt.CI <- dt.boots[, list(
ci.name=name.levels,
# Have to use value.star[1] because there are >1 row if sampled more than once.
# names=FALSE is considerably faster.
ci.value = value.star[1] - quantile(value.diff, probs = rev(levels), names = FALSE)),
# store the lower and upper CI name directly with the calculated value.
# This could be moved to `ci.name := paste0(variable, ci.name)` but is
# kept as a separate step to be safe.
ci.name=ci.post.fixes,
# names=FALSE is considerably faster
ci.value = quantile(value, probs = ci.levels, names = FALSE)),
keyby=c("Id", "variable")]

# Presentable names
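The change above swaps the "reversed-quantile" construction for ordinary percentile quantiles. The difference can be illustrated with a standalone sketch (plain R on made-up numbers, not package code; the names `value.star` and `ci.levels` mirror those in the hunk above):

```r
# Standalone sketch of the two CI constructions for one customer's
# bootstrapped predictions (illustrative data, not package code).
set.seed(42)
value <- rnorm(1000, mean = 5, sd = 2)  # bootstrapped predictions
value.star <- 5                         # prediction of the originally fitted model
level <- 0.9
ci.levels <- c((1 - level) / 2, 1 - (1 - level) / 2)

# Ordinary (percentile) quantiles, used from v0.11.1 on
ci.ordinary <- quantile(value, probs = ci.levels, names = FALSE)

# "Reversed" quantiles, used before v0.11.1:
# [theta_star - q_upper(diff), theta_star - q_lower(diff)], diff = theta_boot - theta_star
value.diff <- value - value.star
ci.reversed <- value.star - quantile(value.diff, probs = rev(ci.levels), names = FALSE)
```

For a symmetric bootstrap distribution the two intervals nearly coincide; they differ when the distribution is skewed.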
39 changes: 30 additions & 9 deletions R/data.R
@@ -26,10 +26,10 @@
#'
#' @description
#' This is a simulated dataset containing the entire purchase history of customers who made their first purchase at an
#' apparel retailer on January 3rd 2005. In total the dataset contains 250 customers who made
#' 3648 transactions between January 2005 and mid July 2006.
#' apparel retailer on January 2nd 2005. In total the dataset contains 600 customers who made
#' 3,187 transactions between January 2005 and the end of December 2010.
#'
#' @format A \code{data.table} with 2353 rows and 3 variables:
#' @format A \code{data.table} with 3,187 rows and 3 variables:
#' \describe{
#' \item{\code{Id}}{Customer Id}
#' \item{\code{Date}}{Date of purchase}
@@ -45,12 +45,12 @@

#' @name apparelStaticCov
#' @title Time-invariant Covariates for the Apparel Retailer Dataset
t

#' @description
#' This simulated data contains additional demographic information on all 250 customers in the
#' This simulated data contains additional demographic information on all 600 customers in the
#' "apparelTrans" dataset. This information can be used as time-invariant covariates.
#'
#' @format A \code{data.table} with 250 rows and 3 variables:
#' @format A \code{data.table} with 600 rows and 3 variables:
#'
#' \describe{
#' \item{Id}{Customer Id}
@@ -68,14 +68,14 @@ t
#' @title Time-varying Covariates for the Apparel Retailer Dataset

#' @description
#' This simulated data contains direct marketing information on all 250 customers in the "apparelTrans" dataset.
#' This simulated data contains seasonal information and additional covariates on all 600 customers in the "apparelTrans" dataset.
#' This information can be used as time-varying covariates.
#'
#' @format A data.table with 20500 rows and 5 variables
#' @format A data.table with 187,800 rows and 5 variables
#' \describe{
#' \item{Id}{Customer Id}
#' \item{Cov.Date}{Date of contextual factor}
#' \item{Marketing}{Direct marketing variable: number of times a customer was contacted with direct marketing in this time period}
#' \item{High.Season}{Seasonal variable: 1 indicating a time-period that is considered "high season".}
#' \item{Gender}{0=male, 1=female}
#' \item{Channel}{Acquisition channel: 0=online, 1=offline}
#' }
@@ -84,3 +84,24 @@ t
#' @usage data("apparelDynCov")
#' @docType data
"apparelDynCov"

#' @name apparelDynCovFuture
#' @title Future Time-varying Covariates for the Apparel Retailer Dataset

#' @description
#' This simulated data contains seasonal information and additional covariates for all 600 customers in the "apparelTrans" dataset, covering the period after the last transaction in that dataset.
#' This information can be used as time-varying covariates for predicting future customer behavior.
#'
#' @format A data.table with 56,400 rows and 5 variables
#' \describe{
#' \item{Id}{Customer Id}
#' \item{Cov.Date}{Date of contextual factor}
#' \item{High.Season}{Seasonal variable: 1 indicating a time-period that is considered "high season".}
#' \item{Gender}{0=male, 1=female}
#' \item{Channel}{Acquisition channel: 0=online, 1=offline}
#' }
#'
#' @keywords datasets
#' @usage data("apparelDynCovFuture")
#' @docType data
"apparelDynCovFuture"
4 changes: 3 additions & 1 deletion R/f_generics_clvfitted.R
@@ -23,9 +23,11 @@ setMethod(f = "clv.controlflow.predict.set.prediction.params", signature = signa
setMethod("clv.fitted.estimate.same.specification.on.new.data", signature = "clv.fitted", def = function(clv.fitted, newdata, ...){
cl <- match.call(expand.dots = TRUE)

# args to model function are the original specification args + `newdata` as
# actual data arg
args <- c(clv.fitted@model.specification.args, list(clv.data=newdata))

# overwrite with what was passed
# overwrite call args with what was passed
args <- modifyList(args, val = list(...), keep.null = TRUE)

new.fitted <- do.call(what = clv.fitted@clv.model@fn.model.generic, args=args)
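The pattern in this hunk — replay the stored specification, substitute the new data, and let explicitly passed arguments win — can be sketched in isolation (plain base R; `refit.same.spec` and `fn.toy` are hypothetical names, not CLVTools functions):

```r
# Minimal sketch of "re-fit with the exact original specification"
# (hypothetical helper, not part of CLVTools).
refit.same.spec <- function(fn.model, spec.args, newdata, ...) {
  # original specification args + the new (e.g. bootstrapped) data
  args <- c(spec.args, list(clv.data = newdata))
  # anything passed explicitly (e.g. start parameters) overrides the stored spec
  args <- modifyList(args, val = list(...), keep.null = TRUE)
  do.call(what = fn.model, args = args)
}

# toy "model function" to demonstrate the override behavior
fn.toy <- function(clv.data, start.param = 1) list(data = clv.data, start.param = start.param)
fitted.toy <- refit.same.spec(fn.toy, spec.args = list(start.param = 5),
                              newdata = "bootstrapped data", start.param = 9)
```

Here `fitted.toy$start.param` is 9: the explicitly passed value wins over the stored specification, just as the `modifyList()` call above lets `...` override `model.specification.args`.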
7 changes: 5 additions & 2 deletions R/f_generics_clvfittedspending.R
@@ -103,6 +103,7 @@ setMethod("clv.controlflow.predict.post.process.prediction.table", signature = s
setMethod(f = "clv.fitted.bootstrap.predictions",signature = signature(clv.fitted="clv.fitted.spending"), definition = function(clv.fitted, num.boots, verbose){

# Largely the same as for clv.fitted.transactions but with different arguments to predict()
# See there for more in-depth comments


if(verbose){
@@ -117,9 +118,11 @@ setMethod(f = "clv.fitted.bootstrap.predictions",signature = signature(clv.fitte
}
pb.i <- 0


boots.predict <- function(clv.boot){
pb.i <<- pb.i + 1
update.pb(n = pb.i)

return(predict(
object = clv.boot,
verbose = FALSE,
@@ -130,9 +133,9 @@ setMethod(f = "clv.fitted.bootstrap.predictions",signature = signature(clv.fitte
object = clv.fitted,
num.boots = num.boots,
fn.boot.apply = boots.predict,
fn.sample = NULL,
verbose = FALSE,
start.params.model = clv.fitted@prediction.params.model
fn.sample = NULL
)

return(rbindlist(l.boots))
})
19 changes: 7 additions & 12 deletions R/f_generics_clvfittedtransactions.R
@@ -344,31 +344,25 @@ setMethod("clv.controlflow.predict.new.customer", signature = signature(clv.fitt
# . clv.fitted.bootstrap.predictions --------------------------------------------
setMethod(f = "clv.fitted.bootstrap.predictions", signature = signature(clv.fitted="clv.fitted.transactions"), definition = function(clv.fitted, num.boots, verbose, prediction.end, predict.spending, continuous.discount.factor){

# have to explicitly give prediction.end because bootstrapping data has no holdout
if(is.null(prediction.end)){
boots.prediction.end <- clv.fitted@clv.data@clv.time@timepoint.holdout.end
}else{
boots.prediction.end <- prediction.end
}

if(verbose){
# Print message before progress bar is created
message("Bootstrapping ",num.boots," times for uncertainty estimates...")

progress.bar <- txtProgressBar(max = num.boots, style = 3)
update.pb <- function(n){setTxtProgressBar(pb=progress.bar, value = n)}
}else{
# has to be also defined if verbose=F because used in boots.predict
# also has to be defined if verbose=F because used in boots.predict
update.pb <- function(n){}
}
pb.i <- 0

# Method that is called on the bootstrapped data
boots.predict <- function(clv.boot){
pb.i <<- pb.i + 1
update.pb(n = pb.i)
return(predict(
object = clv.boot,
prediction.end = boots.prediction.end,
prediction.end = prediction.end,
verbose = FALSE,
predict.spending = predict.spending,
continuous.discount.factor = continuous.discount.factor,
@@ -377,11 +371,12 @@ setMethod(f = "clv.fitted.bootstrap.predictions", signature = signature(clv.fitt

l.boots <- clv.bootstrapped.apply(
object = clv.fitted,
fn.sample = NULL,
num.boots = num.boots,
fn.boot.apply = boots.predict,
fn.sample = NULL,
verbose = FALSE,
start.params.model = clv.fitted@prediction.params.model
# Fitting on bootstrapped data: Never verbose because does not mix well
# with status bar shown when verbose=TRUE
verbose = FALSE
)

return(rbindlist(l.boots))
9 changes: 5 additions & 4 deletions R/f_interface_lrtest.R
@@ -7,11 +7,12 @@
#' If not given, the default model names are used.
#'
#' @description
#' \code{lrtest} carrys out likelihood ratio tests to compare nested CLV models.
#' \code{lrtest} carries out likelihood ratio tests to compare nested CLV models
#' of the same family that were fitted on the same transaction data.
#'
#' The method consecutively compares the first model given in \code{object} with all the
#' other models passed in \code{...}. An asymptotic likelihood ratio test is carried out:
#' Twice the difference in log-likelihoods is compared with a Chi-squared distribution.
#' The method compares each pair of consecutive models.
#' An asymptotic likelihood ratio test is carried out: Twice the difference in
#' log-likelihoods is compared with a Chi-squared distribution.
#'
#' @param object A fitted model object inheriting from \code{clv.fitted}.
#' @param ... Other model objects fitted on the same transaction data
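The test statistic described above can be computed by hand; the sketch below uses made-up log-likelihood values (in practice `lrtest()` extracts them from the fitted objects):

```r
# Asymptotic likelihood ratio test by hand (illustrative numbers).
ll.restricted   <- -2045.3  # log-likelihood of the smaller, nested model
ll.unrestricted <- -2040.1  # log-likelihood of the model with 2 extra parameters
df.extra <- 2

# Twice the difference in log-likelihoods ~ Chi-squared(df.extra)
lr.stat <- 2 * (ll.unrestricted - ll.restricted)
p.value <- pchisq(lr.stat, df = df.extra, lower.tail = FALSE)
# lr.stat is 10.4; the small p-value favors the larger model
```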
16 changes: 8 additions & 8 deletions R/f_interface_newcustomer.R
@@ -31,7 +31,7 @@
#' For \code{newcustomer.dynamic()}: One column for every covariate parameter in the estimated model.
#' No column \code{Id}. A column \code{Cov.Date} with time points that mark the start of the period defined by \code{time.unit}.
#' For every \code{Cov.Date}, exactly 1 row of numeric covariate data. \cr
#' For example for weekly covariates: \code{data.frame(Cov.Date=c("2000-01-03", "2000-01-10"), Gender=c(1,1), High.Season=c(0, 1), Marketing=c(-0.5,1.12))} \cr
#' For example for weekly covariates: \code{data.frame(Cov.Date=c("2000-01-03", "2000-01-10"), Gender=c(1,1), Channel=c(1, 1), High.Season=c(0, 1))} \cr
#' If \code{Cov.Date} is of type character, the \code{date.format} given when creating the \code{clv.data} object is used to parse it.
#' The data has to cover the time from the customer's first transaction \code{first.transaction}
#' to the end of the prediction period given by \code{t}. It does not have to cover the same time range as when fitting the model.
@@ -53,7 +53,7 @@
#' data("apparelDynCov")
#'
#' clv.data.apparel <- clvdata(apparelTrans, date.format = "ymd",
#' time.unit = "w", estimation.split = 40)
#' time.unit = "w", estimation.split = 52)
#' clv.data.static.cov <-
#' SetStaticCovariates(clv.data.apparel,
#' data.cov.life = apparelStaticCov,
@@ -64,8 +64,8 @@
#' SetDynamicCovariates(clv.data = clv.data.apparel,
#' data.cov.life = apparelDynCov,
#' data.cov.trans = apparelDynCov,
#' names.cov.life = c("Marketing", "Gender"),
#' names.cov.trans = c("Marketing", "Gender"),
#' names.cov.life = c("High.Season", "Gender"),
#' names.cov.trans = c("High.Season", "Gender"),
#' name.date = "Cov.Date")
#'
#'
@@ -105,8 +105,8 @@
#' p.apparel.dyn <- pnbd(clv.data.dyn.cov)
#'
#' # Predict the number of transactions an average new
#' # customer who is male (Gender=0), who was contacted
#' # 4, 0, and 7 times with direct marketing, and who was
#' # customer who is male (Gender=0), whose prediction
#' # period does not fall into the high season, and who was
#' # acquired on "2005-02-16" (first.transaction) is expected
#' # to make in the first 2.12 weeks.
#' # Note that the time range is very different from the one used
@@ -119,11 +119,11 @@
#' data.cov.life=data.frame(
#' Cov.Date=c("2051-02-12", "2051-02-19", "2051-02-26"),
#' Gender=c(0, 0, 0),
#' Marketing=c(4, 0, 7)),
#' High.Season=c(0, 0, 0)),
#' data.cov.trans=data.frame(
#' Cov.Date=c("2051-02-12", "2051-02-19", "2051-02-26"),
#' Gender=c(0, 0, 0),
#' Marketing=c(4, 0, 7)),
#' High.Season=c(4, 0, 7)),
#' first.transaction = "2051-02-16"
#' )
#' )
6 changes: 3 additions & 3 deletions R/f_interface_pnbd.R
@@ -16,7 +16,7 @@
#'
#'
#' @details
#' Model parameters for the Pareto/NBD model are \code{alpha, r, beta, and s}. \cr
#' Model parameters for the Pareto/NBD model are \code{r, alpha, s, and beta}. \cr
#' \code{s}: shape parameter of the Gamma distribution for the lifetime process.
#' The smaller s, the stronger the heterogeneity of customer lifetimes. \cr
#' \code{beta}: rate parameter for the Gamma distribution for the lifetime process. \cr
@@ -104,8 +104,8 @@
#' SetDynamicCovariates(clv.data = clv.data.apparel,
#' data.cov.life = apparelDynCov,
#' data.cov.trans = apparelDynCov,
#' names.cov.life = c("Marketing", "Gender", "Channel"),
#' names.cov.trans = c("Marketing", "Gender", "Channel"),
#' names.cov.life = c("High.Season", "Gender", "Channel"),
#' names.cov.trans = c("High.Season", "Gender", "Channel"),
#' name.date = "Cov.Date")
#'
#'
2 changes: 1 addition & 1 deletion R/f_interface_predict_clvfittedspending.R
@@ -32,7 +32,7 @@
#'
#' # Fit gg model on data
#' apparel.holdout <- clvdata(apparelTrans, time.unit="w",
#' estimation.split=37, date.format="ymd")
#' estimation.split = 52, date.format = "ymd")
#' apparel.gg <- gg(apparel.holdout)
#'
#' # Predict customers' future mean spending per transaction
17 changes: 13 additions & 4 deletions R/f_interface_predict_clvfittedtransactions.R
@@ -80,12 +80,21 @@
#' These provide an estimate of parameter uncertainty.
#' To create bootstrapped data, customer ids are sampled with replacement until reaching original
#' length and all transactions of the sampled customers are used to create a new \code{clv.data} object.
#' A new model is fit on the bootstrapped data with the same specification as \code{object}
#' (incl. start parameters and `optimx.args`) and it is then used to predict on this data.
#' A new model is fit on the bootstrapped data with the exact same specification as used when
#' fitting \code{object} (incl. start parameters and `optimx.args`) and it is then used to predict on this data.
#'
#' It is highly recommended to fit the original model (\code{object}) with a robust optimization
#' method, such as Nelder-Mead (\code{optimx.args=list(method='Nelder-Mead')}).
#' This ensures that the model can also be fit on the bootstrapped data.
#'
#' All prediction parameters, incl. \code{prediction.end} and \code{continuous.discount.factor}, are forwarded
#' to the prediction on the bootstrapped data.
#' Per customer, confidence intervals of each predicted metric are created using a "reversed quantile" approach.
#' Per customer, the boundaries of the confidence intervals of each predicted metric are the
#' sample quantiles (\code{quantile(x, probs=c((1-level)/2, 1-(1-level)/2))}).
#'
#' See \link{clv.bootstrapped.apply} to create a custom bootstrapping procedure.
#'

#'
#'
#' @seealso models to predict transactions: \link{pnbd}, \link{bgnbd}, \link{ggomnbd}.
@@ -120,7 +129,7 @@
#' data("apparelTrans")
#' # Fit pnbd standard model on data, WITH holdout
#' apparel.holdout <- clvdata(apparelTrans, time.unit="w",
#' estimation.split=37, date.format="ymd")
#' estimation.split=52, date.format="ymd")
#' apparel.pnbd <- pnbd(apparel.holdout)
#'
#' # Predict until the end of the holdout period
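A sketch of requesting bootstrapped confidence intervals through this interface; it assumes the documented CLVTools `predict()` arguments `uncertainty`, `num.boots` and `level`, and that the package and example data are installed:

```r
library(CLVTools)
data("apparelTrans")

clv.apparel <- clvdata(apparelTrans, time.unit = "w",
                       estimation.split = 52, date.format = "ymd")

# Nelder-Mead is recommended so the model can also be re-fit
# on every bootstrapped sample
apparel.pnbd <- pnbd(clv.apparel, optimx.args = list(method = "Nelder-Mead"))

# 90% CIs from 50 bootstrap repetitions; prediction parameters such as
# prediction.end are forwarded to the predictions on the bootstrapped data
dt.pred <- predict(apparel.pnbd, uncertainty = "boots",
                   num.boots = 50, level = 0.9)
```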