openml · bakopyan · Sep 3, 2019 · Sep 9, 2019 · Sep 9, 2019 · Nov 5, 2019
diff --git a/R/convertOMLDataSetToMlr3.R b/R/convertOMLDataSetToMlr3.R
@@ -0,0 +1,123 @@
+#' @title Convert an OpenML data set to mlr3 task.
+#'
+#' @description
+#' Converts an \code{\link{OMLDataSet}} to a \code{\link[mlr3]{Task}}.
+#'
+#' @param obj [\code{\link{OMLDataSet}}]\cr
+#' The object that should be converted.
+#' @param mlr.task.id [\code{character(1)}]\cr
+#' Id string for \code{\link[mlr3]{Task}} object.
+#' The strings \code{<oml.data.name>}, \code{<oml.data.id>} and \code{<oml.data.version>}
+#' will be replaced by their respective values contained in the \code{\link{OMLDataSet}} object.
+#' Default is \code{<oml.data.name>}.
+#' @param task.type [\code{character(1)}]\cr
+#' As we only pass the data set, we need to define the task type manually.
+#' Possible are: \dQuote{Supervised Classification}, \dQuote{Supervised Regression},
+#' \dQuote{Survival Analysis}.
+#' Default is \code{NULL} which means to guess it from the target column in the
+#' data set. If that is a factor or a logical, we choose classification.
+#' If it is numeric we choose regression. In all other cases an error is thrown.
+#' @param target [\code{character}]\cr
+#' The target for the classification/regression task.
+#' Default is the \code{default.target.attribute} of the \code{\link{OMLDataSetDescription}}.
+#' @param ignore.flagged.attributes [\code{logical(1)}]\cr
+#' Should those features that are listed in the data set description slot \dQuote{ignore.attribute}
+#' be removed?
+#' Default is \code{TRUE}.
+#' @param drop.levels [\code{logical(1)}]\cr
+#' Should empty factor levels be dropped in the data?
+#' Default is \code{TRUE}.
+#' @param fix.colnames [\code{logical(1)}]\cr
+#' Should colnames of the data be fixed using \code{\link[base]{make.names}}?
+#' Default is \code{TRUE}.
+#' @template arg_verbosity
+#' @return [\code{\link[mlr3]{Task}}].
+#' @family data set-related functions
+#' @example /inst/examples/convertOMLDataSetToMlr3.R
+#' @export
+convertOMLDataSetToMlr3 = function(
+ obj,
+ mlr.task.id = "<oml.data.name>",
+ task.type = NULL,
+ target = obj$desc$default.target.attribute,
+ ignore.flagged.attributes = TRUE,
+ drop.levels = TRUE,
+ fix.colnames = TRUE,
+ verbosity = NULL) {
+
+ assertClass(obj, "OMLDataSet")
+ assertSubset(target, obj$colnames.new)
+ assertFlag(ignore.flagged.attributes)
+ assertFlag(drop.levels)
+
+ data = obj$data
+ desc = obj$desc
+
+ # no task type? guess it by looking at target
+ if (is.null(task.type))
+ task.type = guessTaskType(data[, target])
+ assertChoice(task.type, getValidTaskTypes())
+
+ # remove ignored attributes from data
+ if (any(!is.na(desc$ignore.attribute)) & ignore.flagged.attributes) {
+ keep.cols = obj$colnames.old %nin% desc$ignore.attribute
+ data = data[, keep.cols, drop = FALSE]
+ }
+
+ # drop levels
+ if (drop.levels)
+ data = droplevels(data)
+
+ # fix colnames using make.names
+ if (fix.colnames) {
+ colnames(data) = make.names(colnames(data), unique = TRUE)
+ target = make.names(target, unique = TRUE)
+ }
+
+ # get fixup verbose setting for mlr
+ if (is.null(verbosity))
+ verbosity = getOMLConfig()$verbosity
+ fixup = ifelse(verbosity == 0L, "quiet", "warn")
+
+ mlr.task = switch(task.type,
+ "Supervised Classification" = mlr3::TaskClassif$new(id = desc$name, backend = data, target = target),
+ "Supervised Regression" = mlr3::TaskRegr$new(id = desc$name, backend = data, target = target),
+ "Survival Analysis" = mlr3survival::TaskSurv$new(id = desc$name, backend = data, target = target),
+ stopf("Encountered currently unsupported task type: %s", task.type)
+ )
+
+ if (!is.null(mlr.task.id))
+ mlr.task$id = replaceOMLDataSetString(mlr.task.id, obj)
+
+ return(mlr.task)
+}
+
+replaceOMLDataSetString = function(string, data.set) {
+ string = stri_replace_all_fixed(string, "<oml.data.id>", data.set$desc$id)
+ string = stri_replace_all_fixed(string, "<oml.data.name>", data.set$desc$name)
+ stri_replace_all_fixed(string, "<oml.data.version>", data.set$desc$version)
+}
+
+# @title Helper to guess task type from target column format.
+#
+# @param target [vector]
+# Vector of target values.
+# @return [character(1)]
+guessTaskType = function(target) {
+ if (inherits(target, "data.frame")) {
+ assertDataFrame(target, types = "logical")
+ return("Multilabel")
+ } else {
+ if (is.factor(target) | is.logical(target))
+ return("Supervised Classification")
+ if (is.numeric(target))
+ return("Supervised Regression")
+ }
+
+ stopf("Cannot guess task.type from data!")
+}
+
+getValidTaskTypes = function() {
+ c("Supervised Classification", "Supervised Regression", "Survival Analysis", "Multilabel")
+}
+
diff --git a/R/convertOMLSplitsToMlr3.R b/R/convertOMLSplitsToMlr3.R
@@ -0,0 +1,24 @@
+convertOMLSplitsToMlr3 = function(estim.proc, mlr.task, predict = "both") {
+ type = estim.proc$type
+ n.repeats = estim.proc$parameters[["number_repeats"]]
+ n.folds = estim.proc$parameters[["number_folds"]]
+ percentage = as.numeric(estim.proc$parameters[["percentage"]])
+ data.splits = estim.proc$data.splits
+ stratified = estim.proc$parameters[["stratified_sampling"]]
+ stratified = ifelse(is.null(stratified), FALSE, stratified == "true")
+
+ if (type == "crossvalidation") {
+ if (n.repeats == 1L)
+ mlr.rdesc = mlr3::rsmp("cv", folds = n.folds, stratify = stratified)
+ else
+ mlr.rdesc = mlr3::rsmp("repeated_cv", reps = n.repeats, folds = n.folds, stratify = stratified)
+ mlr.rin = mlr.rdesc$instantiate(mlr.task)
+ } else if (type == "holdout") {
+ mlr.rdesc = mlr3::rsmp("holdout")
+ mlr.rin = mlr.rdesc$instantiate(task = mlr.task)
+ n.folds = 1
+ } else {
+ stopf("Unsupported estimation procedure type: %s", type)
+ }
+ return(mlr.rin)
+}
diff --git a/inst/examples/convertOMLDataSetToMlr3.R b/inst/examples/convertOMLDataSetToMlr3.R
@@ -0,0 +1,5 @@
+# \dontrun{
+# library("mlr3")
+# autosOML = getOMLDataSet(data.id = 9)
+# autosMlr3 = convertOMLDataSetToMlr3(autosOML)
+# }
diff --git a/tests/testthat/test_local_convertOMLDataSetToMlr3.R b/tests/testthat/test_local_convertOMLDataSetToMlr3.R
@@ -0,0 +1,45 @@
+context("convertOMLDataSetToMlr3")
+
+test_that("convertOMLDataSetToMlr3", {
+ with_test_cache({
+ ds = getOMLDataSet(10)
+
+ expect_is_mlr_task = function(mlr.task, ds) {
+ expect_equal(mlr.task$task_type, "classif")
+ expect_equal(mlr.task$nrow, nrow(ds$data))
+ expect_equal(ds$desc$default.target.attribute, mlr.task$target_names)
+ }
+
+ # now create the task
+ mlr.task = convertOMLDataSetToMlr3(ds)
+ expect_equal(mlr.task$task_type, "classif")
+
+ # now modify dataset by hand (no more server calls) to check
+ # ignore attributes stuff:
+ # Define the first two attributes as ignored attributes
+ ds$desc$ignore.attribute = colnames(ds$data[, 1:2])
+
+ mlr.task = convertOMLDataSetToMlr3(ds, ignore.flagged.attributes = TRUE)
+ expect_is_mlr_task(mlr.task, ds)
+ # we removed two attributes (and the target column is not considered here)
+ #expect_equal(sum(mlr.task$task.desc$n.feat), ncol(ds$data) - 3L)
+ expect_equal(mlr.task$ncol, ncol(ds$data) - 2L)
+
+ # pass faulty parameters
+ expect_error(convertOMLDataSetToMlr3(ds, task.type = "Nonexistent task type"), "element of")
+
+ # check setting mlr task id
+ expect_equal(convertOMLDataSetToMlr3(ds)$id, ds$desc$name)
+ expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.name>.<oml.data.id>")$id,
+ sprintf("%s.%s", ds$desc$name, ds$desc$id))
+ expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "test")$id, "test")
+ expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.id>")$id, as.character(ds$desc$id))
+ expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.name>")$id, as.character(ds$desc$name))
+ expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.version>")$id, as.character(ds$desc$version))
+ expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.task.id>")$id, "<oml.task.id>")
+
+ # check if conversion to regression task works
+ ds$desc$target.features = ds$desc$default.target.attribute = "no_of_nodes_in"
+ expect_equal(convertOMLDataSetToMlr3(ds)$task_type, "regr")
+ })
+})
diff --git a/tests/testthat/test_local_convertOMLSplitsToMlr3.R b/tests/testthat/test_local_convertOMLSplitsToMlr3.R
@@ -0,0 +1,25 @@
+context("convertOMLSplitsToMlr3")
+
+test_that("convertOMLSplitsToMlr3", {
+ with_test_cache({
+ task = getOMLTask(59)
+ mlr.task = convertOMLTaskToMlr3(task)$mlr.task
+
+ oml.types = c("crossvalidation", "holdout")
+ mlr.types = c("cv", "holdout")
+
+ for (i in seq_along(oml.types)) {
+ task$input$estimation.procedure$type = oml.types[i]
+ if (oml.types[i] == "holdout") {
+ task$input$estimation.procedure$parameters$percentage = "50"
+ }
+ splits = convertOMLSplitsToMlr3(task$input$estimation.procedure, mlr.task)
+ expect_is(splits, "Resampling")
+ expect_equal(splits$id, mlr.types[i])
+ }
+
+ # pass invalid estim.proc
+ task$input$estimation.procedure$type = "blabla"
+ expect_error(convertOMLSplitsToMlr3(task$input$estimation.procedure, mlr.task), "Unsupported estimation procedure type: blabla")
+ })
+})