diff --git a/DESCRIPTION b/DESCRIPTION
index 14fdb7c..0057590 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -30,7 +30,9 @@ Suggests:
     rmarkdown,
     R.rsp,
     lintr (>= 1.0.1),
-    rex
+    rex,
+    mlr3,
+    mlr3survival
 Imports:
     backports (>= 1.1.0),
     BBmisc (>= 1.11),
diff --git a/NAMESPACE b/NAMESPACE
index 4d7ad6c..3f7487c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -31,10 +31,12 @@ export(clearOMLCache)
 export(convertMlrLearnerToOMLFlow)
 export(convertMlrTaskToOMLDataSet)
 export(convertOMLDataSetToMlr)
+export(convertOMLDataSetToMlr3)
 export(convertOMLFlowToMlr)
 export(convertOMLMlrRunToBMR)
 export(convertOMLRunToBMR)
 export(convertOMLTaskToMlr)
+export(convertOMLTaskToMlr3)
 export(deleteOMLObject)
 export(extractOMLStudyIds)
 export(getCachedOMLDataSetStatus)
diff --git a/R/convertOMLDataSetToMlr3.R b/R/convertOMLDataSetToMlr3.R
new file mode 100644
index 0000000..6e2d154
--- /dev/null
+++ b/R/convertOMLDataSetToMlr3.R
@@ -0,0 +1,123 @@
+#' @title Convert an OpenML data set to mlr3 task.
+#'
+#' @description
+#' Converts an \code{\link{OMLDataSet}} to a \code{\link[mlr3]{Task}}.
+#'
+#' @param obj [\code{\link{OMLDataSet}}]\cr
+#'   The object that should be converted.
+#' @param mlr.task.id [\code{character(1)}]\cr
+#'   Id string for \code{\link[mlr3]{Task}} object.
+#'   The strings \code{<oml.data.name>}, \code{<oml.data.id>} and \code{<oml.data.version>}
+#'   will be replaced by their respective values contained in the \code{\link{OMLDataSet}} object.
+#'   Default is \code{<oml.data.name>}.
+#' @param task.type [\code{character(1)}]\cr
+#'   As we only pass the data set, we need to define the task type manually.
+#'   Possible are: \dQuote{Supervised Classification}, \dQuote{Supervised Regression},
+#'   \dQuote{Survival Analysis}.
+#'   Default is \code{NULL} which means to guess it from the target column in the
+#'   data set. If that is a factor or a logical, we choose classification.
+#'   If it is numeric we choose regression. In all other cases an error is thrown.
+#' @param target [\code{character}]\cr
+#'   The target for the classification/regression task.
+#'   Default is the \code{default.target.attribute} of the \code{\link{OMLDataSetDescription}}.
+#' @param ignore.flagged.attributes [\code{logical(1)}]\cr
+#'   Should those features that are listed in the data set description slot \dQuote{ignore.attribute}
+#'   be removed?
+#'   Default is \code{TRUE}.
+#' @param drop.levels [\code{logical(1)}]\cr
+#'   Should empty factor levels be dropped in the data?
+#'   Default is \code{TRUE}.
+#' @param fix.colnames [\code{logical(1)}]\cr
+#'   Should colnames of the data be fixed using \code{\link[base]{make.names}}?
+#'   Default is \code{TRUE}.
+#' @template arg_verbosity
+#' @return [\code{\link[mlr3]{Task}}].
+#' @family data set-related functions
+#' @example /inst/examples/convertOMLDataSetToMlr3.R
+#' @export
+convertOMLDataSetToMlr3 = function(
+  obj,
+  mlr.task.id = "<oml.data.name>",
+  task.type = NULL,
+  target = obj$desc$default.target.attribute,
+  ignore.flagged.attributes = TRUE,
+  drop.levels = TRUE,
+  fix.colnames = TRUE,
+  verbosity = NULL) {
+
+  assertClass(obj, "OMLDataSet")
+  assertSubset(target, obj$colnames.new)
+  assertFlag(ignore.flagged.attributes)
+  assertFlag(drop.levels)
+  assertFlag(fix.colnames)
+
+  data = obj$data
+  desc = obj$desc
+
+  # no task type? guess it by looking at target
+  if (is.null(task.type))
+    task.type = guessTaskType(data[, target])
+  assertChoice(task.type, getValidTaskTypes())
+
+  # remove ignored attributes from data (scalar condition, hence the short-circuiting &&)
+  if (ignore.flagged.attributes && any(!is.na(desc$ignore.attribute))) {
+    keep.cols = obj$colnames.old %nin% desc$ignore.attribute
+    data = data[, keep.cols, drop = FALSE]
+  }
+
+  # drop levels
+  if (drop.levels)
+    data = droplevels(data)
+
+  # fix colnames using make.names; the target is renamed by column position so it follows its column
+  if (fix.colnames) {
+    new.names = make.names(colnames(data), unique = TRUE)
+    pos = match(target, colnames(data))
+    target = ifelse(is.na(pos), make.names(target, unique = TRUE), new.names[pos])
+    colnames(data) = new.names
+  }
+
+  # `verbosity` is not consumed below; it is accepted for interface compatibility only
+  mlr.task = switch(task.type,
+    "Supervised Classification" = mlr3::TaskClassif$new(id = desc$name, backend = data, target = target),
+    "Supervised Regression" = mlr3::TaskRegr$new(id = desc$name, backend = data, target = target),
+    "Survival Analysis" = mlr3survival::TaskSurv$new(id = desc$name, backend = data, target = target),
+    stopf("Encountered currently unsupported task type: %s", task.type)
+  )
+
+  if (!is.null(mlr.task.id))
+    mlr.task$id = replaceOMLDataSetString(mlr.task.id, obj)
+
+  return(mlr.task)
+}
+
+# Replace the data set placeholders in `string` by the matching fields of `data.set$desc`.
+replaceOMLDataSetString = function(string, data.set) {
+  string = stri_replace_all_fixed(string, "<oml.data.id>", data.set$desc$id)
+  string = stri_replace_all_fixed(string, "<oml.data.name>", data.set$desc$name)
+  stri_replace_all_fixed(string, "<oml.data.version>", data.set$desc$version)
+}
+
+# @title Helper to guess task type from target column format.
+#
+# @param target [vector]
+#   Vector of target values.
+# @return [character(1)]
+guessTaskType = function(target) {
+  if (inherits(target, "data.frame")) {
+    assertDataFrame(target, types = "logical")
+    return("Multilabel")
+  }
+  if (is.factor(target) || is.logical(target))
+    return("Supervised Classification")
+  if (is.numeric(target))
+    return("Supervised Regression")
+
+  stopf("Cannot guess task.type from data!")
+}
+
+# Task type names accepted by the assertChoice() check in convertOMLDataSetToMlr3.
+getValidTaskTypes = function() {
+  c("Supervised Classification", "Supervised Regression", "Survival Analysis", "Multilabel")
+}
+
diff --git a/R/convertOMLSplitsToMlr3.R b/R/convertOMLSplitsToMlr3.R
new file mode 100644
index 0000000..d64823d
--- /dev/null
+++ b/R/convertOMLSplitsToMlr3.R
@@ -0,0 +1,38 @@
+# Convert an OpenML estimation procedure into an instantiated mlr3 resampling.
+#
+# @param estim.proc [list]
+#   Estimation procedure of an OpenML task; its `type` and `parameters` are read.
+# @param mlr.task [Task]
+#   mlr3 task the resampling is instantiated on.
+# @param predict [character(1)]
+#   Not used by this implementation; kept for interface compatibility.
+# @return The resampling object returned by `instantiate()`.
+convertOMLSplitsToMlr3 = function(estim.proc, mlr.task, predict = "both") {
+  type = estim.proc$type
+  params = estim.proc$parameters
+  n.repeats = params[["number_repeats"]]
+  n.folds = params[["number_folds"]]
+  # only the literal string "true" switches stratification on; NULL or NA mean "off"
+  stratified = isTRUE(params[["stratified_sampling"]] == "true")
+
+  if (type == "crossvalidation") {
+    if (n.repeats == 1L)
+      mlr.rdesc = mlr3::rsmp("cv", folds = n.folds, stratify = stratified)
+    else
+      mlr.rdesc = mlr3::rsmp("repeated_cv", reps = n.repeats, folds = n.folds, stratify = stratified)
+  } else if (type == "holdout") {
+    # `percentage` is taken to be the held-out (test) share, whereas mlr3's `ratio`
+    # is the training share -- TODO confirm against the OpenML task definition
+    percentage = as.numeric(params[["percentage"]])
+    if (length(percentage) == 1L && !is.na(percentage))
+      mlr.rdesc = mlr3::rsmp("holdout", ratio = 1 - percentage / 100)
+    else
+      mlr.rdesc = mlr3::rsmp("holdout")
+  } else {
+    stopf("Unsupported estimation procedure type: %s", type)
+  }
+
+  # NOTE(review): estim.proc$data.splits is not applied here; the splits come from instantiate()
+  mlr.rin = mlr.rdesc$instantiate(mlr.task)
+  return(mlr.rin)
+}
diff --git a/inst/examples/convertOMLDataSetToMlr3.R b/inst/examples/convertOMLDataSetToMlr3.R
new file mode 100644
index 0000000..b81efe1
--- /dev/null
+++ b/inst/examples/convertOMLDataSetToMlr3.R
@@ -0,0 +1,5 @@
+# \dontrun{
+# library("mlr3")
+# autosOML = getOMLDataSet(data.id = 9)
+# autosMlr3 = convertOMLDataSetToMlr3(autosOML)
+# }
diff 
--git a/tests/testthat/test_local_convertOMLDataSetToMlr3.R b/tests/testthat/test_local_convertOMLDataSetToMlr3.R
new file mode 100644
index 0000000..c591efa
--- /dev/null
+++ b/tests/testthat/test_local_convertOMLDataSetToMlr3.R
@@ -0,0 +1,48 @@
+context("convertOMLDataSetToMlr3")
+
+test_that("convertOMLDataSetToMlr3", {
+  skip_if_not_installed("mlr3")
+
+  with_test_cache({
+    ds = getOMLDataSet(10)
+
+    # expectations shared by the classification checks below
+    expect_is_mlr_task = function(mlr.task, ds) {
+      expect_equal(mlr.task$task_type, "classif")
+      expect_equal(mlr.task$nrow, nrow(ds$data))
+      expect_equal(ds$desc$default.target.attribute, mlr.task$target_names)
+    }
+
+    # now create the task
+    mlr.task = convertOMLDataSetToMlr3(ds)
+    expect_equal(mlr.task$task_type, "classif")
+
+    # now modify dataset by hand (no more server calls) to check
+    # ignore attributes stuff:
+    # Define the first two attributes as ignored attributes
+    ds$desc$ignore.attribute = colnames(ds$data[, 1:2])
+
+    mlr.task = convertOMLDataSetToMlr3(ds, ignore.flagged.attributes = TRUE)
+    expect_is_mlr_task(mlr.task, ds)
+    # two attributes were removed, so the task has two columns fewer than the raw data
+    expect_equal(mlr.task$ncol, ncol(ds$data) - 2L)
+
+    # pass faulty parameters
+    expect_error(convertOMLDataSetToMlr3(ds, task.type = "Nonexistent task type"), "element of")
+
+    # check setting mlr task id
+    expect_equal(convertOMLDataSetToMlr3(ds)$id, ds$desc$name)
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.name>.<oml.data.id>")$id,
+      sprintf("%s.%s", ds$desc$name, ds$desc$id))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "test")$id, "test")
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.id>")$id, as.character(ds$desc$id))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.name>")$id, as.character(ds$desc$name))
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.data.version>")$id, as.character(ds$desc$version))
+    # a placeholder that does not refer to the data set is left untouched
+    expect_equal(convertOMLDataSetToMlr3(ds, mlr.task.id = "<oml.task.id>")$id, "<oml.task.id>")
+
+    # check if conversion to regression task works
+    ds$desc$target.features = ds$desc$default.target.attribute = "no_of_nodes_in"
+    expect_equal(convertOMLDataSetToMlr3(ds)$task_type, "regr")
+  })
+})
diff --git a/tests/testthat/test_local_convertOMLSplitsToMlr3.R b/tests/testthat/test_local_convertOMLSplitsToMlr3.R
new file mode 100644
index 0000000..98e713a
--- /dev/null
+++ b/tests/testthat/test_local_convertOMLSplitsToMlr3.R
@@ -0,0 +1,27 @@
+context("convertOMLSplitsToMlr3")
+
+test_that("convertOMLSplitsToMlr3", {
+  skip_if_not_installed("mlr3")
+
+  with_test_cache({
+    task = getOMLTask(59)
+    mlr.task = convertOMLTaskToMlr3(task)$mlr.task
+
+    # OpenML estimation procedure type -> expected id of the mlr3 resampling
+    expected.ids = c(crossvalidation = "cv", holdout = "holdout")
+
+    for (oml.type in names(expected.ids)) {
+      task$input$estimation.procedure$type = oml.type
+      if (oml.type == "holdout") {
+        task$input$estimation.procedure$parameters$percentage = "50"
+      }
+      splits = convertOMLSplitsToMlr3(task$input$estimation.procedure, mlr.task)
+      expect_is(splits, "Resampling")
+      expect_equal(splits$id, expected.ids[[oml.type]])
+    }
+
+    # pass invalid estim.proc
+    task$input$estimation.procedure$type = "blabla"
+    expect_error(convertOMLSplitsToMlr3(task$input$estimation.procedure, mlr.task), "Unsupported estimation procedure type: blabla")
+  })
+})