add an option so that only the xml is downloaded (#347)

* add an option so that only the xml is downloaded
* redownload run if prediction not available
openml · Jun 9, 2017 · 0708f97 · 0708f97
1 parent 645761d
commit 0708f97
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 71 deletions.
diff --git a/R/convertOMLRunToBMR.R b/R/convertOMLRunToBMR.R
@@ -45,6 +45,10 @@ convertOMLRunToBMR = function(run, measures = run$task.evaluation.measure, recom
  task.desc = getTaskDesc(task$mlr.task)
 
  pred = run$predictions
+ if (is.null(pred)) {
+ run = getOMLRun(run$run.id, only.xml = FALSE) # download run again
+ pred = run$predictions
+ }
  if (min(pred$fold) == 0)
  pred$fold = (pred$fold + 1)
  if (min(pred[,"repeat"]) == 0)

diff --git a/R/downloadOMLObject.R b/R/downloadOMLObject.R
@@ -13,10 +13,12 @@
 #' Should files that are already in cache be overwritten?
 #' Default is \code{FALSE}.
 #' @template arg_cache_only
+#' @param only.xml [\code{logical(1)}]\cr
+#' Should only the XML be downloaded?
 #' @template arg_verbosity
 #' @keywords internal
 #' @return [list]
-downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), overwrite = FALSE, cache.only = FALSE, verbosity = NULL) {
+downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), overwrite = FALSE, cache.only = FALSE, only.xml = FALSE, verbosity = NULL) {
  id = asCount(id)
  assertChoice(object, choices = c("data", "task", "flow", "run"))
 
@@ -57,7 +59,6 @@ downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), over
  # stop(doc)
  # }
 
- ## now download files
  # get url of files
  if (object == "data") {
  url = xmlRValS(doc, "/oml:data_set_description/oml:url")
@@ -96,22 +97,25 @@ downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), over
  url = url[url != ""]
  }
 
- # download files if there is an url
- if (!is.null(url) & length(url) != 0) {
- if (f[[file.ind]]$found & !overwrite) {
- showInfo(verbosity, sprintf("%s '%i' file '%s' found in cache.", cap.obj, id, basename(f[[file.ind]]$path)))
- } else {
- url = stri_trim_both(url)
- showInfo(verbosity, "Downloading from '%s' to '%s'", url, f[[file.ind]]$path)
- resp = GET(url)
- content.resp = content(resp, as = "raw")
- if (is.vector(content.resp))
- writeBin(content.resp, f[[file.ind]]$path) else
- warningf("File not found at '%s'.", url)
- # set found = TRUE if downloaded file is in cache
- if (file.exists(f[[file.ind]]$path)) f[[file.ind]]$found = TRUE
+ if (!only.xml) {
+ # download files if there is an url
+ if (!is.null(url) & length(url) != 0) {
+ if (f[[file.ind]]$found & !overwrite) {
+ showInfo(verbosity, sprintf("%s '%i' file '%s' found in cache.", cap.obj, id, basename(f[[file.ind]]$path)))
+ } else {
+ url = stri_trim_both(url)
+ showInfo(verbosity, "Downloading from '%s' to '%s'", url, f[[file.ind]]$path)
+ resp = GET(url)
+ content.resp = content(resp, as = "raw")
+ if (is.vector(content.resp))
+ writeBin(content.resp, f[[file.ind]]$path) else
+ warningf("File not found at '%s'.", url)
+ # set found = TRUE if downloaded file is in cache
+ if (file.exists(f[[file.ind]]$path)) f[[file.ind]]$found = TRUE
+ }
  }
  }
+
  return(list(doc = doc, files = f))
 }
 

diff --git a/R/getOMLRun.R b/R/getOMLRun.R
@@ -7,68 +7,22 @@
 #' @param run.id [\code{integer(1)}]\cr
 #' The run ID.
 #' @template arg_cache_only
+#' @param only.xml [\code{logical(1)}]\cr
+#' Should only the XML be downloaded?
 #' @template arg_verbosity
 #' @return [\code{\link{OMLRun}}].
 #' @family downloading functions
 #' @family run-related functions
 #' @example inst/examples/getOMLRun.R
 #' @export
-getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
+getOMLRun = function(run.id, cache.only = FALSE, only.xml = FALSE, verbosity = NULL) {
  id = asCount(run.id)
  assertFlag(cache.only)
 
- down = downloadOMLObject(id, object = "run", cache.only = cache.only, verbosity = verbosity)
+ down = downloadOMLObject(id, object = "run", cache.only = cache.only, only.xml = only.xml, verbosity = verbosity)
  f = down$files
  doc = down$doc
 
- parseData = function(path) {
- # parse datasets
- path.ds = paste(path, "oml:dataset", sep ="/")
- ns.datasets = getNodeSet(doc, path.ds)
- datasets = lapply(seq_along(ns.datasets), function(i) {
- list(
- data.id = xmlRValR(doc, paste(path.ds, "[", i, "]/oml:did", sep = '')),
- name = xmlRValS(doc, paste(path.ds, "[", i, "]/oml:name", sep = '')),
- url = xmlRValS(doc, paste(path.ds, "[", i, "]/oml:url", sep = ''))
- )})
- datasets = convertListOfRowsToDataFrame(datasets, strings.as.factors = FALSE)
-
- # parse files
- path.fls = paste(path, "oml:file", sep ="/")
- ns.fls = getNodeSet(doc, path.fls)
- files = lapply(seq_along(ns.fls), function(i) {
- list(
- data.id = xmlRValR(doc, paste(path.fls, "[", i, "]/oml:did", sep='')),
- name = xmlRValS(doc, paste(path.fls, "[", i, "]/oml:name", sep='')),
- url = xmlRValS(doc, paste(path.fls, "[", i, "]/oml:url", sep=''))
- )})
- files = convertListOfRowsToDataFrame(files, strings.as.factors = FALSE)
-
- # parse evaluations
- path.evals = paste(path, "oml:evaluation", sep ="/")
- ns.evals = getNodeSet(doc, path.evals)
-
- evals = setDF(rbindlist(lapply(ns.evals, function(node) {
- children = xmlChildren(node)
- row = list(
- as.integer(xmlValue(children[["did"]])),
- xmlValue(children[["name"]]),
- xmlValue(children[["flow_id"]]),
- xmlValue(children[["label"]]),
- as.numeric(xmlValue(children[["value"]])),
- as.numeric(xmlValue(children[["stdev"]])),
- xmlValue(children[["array_data"]]),
- as.integer(xmlValue(children[["sample_size"]]))
- )
- cv.info = xmlAttrs(node)[c("repeat", "fold")]
- if (is.null(cv.info)) cv.info = c(NA, NA)
- row = c(row, cv.info)
- names(row) = c("data.id", "name", "flow_id", "label", "value", "stdev", "array.data", "sample.size", "repeat", "fold")
- row
- }), fill = TRUE))
- makeOMLIOData(datasets = datasets, files = files, evaluations = evals)
- }
-
  run.args = filterNull(list(
  run.id = xmlREValI(doc, "/oml:run/oml:run_id"),
  uploader = xmlREValI(doc, "/oml:run/oml:uploader"),
@@ -82,8 +36,8 @@ getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
  setup.string = xmlOValS(doc, "/oml:run/oml:setup_string"),
  error.message = xmlOValS(doc, "/oml:run/oml:error_message"),
  tags = xmlOValsMultNsS(doc, "/oml:run/oml:tag"),
- input.data = parseData("/oml:run/oml:input_data"),
- output.data = parseData("/oml:run/oml:output_data"),
+ input.data = parseData(doc, "/oml:run/oml:input_data"),
+ output.data = parseData(doc, "/oml:run/oml:output_data"),
  parameter.setting = list()
  ))
 
@@ -105,7 +59,7 @@ getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
  f = findCachedRun(run.args$run.id)
 
  if (!f$predictions.arff$found) {
- message("No URL found to retrieve predictions from.")
+ showInfo(verbosity, "No ARFF file containing the predictions found.")
  pred = NULL
  } else {
  #showInfo(verbosity, "Predictions found in cache.")
@@ -115,3 +69,51 @@ getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
 
  return(do.call(makeOMLRun, run.args))
 }
+
+parseData = function(doc, path) { # collect the datasets, files and evaluations found under XPath prefix `path` in `doc` and hand them to makeOMLIOData
+ # parse datasets: one (data.id, name, url) record per node matching <path>/oml:dataset
+ path.ds = paste(path, "oml:dataset", sep = "/")
+ ns.datasets = getNodeSet(doc, path.ds)
+ datasets = lapply(seq_along(ns.datasets), function(i) { # address each node by its positional XPath index [i]
+ list(
+ data.id = xmlRValR(doc, paste(path.ds, "[", i, "]/oml:did", sep = '')),
+ name = xmlRValS(doc, paste(path.ds, "[", i, "]/oml:name", sep = '')),
+ url = xmlRValS(doc, paste(path.ds, "[", i, "]/oml:url", sep = ''))
+ )})
+ datasets = convertListOfRowsToDataFrame(datasets, strings.as.factors = FALSE) # strings are not turned into factors
+
+ # parse files: same (data.id, name, url) record layout, one per node matching <path>/oml:file
+ path.fls = paste(path, "oml:file", sep = "/")
+ ns.fls = getNodeSet(doc, path.fls)
+ files = lapply(seq_along(ns.fls), function(i) {
+ list(
+ data.id = xmlRValR(doc, paste(path.fls, "[", i, "]/oml:did", sep = '')),
+ name = xmlRValS(doc, paste(path.fls, "[", i, "]/oml:name", sep = '')),
+ url = xmlRValS(doc, paste(path.fls, "[", i, "]/oml:url", sep = ''))
+ )})
+ files = convertListOfRowsToDataFrame(files, strings.as.factors = FALSE)
+
+ # parse evaluations: one row per node matching <path>/oml:evaluation, read from the node's children
+ path.evals = paste(path, "oml:evaluation", sep = "/")
+ ns.evals = getNodeSet(doc, path.evals)
+
+ evals = setDF(rbindlist(lapply(ns.evals, function(node) {
+ children = xmlChildren(node)
+ row = list( # unnamed here; names are assigned below once repeat/fold are appended
+ as.integer(xmlValue(children[["did"]])),
+ xmlValue(children[["name"]]),
+ xmlValue(children[["flow_id"]]),
+ xmlValue(children[["label"]]),
+ as.numeric(xmlValue(children[["value"]])),
+ as.numeric(xmlValue(children[["stdev"]])),
+ xmlValue(children[["array_data"]]),
+ as.integer(xmlValue(children[["sample_size"]]))
+ )
+ cv.info = xmlAttrs(node)[c("repeat", "fold")] # repeat/fold come from the node's XML attributes, not its children
+ if (is.null(cv.info)) cv.info = c(NA, NA) # fallback when the attribute lookup yields NULL
+ row = c(row, cv.info)
+ names(row) = c("data.id", "name", "flow_id", "label", "value", "stdev", "array.data", "sample.size", "repeat", "fold")
+ row
+ }), fill = TRUE)) # bind the per-node rows with rbindlist, then convert the result with setDF
+ makeOMLIOData(datasets = datasets, files = files, evaluations = evals) # last expression: this call's result is the return value
+}
diff --git a/man/downloadOMLObject.Rd b/man/downloadOMLObject.Rd
diff --git a/man/getOMLRun.Rd b/man/getOMLRun.Rd