Skip to content

Commit

Permalink
add an option so that only the xml is downloaded (#347)
Browse files Browse the repository at this point in the history
* add an option so that only the xml is downloaded

* redownload run if prediction not available
  • Loading branch information
giuseppec committed Jun 9, 2017
1 parent 645761d commit 0708f97
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 71 deletions.
4 changes: 4 additions & 0 deletions R/convertOMLRunToBMR.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ convertOMLRunToBMR = function(run, measures = run$task.evaluation.measure, recom
task.desc = getTaskDesc(task$mlr.task)

pred = run$predictions
if (is.null(pred)) {
run = getOMLRun(run$run.id, only.xml = FALSE) # download run again
pred = run$predictions
}
if (min(pred$fold) == 0)
pred$fold = (pred$fold + 1)
if (min(pred[,"repeat"]) == 0)
Expand Down
36 changes: 20 additions & 16 deletions R/downloadOMLObject.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
#' Should files that are already in cache be overwritten?
#' Default is \code{FALSE}.
#' @template arg_cache_only
#' @param only.xml [\code{logical(1)}]\cr
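#' Should only the XML description be downloaded, i.e., should the download
#' of the associated files be skipped? Default is \code{FALSE}.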
#' @template arg_verbosity
#' @keywords internal
#' @return [list]
downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), overwrite = FALSE, cache.only = FALSE, verbosity = NULL) {
downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), overwrite = FALSE, cache.only = FALSE, only.xml = FALSE, verbosity = NULL) {
id = asCount(id)
assertChoice(object, choices = c("data", "task", "flow", "run"))

Expand Down Expand Up @@ -57,7 +59,6 @@ downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), over
# stop(doc)
# }

## now download files
# get url of files
if (object == "data") {
url = xmlRValS(doc, "/oml:data_set_description/oml:url")
Expand Down Expand Up @@ -96,22 +97,25 @@ downloadOMLObject = function(id, object = c("data", "task", "flow", "run"), over
url = url[url != ""]
}

# download files if there is an url
if (!is.null(url) & length(url) != 0) {
if (f[[file.ind]]$found & !overwrite) {
showInfo(verbosity, sprintf("%s '%i' file '%s' found in cache.", cap.obj, id, basename(f[[file.ind]]$path)))
} else {
url = stri_trim_both(url)
showInfo(verbosity, "Downloading from '%s' to '%s'", url, f[[file.ind]]$path)
resp = GET(url)
content.resp = content(resp, as = "raw")
if (is.vector(content.resp))
writeBin(content.resp, f[[file.ind]]$path) else
warningf("File not found at '%s'.", url)
# set found = TRUE if downloaded file is in cache
if (file.exists(f[[file.ind]]$path)) f[[file.ind]]$found = TRUE
if (!only.xml) {
# download files if there is an url
if (!is.null(url) & length(url) != 0) {
if (f[[file.ind]]$found & !overwrite) {
showInfo(verbosity, sprintf("%s '%i' file '%s' found in cache.", cap.obj, id, basename(f[[file.ind]]$path)))
} else {
url = stri_trim_both(url)
showInfo(verbosity, "Downloading from '%s' to '%s'", url, f[[file.ind]]$path)
resp = GET(url)
content.resp = content(resp, as = "raw")
if (is.vector(content.resp))
writeBin(content.resp, f[[file.ind]]$path) else
warningf("File not found at '%s'.", url)
# set found = TRUE if downloaded file is in cache
if (file.exists(f[[file.ind]]$path)) f[[file.ind]]$found = TRUE
}
}
}

return(list(doc = doc, files = f))
}

Expand Down
108 changes: 55 additions & 53 deletions R/getOMLRun.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,68 +7,22 @@
#' @param run.id [\code{integer(1)}]\cr
#' The run ID.
#' @template arg_cache_only
#' @param only.xml [\code{logical(1)}]\cr
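#' Should only the XML description of the run be downloaded, i.e., should the
#' download of the associated files be skipped? Default is \code{FALSE}.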
#' @template arg_verbosity
#' @return [\code{\link{OMLRun}}].
#' @family downloading functions
#' @family run-related functions
#' @example inst/examples/getOMLRun.R
#' @export
getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
getOMLRun = function(run.id, cache.only = FALSE, only.xml = FALSE, verbosity = NULL) {
id = asCount(run.id)
assertFlag(cache.only)

down = downloadOMLObject(id, object = "run", cache.only = cache.only, verbosity = verbosity)
down = downloadOMLObject(id, object = "run", cache.only = cache.only, only.xml = only.xml, verbosity = verbosity)
f = down$files
doc = down$doc

parseData = function(path) {
# parse datasets
path.ds = paste(path, "oml:dataset", sep ="/")
ns.datasets = getNodeSet(doc, path.ds)
datasets = lapply(seq_along(ns.datasets), function(i) {
list(
data.id = xmlRValR(doc, paste(path.ds, "[", i, "]/oml:did", sep = '')),
name = xmlRValS(doc, paste(path.ds, "[", i, "]/oml:name", sep = '')),
url = xmlRValS(doc, paste(path.ds, "[", i, "]/oml:url", sep = ''))
)})
datasets = convertListOfRowsToDataFrame(datasets, strings.as.factors = FALSE)

# parse files
path.fls = paste(path, "oml:file", sep ="/")
ns.fls = getNodeSet(doc, path.fls)
files = lapply(seq_along(ns.fls), function(i) {
list(
data.id = xmlRValR(doc, paste(path.fls, "[", i, "]/oml:did", sep='')),
name = xmlRValS(doc, paste(path.fls, "[", i, "]/oml:name", sep='')),
url = xmlRValS(doc, paste(path.fls, "[", i, "]/oml:url", sep=''))
)})
files = convertListOfRowsToDataFrame(files, strings.as.factors = FALSE)

# parse evaluations
path.evals = paste(path, "oml:evaluation", sep ="/")
ns.evals = getNodeSet(doc, path.evals)

evals = setDF(rbindlist(lapply(ns.evals, function(node) {
children = xmlChildren(node)
row = list(
as.integer(xmlValue(children[["did"]])),
xmlValue(children[["name"]]),
xmlValue(children[["flow_id"]]),
xmlValue(children[["label"]]),
as.numeric(xmlValue(children[["value"]])),
as.numeric(xmlValue(children[["stdev"]])),
xmlValue(children[["array_data"]]),
as.integer(xmlValue(children[["sample_size"]]))
)
cv.info = xmlAttrs(node)[c("repeat", "fold")]
if (is.null(cv.info)) cv.info = c(NA, NA)
row = c(row, cv.info)
names(row) = c("data.id", "name", "flow_id", "label", "value", "stdev", "array.data", "sample.size", "repeat", "fold")
row
}), fill = TRUE))
makeOMLIOData(datasets = datasets, files = files, evaluations = evals)
}

run.args = filterNull(list(
run.id = xmlREValI(doc, "/oml:run/oml:run_id"),
uploader = xmlREValI(doc, "/oml:run/oml:uploader"),
Expand All @@ -82,8 +36,8 @@ getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
setup.string = xmlOValS(doc, "/oml:run/oml:setup_string"),
error.message = xmlOValS(doc, "/oml:run/oml:error_message"),
tags = xmlOValsMultNsS(doc, "/oml:run/oml:tag"),
input.data = parseData("/oml:run/oml:input_data"),
output.data = parseData("/oml:run/oml:output_data"),
input.data = parseData(doc, "/oml:run/oml:input_data"),
output.data = parseData(doc, "/oml:run/oml:output_data"),
parameter.setting = list()
))

Expand All @@ -105,7 +59,7 @@ getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {
f = findCachedRun(run.args$run.id)

if (!f$predictions.arff$found) {
message("No URL found to retrieve predictions from.")
showInfo(verbosity, "No ARFF file containing the predictions found.")
pred = NULL
} else {
#showInfo(verbosity, "Predictions found in cache.")
Expand All @@ -115,3 +69,51 @@ getOMLRun = function(run.id, cache.only = FALSE, verbosity = NULL) {

return(do.call(makeOMLRun, run.args))
}

# Parse one data section of a run XML document.
#
# Collects the 'oml:dataset', 'oml:file' and 'oml:evaluation' nodes located
# directly below 'path' and hands them to makeOMLIOData().
#
# @param doc XML document that is queried via getNodeSet().
# @param path XPath of the section to parse, e.g. "/oml:run/oml:input_data".
# @return The value returned by makeOMLIOData().
parseData = function(doc, path) {
  # Dataset and file nodes are read the same way (did, name, url), so a single
  # local helper handles both; only the node name differs.
  parseEntries = function(node.name) {
    base = paste(path, node.name, sep = "/")
    nodes = getNodeSet(doc, base)
    rows = lapply(seq_along(nodes), function(k) {
      # address the k-th node of this kind by its position
      prefix = paste0(base, "[", k, "]")
      list(
        data.id = xmlRValR(doc, paste0(prefix, "/oml:did")),
        name = xmlRValS(doc, paste0(prefix, "/oml:name")),
        url = xmlRValS(doc, paste0(prefix, "/oml:url"))
      )
    })
    convertListOfRowsToDataFrame(rows, strings.as.factors = FALSE)
  }

  datasets = parseEntries("oml:dataset")
  files = parseEntries("oml:file")

  # Evaluations: one row per node, built from its child elements plus the
  # 'repeat' and 'fold' attributes of the node itself.
  eval.cols = c("data.id", "name", "flow_id", "label", "value", "stdev",
    "array.data", "sample.size", "repeat", "fold")
  eval.nodes = getNodeSet(doc, paste(path, "oml:evaluation", sep = "/"))

  eval.rows = lapply(eval.nodes, function(node) {
    kids = xmlChildren(node)
    fields = list(
      as.integer(xmlValue(kids[["did"]])),
      xmlValue(kids[["name"]]),
      xmlValue(kids[["flow_id"]]),
      xmlValue(kids[["label"]]),
      as.numeric(xmlValue(kids[["value"]])),
      as.numeric(xmlValue(kids[["stdev"]])),
      xmlValue(kids[["array_data"]]),
      as.integer(xmlValue(kids[["sample_size"]]))
    )
    # subsetting yields NULL when the node carries no attributes at all
    cv = xmlAttrs(node)[c("repeat", "fold")]
    if (is.null(cv)) {
      cv = c(NA, NA)
    }
    fields = c(fields, cv)
    names(fields) = eval.cols
    fields
  })
  # fill = TRUE lets rows with differing columns be bound together
  evals = setDF(rbindlist(eval.rows, fill = TRUE))

  makeOMLIOData(datasets = datasets, files = files, evaluations = evals)
}
6 changes: 5 additions & 1 deletion man/downloadOMLObject.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/getOMLRun.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 0708f97

Please sign in to comment.