I have used mlr and batchtools to benchmark OpenML datasets with two learners, rpart and logistic regression.
rm(list = ls())
library(parallel)      # for detectCores()
library(mlr)
library(batchtools)
library(OpenML)
library(randomForest)
library(rpart)         # "classif.rpart" is an mlr learner id, not a package; the package is rpart
detectCores(all.tests = FALSE, logical = TRUE)
saveOMLConfig(apikey = "12869a02b52f8dff259ff4f34a3dcbc6", arff.reader = "RWeka", overwrite = TRUE)
setBatchtoolsExperiment = function(seed = 1, ncpus = 3, clas_used,
                                   nameExperiment = "Data/Results/Batchtools/batchtool_experiment") {
  # which subset of OpenML datasets to benchmark
  omldatasets = clas_used$data.id

  # remove any previous registry and create a fresh one
  unlink(nameExperiment, recursive = TRUE)
  regis = makeExperimentRegistry(nameExperiment, seed = seed,
                                 packages = c("mlr", "OpenML", "methods"),
                                 #source = paste0(dir, "/benchmark_defs.R"),
                                 #conf.file = "Data/Results/Batchtools/.batchtools.conf.R",
                                 work.dir = "Data/Results/Batchtools")
  regis$cluster.functions = makeClusterFunctionsSocket(ncpus = ncpus)

  # add the selected OML datasets as problems
  for (did in omldatasets) {
    data = list(did = did)
    addProblem(name = as.character(did), data = data)
  }
  # add one generic algorithm that benchmarks logistic regression and rpart on each dataset
  addAlgorithm("eval", fun = function(job, data, instance, ...) {
    par.vals = list(...)
    # tryCatch({
    # fetch the dataset from OpenML and make sure a target is set
    omldataset = getOMLDataSet(data$did)
    if (identical(omldataset$target.features, character(0))) {
      omldataset$target.features = "Class"
      omldataset$desc$default.target.attribute = "Class"
    }
    task = convertOMLDataSetToMlr(omldataset)

    # learners
    lrn.classif.lr = makeLearner("classif.logreg", predict.type = "prob", fix.factors.prediction = TRUE)
    lrn.classif.rpart = makeLearner("classif.rpart", predict.type = "prob", fix.factors.prediction = TRUE)
    lrn.list = list(lrn.classif.lr, lrn.classif.rpart)

    # measures and resampling
    measures = list(acc, brier, auc, timetrain, fpr, fnr)
    rdesc = makeResampleDesc("RepCV", folds = 5, reps = 10, stratify = TRUE)
    configureMlr(on.learner.error = "warn", show.learner.output = TRUE)
    bmr = benchmark(lrn.list, task, rdesc, measures, keep.pred = FALSE, models = FALSE, show.info = TRUE)
    bmr
    #}, error = function(e) return(paste0("The variable '", data$did, "'",
    #                                     " caused the error: '", e, "'")))
  })
  # finalize the experiment: one design row, so each problem gets one job
  # set.seed(1)
  ades = data.frame(c(1))
  addExperiments(algo.designs = list(eval = ades))
  summarizeExperiments()
  getStatus()
}
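For completeness, this is roughly how I run the experiment afterwards (simplified sketch, not my exact script: clas_used is a data frame of OpenML datasets I filtered beforehand, and the listOMLDataSets() call below is only a placeholder for that filtering; I retrieve the registry via getDefaultRegistry() since the one created inside the function becomes the batchtools default):

# placeholder for my own dataset selection; in my script clas_used is filtered further
clas_used = listOMLDataSets(limit = 10)

setBatchtoolsExperiment(seed = 1, ncpus = 3, clas_used = clas_used)
regis = getDefaultRegistry()   # the registry created inside the function
submitJobs(reg = regis)        # run all jobs
waitForJobs(reg = regis)       # block until they finish
getStatus(reg = regis)         # check how many succeeded / failed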
This resulted in regis. I saw that in order to generate a ROC curve for my predictions I have to call generateThreshVsPerfData() and plotROCCurves() on the benchmark result (a BenchmarkResult object). generateThreshVsPerfData() accepts either a benchmark result (i.e. bmr) or a prediction. Because I cannot retrieve bmr directly from the regis object, I used reduceResultsList() and passed its output to generateThreshVsPerfData().
#retrieve benchmark result
result = reduceResultsList(ids = c(c(1:284), c(286:318)), reg = regis, missing.val = NA)
Reducing [===================================================================================================>] 100% eta: 0s>
> df = generateThreshVsPerfData(result, measures = list(fpr, tpr, mmce))
Error in generateThreshVsPerfData.list(result, measures = list(fpr, tpr, :
Assertion on 'obj' failed: May only contain the following types: Prediction,ResampleResult.
#method 2
> # Extract predictions
> preds = getBMRPredictions(result, drop = TRUE)
Error in getBMRPredictions(result, drop = TRUE) :
Assertion on 'bmr' failed: Must inherit from class 'BenchmarkResult', but has class 'list'.
>
> # Change the class attribute
> preds2 = lapply(preds, function(x) {class(x) = "Prediction"; return(x)})
Error in lapply(preds, function(x) { : object 'preds' not found
>
> # Draw ROC curves
> df = generateThreshVsPerfData(preds2, measures = list(fpr, tpr, mmce))
Error in generateThreshVsPerfData(preds2, measures = list(fpr, tpr, mmce)) :
object 'preds2' not found
> plotROCCurves(df)
Error in plotROCCurves(df) :
Assertion on 'obj' failed: Must inherit from class 'ThreshVsPerfData', but has class 'function'.
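The only other idea I have is to call generateThreshVsPerfData() on each element of result separately, since each job returns its own BenchmarkResult. This is an untested sketch, and I suspect it cannot work anyway because I ran benchmark() with keep.pred = FALSE, so the predictions were probably never stored:

# untested: treat each list element as its own BenchmarkResult
# (presumably only possible if benchmark() had been run with keep.pred = TRUE)
df.list = lapply(result, function(bmr) {
  generateThreshVsPerfData(bmr, measures = list(fpr, tpr, mmce))
})
plotROCCurves(df.list[[1]])   # ROC curves for the first dataset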
Can anyone recommend how to generate ROC curves for a benchmarked result? I have already run the benchmark, but the result is a regis object, which I cannot use with generateThreshVsPerfData(), so I need the prediction results instead. To obtain predictions, could I train the same models on the same datasets again after benchmarking and call predict() on them, roughly as sketched below? Is that advisable?
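This is only a sketch of what I mean, for a single dataset; did stands for one of the OpenML data ids I benchmarked, and the 2/3 holdout split is arbitrary:

# re-train on one dataset and predict on a holdout, purely to get a Prediction object
omldataset = getOMLDataSet(did)
task = convertOMLDataSetToMlr(omldataset)
lrn = makeLearner("classif.logreg", predict.type = "prob", fix.factors.prediction = TRUE)

n = getTaskSize(task)
train.set = sample(n, size = round(2/3 * n))
test.set = setdiff(seq_len(n), train.set)

mod = train(lrn, task, subset = train.set)
pred = predict(mod, task = task, subset = test.set)

df = generateThreshVsPerfData(pred, measures = list(fpr, tpr, mmce))
plotROCCurves(df)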