fixes problem with feature importance caused by svm

christophM · May 22, 2018 · 8af7a3f · 8af7a3f
1 parent a241056
commit 8af7a3f
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 9 deletions.
diff --git a/R/get-cervical-cancer-dataset.R b/R/get-cervical-cancer-dataset.R
@@ -22,6 +22,7 @@ get.cervical.data = function(data_dir){
 
  cervical_impute = mlr::impute(cervical, classes = list(numeric = imputeMode()))
  cervical = cervical_impute$data
+ cervical = relevel(cervical, "Healthy")
  cervical
 }
 

diff --git a/manuscript/05.5-agnostic-permfeatimp.Rmd b/manuscript/05.5-agnostic-permfeatimp.Rmd
@@ -40,19 +40,19 @@ We show examples for classification and regression.
 
 **Cervical cancer (Classification)**
 
-We fit a support vector machine model to predict [cervical cancer](#cervical).
+We fit a random forest model to predict [cervical cancer](#cervical).
 We measure the error increase by: $1-AUC$ (one minus the area under the ROC curve).
 Features that are associated with a model error increase by a factor of 1 (= no change) were not important for predicting cervical cancer.
 
-```{r importance-cervical, fig.cap = "The importance for each of the features in predicting cervical cancer with a support vector machine model. The importance is the factor by which the error is increased compared to the original model error."}
+```{r importance-cervical, fig.cap = "The importance for each of the features in predicting cervical cancer with a random forest. The importance is the factor by which the error is increased compared to the original model error."}
 library('mlr')
 library('iml')
 data("cervical")
-task = makeClassifTask(data = cervical, target = "Biopsy")
-learner = makeLearner('classif.svm', predict.type = 'prob')
+task = makeClassifTask(data = cervical, target = "Biopsy", positive = "Cancer")
+learner = makeLearner('classif.randomForest', predict.type = 'prob')
 mod = mlr::train(learner, task)
-predictor = Predictor$new(mod, data = cervical[-which(names(cervical) == "Biopsy")], y = (cervical$Biopsy == "Cancer"), class = 1)
-auc_error = function(actual, predicted) 1 - Metrics::auc(actual, predicted )
+predictor = Predictor$new(mod, data = cervical[-which(names(cervical) == "Biopsy")], y = (cervical$Biopsy == "Cancer"), class = "Cancer")
+auc_error = function(actual, predicted) 1 - Metrics::auc(actual, predicted)
 importance = FeatureImp$new(predictor, loss = auc_error)
 imp.dat = importance$results[c("feature", "permutation.error", "importance")]
 plot(importance)
@@ -63,13 +63,13 @@ The feature with the highest importance was `r imp.dat[1, '..feature']` associat
 
 **Bike rentals (Regression)**
 
-We fit a random forest model to predict [bike rentals](#bike-data), given weather conditions and calendric information.
+We fit a support vector machine model to predict [bike rentals](#bike-data), given weather conditions and calendar information.
 As error measurement we use the mean absolute error.
 
-```{r importance-bike, fig.cap = "The importance for each of the features in predicting bike rentals with a random forest."}
+```{r importance-bike, fig.cap = "The importance for each of the features in predicting bike rentals with a support vector machine."}
 data("bike")
 task = makeRegrTask(data = bike, target = "cnt")
-learner = makeLearner('regr.randomForest')
+learner = makeLearner('regr.svm')
 mod = mlr::train(learner, task)
 predictor = Predictor$new(mod, data = bike[-which(names(bike) == "cnt")], y = bike$cnt)
 importance = FeatureImp$new(predictor, loss = 'mae')