I am banging my head against the wall with this one. I have made predictions from a decision tree, my predictor variables are exactly the same dimensions in both the prediction object and my training data dataset, yet when I try to construct a Confusion Matrix I get the warning "Error in [.default
(data, , pos) : subscript out of bounds". I cannot seem to be able to work it out.
set.seed(123)
sample = sample.split(df_clean, SplitRatio = .75)
train1 = subset(df_clean, sample == TRUE)
test1 = subset(df_clean, sample == FALSE)
dim(train1)
dim(test1)
#training DT
set.seed(456)
dt <- rpart(my_label ~ activePower+activePowerDelta+reactivePower+voltage+
phase+transient8+transient10+harmonicDelta1+harmonicDelta2+
harmonicDelta8, data=train1, method = "class")
predictions_dt <- predict(dt, test1, type = "class")
confusionMatrix(predictions_dt, test1$my_label)
Both predictions_dt and test one are in the same format of 24,020 entries - factors with the same number of levels - eg. "+fridge, +fridge+microwave, +fridge+oven"
Thanks for your help!
adding output from checking levels:
str(df_clean$my_label)
df_clean$my_label <- as.factor(df_clean$my_label)
levels(df_clean$my_label)
levels(df_clean$my_label)[1]
levels(df_clean$my_label)
[1] """+fridge"
[3] "+fridge+kettle""+fridge+kettle+microwave"
[5] "+fridge+kettle+tumble_dryer+washer_dryer""+fridge+kettle+tumble_dryer+washer_dryer+microwave"
[7] "+fridge+kettle+washer_dryer""+fridge+kettle+washing_machine+washer_dryer"
[9] "+fridge+microwave""+fridge+shower"
[11] "+fridge+shower+kettle""+fridge+shower+tumble_dryer+washer_dryer"
[13] "+fridge+shower+washer_dryer""+fridge+shower+washing_machine+washer_dryer"
[15] "+fridge+tumble_dryer+washer_dryer""+fridge+tumble_dryer+washer_dryer+microwave"
[17] "+fridge+vacuum""+fridge+vacuum+tumble_dryer+washer_dryer"
[19] "+fridge+vacuum+washer_dryer""+fridge+vacuum+washing_machine+washer_dryer"
[21] "+fridge+washer_dryer""+fridge+washer_dryer+microwave"
[23] "+fridge+washing_machine+washer_dryer""+fridge+washing_machine+washer_dryer+microwave"
[25] "+kettle""+shower"
[27] "+tumble_dryer+washer_dryer""+washer_dryer"
Adding output from dput(head(df_clean))
dput(head(df_clean))
structure(list(id = c(74589930L, 74589012L, 74588101L, 74587582L,
74587236L, 74586372L), type = c(5L, 5L, 1L, 2L, 5L, 5L), activePower = c(78L,
80L, 77L, 43L, 143L, 146L), activePowerDelta = c(-2L, 1L, 32L,
-100L, -3L, -7L), reactivePower = c(-38L, -38L, -37L, -22L, 143L,
142L), voltage = c(223.389, 224.258, 225.127, 224.258, 223.389,
223.389), phase = c(-25.6, -25.3, -25.6, -27, 44.6, 43.9), transient7 = c(0.567,
0.562, 0.584, 0.282, 0.914, 0.924), transient9 = c(0.567, 0.562,
0.57, 0.29, 0.914, 0.924), transient10 = c(0.567, 0.562, 0.572,
0.282, 0.914, 0.924), harmonicDelta1 = c(90L, 21L, 235L, 1183L,
82L, 128L), harmonicDelta7 = c(127L, 64L, 77L, 14L, 39L, 36L),
harmonicDelta9 = c(148L, 85L, 62L, 4L, 41L, 42L), timestamp = c("2018-01-21 23:58:08+00:00",
"2018-01-21 23:55:28+00:00", "2018-01-21 23:52:46+00:00",
"2018-01-21 23:51:03+00:00", "2018-01-21 23:49:59+00:00",
"2018-01-21 23:47:19+00:00"), my_label = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "+fridge", "+fridge+kettle",
"+fridge+kettle+microwave", "+fridge+kettle+tumble_dryer+washer_dryer",
"+fridge+kettle+tumble_dryer+washer_dryer+microwave", "+fridge+kettle+washer_dryer",
"+fridge+kettle+washing_machine+washer_dryer", "+fridge+microwave",
"+fridge+shower", "+fridge+shower+kettle", "+fridge+shower+tumble_dryer+washer_dryer",
"+fridge+shower+washer_dryer", "+fridge+shower+washing_machine+washer_dryer",
"+fridge+tumble_dryer+washer_dryer", "+fridge+tumble_dryer+washer_dryer+microwave",
"+fridge+vacuum", "+fridge+vacuum+tumble_dryer+washer_dryer",
"+fridge+vacuum+washer_dryer", "+fridge+vacuum+washing_machine+washer_dryer",
"+fridge+washer_dryer", "+fridge+washer_dryer+microwave",
"+fridge+washing_machine+washer_dryer", "+fridge+washing_machine+washer_dryer+microwave",
"+kettle", "+shower", "+tumble_dryer+washer_dryer", "+washer_dryer"
), class = "factor")), row.names = c(NA, 6L), class = "data.frame")