passi preliminari

library(ROSE)

## Loaded ROSE 0.0-4

library(caTools)
library(rpart)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

#1. Un esempio con dati simulati Tratto dal package ROSE

Dati all’interno di una semisfera svuotata (hacide)

Si tratta di dati simulati che possono essere usati per illustrare l’effetto delle varie procedure di sovracampionamento della calsse rara, sottocampionamento della classe prevalente e generazione di dati sintetici.

I dati sono relativi a due classi rappresenatbili la prima come una sfera e la seconda come una semisfera svuotata che è collocata attorno alla prima.

Diamoci un’occhiata.

data(hacide)
table(hacide.train$cls)

## 
##   0   1 
## 980  20

plot(hacide.train$x1,hacide.train$x2,col=hacide.train$cls, pch=20)

Come primo passo proviamo a utilizzare un albero per prevedere la classe usando le due variabili continue. I dati sono già suddivisi in training set e test set.

Utilizziamo alcune funzioni di ROSE per valutare la qualità del classificatore.

Prima vediamo come funziona il calssificatore con dati sbilanciati.

treeimb <- rpart(cls ~ ., data = hacide.train)
pred.treeimb <- predict(treeimb, newdata = hacide.test)
accuracy.meas(hacide.test$cls, pred.treeimb[,2])

## 
## Call: 
## accuracy.meas(response = hacide.test$cls, predicted = pred.treeimb[, 
##     2])
## 
## Examples are labelled as positive when predicted is greater than 0.5 
## 
## precision: 1.000
## recall: 0.200
## F: 0.167

roc.curve(hacide.test$cls, pred.treeimb[,2], plotit = FALSE)

## Area under the curve (AUC): 0.600

Sovracampionamentoe sottocampionamento

Proviamo ora a vedere cos?samplea accade se bilanciamo il campione sovracampionando la classe rara o sottocampionando la prevalente

sovracampionamento (bootstrap)

data.bal.ov <- ovun.sample(cls ~ ., data = hacide.train, method = "over",
                           p = 0.5, seed = 1)$data
table(data.bal.ov$cls)

## 
##   0   1 
## 980 941

###undersampling (selezione casuale)

data.bal.un <- ovun.sample(cls ~ ., data = hacide.train, method = "under",
                           p = 0.5, seed = 1)$data
table(data.bal.un$cls)

## 
##  0  1 
## 19 20

plot(data.bal.un$x1,data.bal.un$x2,col=data.bal.un$cls, pch=20)

#entrambe le cose

data.bal.ou <- ovun.sample(cls ~ ., data = hacide.train, method = "both",
                           N = 1000, p = 0.5, seed = 1)$data
table(data.bal.ou$cls)

## 
##   0   1 
## 520 480

Creare dati sintetici con ROSE

data.rose <- ROSE(cls ~ ., data = hacide.train, seed = 1)$data
table(data.rose$cls)

## 
##   0   1 
## 520 480

plot(data.rose$x1,data.rose$x2,col=data.rose$cls, pch=20)

Alberi di classificazione e previsione

tree.rose <- rpart(cls ~ ., data = data.rose)
tree.ov <- rpart(cls ~ ., data = data.bal.ov)
tree.un <- rpart(cls ~ ., data = data.bal.un)
tree.ou <- rpart(cls ~ ., data = data.bal.ou)

pred.tree.rose <- predict(tree.rose, newdata = hacide.test)
pred.tree.ov <- predict(tree.ov, newdata = hacide.test)
pred.tree.un <- predict(tree.un, newdata = hacide.test)
pred.tree.ou <- predict(tree.un, newdata = hacide.test)

roc.curve(hacide.test$cls, pred.tree.rose[,2])

## Area under the curve (AUC): 0.993

roc.curve(hacide.test$cls, pred.tree.ov[,2], add.roc = TRUE, col = 2, lty = 2)

## Area under the curve (AUC): 0.798

roc.curve(hacide.test$cls, pred.tree.un[,2], add.roc = TRUE, col = 3, lty = 3)

## Area under the curve (AUC): 0.833

roc.curve(hacide.test$cls, pred.tree.ou[,2], add.roc = TRUE, col = 4, lty = 4)

## Area under the curve (AUC): 0.833

Ora proviamo con SMOTE

In R esistono vari pacchetti (ad esempio nella vecchia versione del pacchetto DMwR o nel pacchetto smotefamily)

library(smotefamily)
data.smote<-SMOTE(X=hacide.train[,-1],target=hacide.train[,1])
data.sm<-data.smote$data
plot(data.sm$x1,data.sm$x2,col=(as.numeric(data.sm$class)+1), pch=20)

tree.smote <- rpart(class ~ ., data = data.sm)
pred.tree.smote <- predict(tree.smote, newdata = hacide.test)
roc.curve(hacide.test$cls, pred.tree.smote[,2])

## Area under the curve (AUC): 0.970

Dati sbilanciati

Nicola Torelli