# Churn classification demo: tree (rpart), pruning, ROC (ROCR), caret,
# naive Bayes and k-NN (e1071) on the Kaggle "Churn_Modelling" data set.
library(caret)

# Read the data
churn <- read.csv("Churn_Modelling.csv", header = TRUE)

# Keep only the variables of interest (drop row id / customer id / surname)
churn <- churn[, 4:14]

# Declare the qualitative variables as factors
churn$Geography      <- factor(churn$Geography)
churn$Gender         <- factor(churn$Gender)
churn$HasCrCard      <- factor(churn$HasCrCard)
churn$IsActiveMember <- factor(churn$IsActiveMember)
churn$Exited         <- factor(churn$Exited)
levels(churn$Exited)         <- c("no", "yes")
levels(churn$IsActiveMember) <- c("no", "yes")
levels(churn$HasCrCard)      <- c("no", "yes")

# Create a stratified train/test partition (with caret)
set.seed(1234)
trainID <- createDataPartition(churn$Exited, p = 0.75, list = FALSE)
testID  <- setdiff(seq_along(churn$Exited), trainID)
train   <- churn[trainID, ]
test    <- churn[testID, ]

# Fit a classification tree (with rpart)
library(rpart)
alb <- rpart(Exited ~ Geography + Gender + HasCrCard + Age + Tenure +
               Balance + NumOfProducts + EstimatedSalary,
             data = train, method = "class")
alb
plot(alb)
text(alb, cex = .3)
# NOTE(fix): the original called plot.tree(alb), which does not exist and
# stopped the script; the nicer plot is produced by rpart.plot::prp below.
library(rpart.plot)
prp(alb)

# Classification tree with explicit complexity control (in rpart)
alb1 <- rpart(Exited ~ Geography + Gender + HasCrCard + Age + Tenure +
                Balance + NumOfProducts + EstimatedSalary,
              data = train, method = "class",
              control = rpart.control(cp = 0.001))
alb1
plot(alb1)
text(alb1)
plotcp(alb1)

# Prune the tree at the cp value suggested by the plotcp curve
alb2 <- prune(alb1, cp = 0.0046)
plot(alb2)
text(alb2, cex = 0.5)

# Evaluate performance
# Output as a vector of predicted classes (1 = "no", 2 = "yes")
pred <- predict(alb, test, type = "vector")
pred <- factor(pred)
levels(pred) <- c("pno", "pyes")
table(pred, test$Exited)

# Output as class probabilities; column 1 is P("no"), column 2 is P("yes").
# Predict "yes" whenever P("no") < 0.75, i.e. a 0.25 threshold on P("yes").
pred1 <- predict(alb, test, type = "prob")
predclass <- pred1[, 1] < 0.75
predclass <- factor(predclass)
levels(predclass) <- c("pno", "pyes")
table(predclass, test$Exited)

# Use ROCR to obtain the ROC curve.
# NOTE(fix): the original scored with P("no") and swapped the axes
# (measure = "fpr", x.measure = "tpr", plus a stray trailing comma that is a
# syntax error). ROCR takes the second factor level ("yes") as the positive
# class, so the score must be P("yes") and the ROC is TPR (y) vs FPR (x);
# the AUC below is now the standard (non-inverted) value.
library(ROCR)
previsto <- prediction(pred1[, 2], test$Exited)
perf <- performance(previsto, measure = "tpr", x.measure = "fpr")
plot(perf, col = 3)
abline(0, 1)
auc <- performance(previsto, "auc")
auc@y.values

# An example of the caret interface (cross-validated cp selection)
mod_albero <- train(Exited ~ ., data = train, method = "rpart")
pred_alb <- predict(mod_albero, test, type = "prob")[, 2]  # P("yes")
mod_albero$results
# NOTE(fix): the original assigned levels() to a logical vector, which does
# not relabel it; convert to a factor first, as the other sections do.
predcaret <- factor(pred_alb > 0.5)
levels(predcaret) <- c("pno", "pyes")
table(predcaret, test$Exited)

# Try a naive Bayes model (requires the e1071 package)
library(e1071)
nbclass <- naiveBayes(Exited ~ Geography + Gender + HasCrCard + Age +
                        Tenure + Balance + NumOfProducts + EstimatedSalary,
                      data = train)
nbpred <- predict(nbclass, test, type = c("raw"))
prednb <- nbpred[, 1] < 0.75   # same 0.25-on-P("yes") rule as above
prednb <- factor(prednb)
levels(prednb) <- c("pno", "pyes")
table(prednb, test$Exited)

# And with k-NN (e1071::gknn), k = 15, predictors rescaled
KNNclass <- gknn(Exited ~ Geography + Gender + HasCrCard + Age + Tenure +
                   Balance + NumOfProducts + EstimatedSalary,
                 data = train, k = 15, scale = TRUE)
predKNN <- predict(KNNclass, test, type = "prob")
KNNclass <- predKNN[, 1] < 0.5  # predict "yes" when P("no") < 0.5
KNNclass <- factor(KNNclass)
levels(KNNclass) <- c("pno", "pyes")
table(KNNclass, test$Exited)