# Lab 5 -- Insurance data: regression trees on the Term Life data set

## Import data ----
TL <- read.csv("TL.csv", header = TRUE, sep = ",", row.names = 1)
dim(TL)

# Log-transform the response (FACE) and INCOME, then drop the raw
# columns (5, 6) from the working data frame.
LFACE <- log(TL$FACE)
LINCOME <- log(TL$INCOME)
TL2 <- TL[, -c(5, 6)]
# Direct assignment: the original `with(TL2, LINCOME)` only worked because
# LINCOME is a global vector, not a column of TL2 -- assign it explicitly.
TL2$LINCOME <- LINCOME
TL2$LFACE <- LFACE

library(tree)

## Create training and test sets ----
set.seed(2)
train <- sample(seq_len(nrow(TL2)), nrow(TL2) / 2)

## Fit a regression tree ----
tree.tl <- tree(LFACE ~ ., TL2, subset = train)
summary(tree.tl)
plot(tree.tl)
text(tree.tl, pretty = 0)

## Prune the tree ----
cv.tl <- cv.tree(tree.tl)
plot(cv.tl$size, cv.tl$dev, type = "b")
prune.tl <- prune.tree(tree.tl, best = 3)
plot(prune.tl)
text(prune.tl, pretty = 0)

## Predictions on the test set ----
yhat <- predict(tree.tl, newdata = TL2[-train, ])
tl.test <- TL2[-train, "LFACE"]
plot(yhat, tl.test)
abline(0, 1)
mean((yhat - tl.test)^2)  # test MSE of the unpruned tree

## Train/test MSE vs tree size (similar to Figure 8.5 of ISL) ----
sizes <- 2:8
n.test <- nrow(TL2) - length(train)
# Preallocate; column j holds the predictions for the size-j tree
# (column 1 stays unused, matching the original indexing by j).
yhat.tr <- matrix(NA, length(train), max(sizes))
yhat.test <- matrix(NA, n.test, max(sizes))
MSE.tr <- rep(NA_real_, max(sizes))
MSE.test <- rep(NA_real_, max(sizes))
for (j in sizes) {
  # Prune to j terminal nodes, then refit a tree using only the
  # variables that the pruned tree actually used.
  tree.sized <- prune.tree(tree.tl, best = j)
  a <- summary(tree.sized)
  used.vars <- as.character(a$used)
  # Keep the response in the refit data so the formula does not silently
  # resolve LFACE from the global environment; drop = FALSE keeps a data
  # frame even when only one predictor was used.
  newTL2 <- TL2[, c("LFACE", used.vars), drop = FALSE]
  tree.chosen <- tree(LFACE ~ ., newTL2, subset = train)
  yhat.tr[, j] <- predict(tree.chosen, newdata = TL2[train, ])
  yhat.test[, j] <- predict(tree.chosen, newdata = TL2[-train, ])
  MSE.tr[j] <- mean((TL2$LFACE[train] - yhat.tr[, j])^2)
  MSE.test[j] <- mean((TL2$LFACE[-train] - yhat.test[, j])^2)
}
plot(sizes, MSE.tr[sizes], type = "b", xlab = "Tree size",
     col = "black", ylim = c(1, 4))
lines(sizes, MSE.test[sizes], type = "b", col = "orange")

## For the assignment ----
library(ISLR2)
library(dplyr)
newHitters <- Hitters %>% filter(!is.na(Salary))
newHitters$Salary <- log(newHitters$Salary)
# NOTE(review): data(Hitters) reloads the original data set into `Hitters`;
# it does not affect newHitters. Kept for compatibility with the original.
data(Hitters)