## 1. Polynomial regression example with simulated data
set.seed(7355826)

# Create example data frame: y depends on x through a cubic term plus noise
x <- rnorm(200)
y <- rnorm(200) + 0.2 * x^3
data <- data.frame(x, y)
head(data)

# Estimate polynomial regression models of degree 4 and 3
my_mod <- lm(y ~ poly(x, 4), data = data)
summary(my_mod)

my_mod2 <- lm(y ~ poly(x, 3), data = data)
summary(my_mod2)

# Simple base-graphics plot with the fitted degree-4 curve;
# sort()/order() make lines() draw a single left-to-right path
plot(y ~ x, data)
lines(sort(data$x),
      fitted(my_mod)[order(data$x)],
      col = "red",
      type = "l")

# Same scatterplot with ggplot2
library("ggplot2")
ggp <- ggplot(data, aes(x, y)) +
  geom_point()
ggp

# Add polynomial regression curve (no confidence band)
ggp +
  stat_smooth(method = "lm", formula = y ~ poly(x, 4), se = FALSE)

# Regression curve & confidence band
ggp +
  stat_smooth(method = "lm", formula = y ~ poly(x, 4))
# stat_smooth(method = "lm",
#             formula = y ~ poly(x, 1),
#             color = "red") +
# stat_smooth(method = "lm",
#             formula = y ~ poly(x, 2),
#             color = "green")

## 2. Prostate data and coefficient visualization
library(glmnet)    # regularization (lasso / ridge)
# library(tidyverse)
library(dplyr)
library(arm)       # coefplot()
library(ggplot2)
library(GGally)    # exploratory analysis (ggpairs) alongside lm

prostate_data <- read.csv("prostate.csv", sep = ",", header = TRUE)  # load data
dim(prostate_data)
head(prostate_data)

# Continuous variables only, for the correlation analysis
vars_cont <- prostate_data %>%
  dplyr::select(lcavol, lweight, age, lbph, lcp, lpsa)

# Compute the correlation matrix (complete observations only)
cor_matrix <- cor(vars_cont, use = "complete.obs")

# Print the correlation matrix
print(cor_matrix)

# Visualize the correlation matrix
# Correlation plot to spot eventual nonlinearities
library(corrplot)
corrplot(cor_matrix, method = "circle")
pairs(vars_cont)

# More customized scatterplot matrix
ggpairs(vars_cont, title = "")
ggsave(file = "matrice_correlazione.pdf", width = 12, height = 10)

y <- prostate_data$lpsa       # response variable
train <- prostate_data$train  # train/test indicator column
# BUG FIX: the original used `test <- (-train)`, the ISLR idiom for *numeric
# index* vectors. Here `train` is a logical/0-1 indicator column, so numeric
# negation yields a -1/0 vector that, used as a row index, drops only row 1
# instead of selecting the test observations. Logical negation gives the
# intended complementary mask.
test <- !train
# Drop id, response and train-indicator columns, keeping only the predictors
# (assumes the usual prostate.csv layout with 11 columns — TODO confirm)
prostate_data <- prostate_data[, -c(1, 10, 11)]
dim(prostate_data)

# Lambda grid for glmnet (used later in the lasso section)
grid <- 10^seq(10, -2, length = 100)

plot(prostate_data$age, y)

# Try a linear model with all the predictors
mod <- lm(y ~ ., data = prostate_data)  # linear regression
summary(mod)
se <- summary(mod)$coefficients[, 2]    # standard errors of the coefficients

# Coefficient labels beta[0] ... beta[8] for the plot rendering.
# A single expression() call replaces nine copy-pasted element assignments
# and produces the same expression vector. (The original also built an
# unused, reverse-ordered `beta_names` character vector; removed.)
beta_names_expr <- expression(
  beta[0], beta[1], beta[2], beta[3], beta[4],
  beta[5], beta[6], beta[7], beta[8]
)

# Coefplot of the OLS parameter estimates +/- 2 standard errors
coef_plot.1 <- coefplot(as.numeric(as.vector(coef(mod))),
                        sds = as.numeric(as.vector(se)),
                        CI = 2,
                        varnames = (beta_names_expr))
coef_plot.1

# Polynomial regression
mod.ridotto <- lm(y ~ lcavol + lweight + age + lbph + svi,
                  data = prostate_data)
summary(mod.ridotto)
AIC(mod.ridotto)

mod.poly <- lm(y ~ poly(lcavol, 3) + lweight + age + lbph + svi,
               data = prostate_data)
summary(mod.poly)
AIC(mod.poly)

mod.poly2 <- lm(y ~ lcavol + poly(lweight, 3) + age + lbph + svi,
                data = prostate_data)
summary(mod.poly2)

# Nested-model F test: are the extra lcavol polynomial terms useful?
anova(mod.ridotto, mod.poly)

plot(prostate_data$lcavol, y)
x <- prostate_data$lcavol
data <- data.frame(x, y)

# Marginal polynomial fits of increasing degree on lcavol alone
mod.marg1 <- lm(y ~ poly(lcavol, 1), data = prostate_data)
mod.marg2 <- lm(y ~ poly(lcavol, 2), data = prostate_data)
mod.marg3 <- lm(y ~ poly(lcavol, 3), data = prostate_data)
mod.marg4 <- lm(y ~ poly(lcavol, 10), data = prostate_data)

lines(sort(x), fitted(mod.marg1)[order(x)], col = 2)
lines(sort(x), fitted(mod.marg2)[order(x)], col = 3)
lines(sort(x), fitted(mod.marg3)[order(x)], col = 4)
lines(sort(x), fitted(mod.marg4)[order(x)], col = 5)

anova(mod.marg1, mod.marg3)

# ggplot2 scatterplot with cubic regression curve & confidence band
ggp <- ggplot(data, aes(x, y)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ poly(x, 3))
ggp

# Step function:
# step regression (piecewise-constant fits via cut())
mod.step <- lm(y ~ cut(age, 4) + cut(lcavol, 4) + cut(lweight, 4) +
                 cut(lbph, 4) + cut(svi, 2),
               data = prostate_data)
summary(mod.step)
coef(summary(mod.step))

# Compare step models with an increasing number of bins on lcavol
mod.step.0 <- lm(y ~ cut(lcavol, 3), data = prostate_data)
mod.step.1 <- lm(y ~ cut(lcavol, 4), data = prostate_data)
mod.step.2 <- lm(y ~ cut(lcavol, 5), data = prostate_data)

plot(prostate_data$lcavol, y)
lines(sort(x), fitted(mod.step.1)[order(x)], col = 2)

anova(mod.step.0, mod.step.1, mod.step.2)
anova(mod.step.1, mod.step.2)

# Lasso
x <- model.matrix(y ~ ., prostate_data)
mod.lasso <- glmnet(x, y, alpha = 1, lambda = grid)  # alpha = 1 is the lasso
plot(mod.lasso)

set.seed(1)
cv.out <- cv.glmnet(x, y, alpha = 1, lambda = grid)  # cross-validated lambda
plot(cv.out)
bestlam <- cv.out$lambda.min

# Test-set MSE at the CV-optimal lambda.
# NOTE(review): `test` is created earlier from the train indicator column;
# it must be a logical mask (!train), not a numeric negation, for this
# subsetting to pick the intended rows.
lasso.pred <- predict(mod.lasso, s = bestlam, newx = x[test, ])
mean((lasso.pred - y[test])^2)

out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients", s = bestlam)
# Drop row 2: model.matrix() already contains an intercept column, so the
# glmnet coefficient vector carries a redundant second "(Intercept)" entry.
lasso.coef <- lasso.coef[-2, ]

coef_plot.2 <- coefplot(as.numeric(as.vector(lasso.coef)),
                        sds = 0,
                        CI = 2,
                        varnames = (beta_names_expr),
                        xlim = c(-0.6, 0.6))

# Side-by-side comparison of OLS and lasso coefficients
par(mfrow = c(1, 2))
coefplot(as.numeric(as.vector(coef(mod))),   # coefplot for lm parameters
         sds = as.numeric(as.vector(se)),
         CI = 2,
         varnames = (beta_names_expr))
coefplot(as.numeric(as.vector(lasso.coef)),  # coefplot for lasso parameters
         sds = 0,
         CI = 2,
         varnames = (beta_names_expr),
         xlim = c(-2, 3))

# Local regression
par(mfrow = c(1, 1))
xx <- prostate_data$lcavol
xxlims <- range(xx)
# BUG FIX: the original grid used seq()'s default step of 1, which yields only
# a handful of evaluation points over the lcavol range, so the loess "curves"
# were drawn as coarse, angular polylines. A 100-point grid draws them smooth.
xx.grid <- seq(from = xxlims[1], to = xxlims[2], length.out = 100)

plot(xx, y, cex = .5, col = "darkgrey")
title("Local Regression")
fit <- loess(y ~ xx, span = .2)   # narrower span = more local, wigglier fit
fit2 <- loess(y ~ xx, span = .5)
lines(xx.grid, predict(fit, data.frame(xx = xx.grid)),
      col = "red", lwd = 2)
lines(xx.grid, predict(fit2, data.frame(xx = xx.grid)),
      col = "blue", lwd = 2)
legend("topright",
       legend = c("Span = 0.2", "Span = 0.5"),
       col = c("red", "blue"),
       lty = 1, lwd = 2, cex = .8)

###############
# 3.
# Wine data
# Import data
wine <- read.csv("winequality-red.csv", sep = ",")
# NOTE(review): the UCI winequality-red.csv is semicolon-separated; confirm
# this local copy really uses commas, otherwise everything lands in 1 column.
wine <- wine[, -1]  # drop first column

# Explore the variables
corrplot(cor(wine))
ggpairs(wine)

# Example model with some polynomial effects
my_mod <- lm(fixed.acidity ~ poly(citric.acid, 2) + poly(density, 2),
             data = wine)
## to be completed ...

# Lasso
# Train/test split as a logical mask.
# NOTE(review): no set.seed() before sample(), so the split is not
# reproducible — consider adding one.
train <- sample(c(TRUE, FALSE), nrow(wine),
                prob = c(0.7, 0.3), replace = TRUE)
# BUG FIX: the original used `test <- (-train)` and subset with `x[-train, ]`.
# With a logical `train`, numeric negation gives a -1/0 vector that, as a row
# index, drops only row 1 instead of selecting the test rows. Logical
# negation gives the intended complement.
test <- !train

y <- wine$fixed.acidity
x <- model.matrix(y ~ ., wine[, -c(1, 2)])

x_train <- x[train, ]
y_train <- y[train]
x_test <- x[test, ]
y_test <- y[test]

# Cross-validated lasso (alpha = 1)
lasso_model <- cv.glmnet(x_train, y_train, alpha = 1)

# Visualize the lasso CV results (optimal lambda)
plot(lasso_model)

# Coefficients at the optimal lambda
coef(lasso_model, s = "lambda.min")

## to be completed with plots and analysis... and try computing the
## MSE on the test data.