### Cluster Analysis: K-means & PAM
## 7/05/26

library(cluster)
library(ggplot2)

# Compute the average silhouette width (ASW) of a PAM fit for each candidate k.
#
# x        : data or dissimilarities, passed to pam() unchanged
# k_values : numbers of clusters to try
# ...      : further arguments forwarded to pam() (e.g. metric, stand, diss)
#
# Returns an unnamed two-column numeric matrix: column 1 holds k, column 2
# holds the corresponding ASW.
pam_asw_table <- function(x, k_values, ...) {
  widths <- vapply(
    k_values,
    function(k) pam(x, k, ...)$silinfo$avg.width,
    numeric(1)
  )
  matrix(c(k_values, widths), ncol = 2)
}

## PAM ----

dat <- read.csv("food_spending.csv", header = TRUE)

# PAM with k = 2 on Euclidean distances; stand = TRUE asks pam() to
# standardise the variables before computing the dissimilarities.
pam.out <- pam(dat, 2, metric = "euclidean", stand = TRUE)

# Explicit print() so the output also appears when the script is source()d.
print(pam.out)
print(pam.out$medoids)     # medoids
print(pam.out$id.med)      # row indices of the medoids
print(pam.out$clustering)  # cluster membership vector

# Scatter plot coloured by cluster (colours 2 and 3 for k = 2), with the
# medoids overlaid using plotting symbol 8.
plot(dat, col = pam.out$clustering + 1)
points(pam.out$medoids, pch = 8)

## Silhouette plot (which.plots spelled out instead of relying on partial
## argument matching of `which`)
plot(pam.out, which.plots = 2, main = "")

## Average silhouette width for PAM with k = 2
print(pam.out$silinfo$avg.width)

## ASW vs K
min_nc <- 2
max_nc <- 10
asw <- pam_asw_table(
  dat, min_nc:max_nc,
  metric = "euclidean", stand = TRUE
)

## Example with the "claims" data and Gower distance ----

d <- read.csv(file = "claims.csv", header = TRUE)
# View(d)

# Treat these columns as categorical so that daisy() handles them as
# nominal variables.
d$litig <- as.factor(d$litig)
d$soft_injury <- as.factor(d$soft_injury)
d$emergency_tr <- as.factor(d$emergency_tr)
str(d)

# Gower dissimilarities for mixed-type data
mixd <- daisy(d, metric = "gower")
str(mixd)
dmatrix <- as.matrix(mixd)  # full distance matrix

## PAM on the dissimilarity matrix "dmatrix"
print(pam(dmatrix, 2, diss = TRUE))

## Use ASW to choose K (with k = 2, 3, ..., 15)
min_nc <- 2
max_nc <- 15
asw <- pam_asw_table(dmatrix, min_nc:max_nc, diss = TRUE)

# A ggplot object is only drawn when printed; print() makes that explicit so
# the figure is also produced under source().
print(
  ggplot(
    data = data.frame(x = asw[, 1], y = asw[, 2]),
    mapping = aes(x = x, y = y)
  ) +
    geom_point() +
    geom_line() +
    xlab("k") +
    ylab("ASW")
)

# Or, with base graphics:
# plot(asw[, 1], asw[, 2], type = "o", xlab = "K", ylab = "ASW")
# abline(v = 8, lty = 3)