## Cluster Analysis: DISTANCES AND DISSIMILARITIES
##
## Walk-through script in two parts:
##   1. food_spending.csv - distances on the raw and on the standardized
##      data, using stats::dist() and cluster::daisy().
##   2. claims.csv - Gower's dissimilarity via cluster::daisy().
## Both data files (and claims2.RData) are read from the working directory.

library("ggplot2")
library("cluster")  # install.packages("cluster") if it is missing

# ---- food_spending.csv ----
FoodS <- read.csv("food_spending.csv", header = TRUE)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) View(FoodS)
dim(FoodS)

# scatterplot
ggplot(FoodS, aes(FreqAway, Home)) +
  geom_point()
str(FoodS)
# plot(FoodS$FreqAway, FoodS$Home)

# Distances / dissimilarities
if (interactive()) help("dist")
Dist.e <- dist(FoodS)
Dist.e
str(Dist.e)
head(Dist.e)

# Full square matrix form of the distances
Dmat <- as.matrix(Dist.e)
head(Dmat)
dim(Dmat)
Dmat[1:5, 1:5]

if (interactive()) help("daisy")
dist1 <- daisy(FoodS)  # Euclidean dist.
as.matrix(dist1)[1:5, 1:5]

# standardization
dist2 <- daisy(FoodS, stand = TRUE)
# as.matrix..

### Exercise: Manhattan distance on standardized data
s.FoodS <- scale(FoodS)  # scaled matrix
if (interactive()) help("dist")
# Reuse the scaled matrix instead of calling scale() a second time.
Md2 <- as.matrix(dist(s.FoodS, method = "manhattan"))
# NOTE: Md2 is the full symmetric matrix, so this histogram counts every
# pair twice and also includes the zero diagonal.
hist(Md2)

## 28/04/25

# ---- claims.csv ----
d <- read.csv(file = "claims.csv", header = TRUE)
head(d)
summary(d)

# litig:        0 = no, 1 = yes
# soft_injury:  0 = no, 1 = yes
# emergency_tr: 0 = no, 1 = yes
# NumTreat:     number of medical treatments
d$litig <- as.factor(d$litig)
d$soft_injury <- as.factor(d$soft_injury)
d$emergency_tr <- as.factor(d$emergency_tr)

# save(d, file = "claims2.RData")
# NOTE(review): load() replaces any existing object that has the same name as
# one stored in the file - presumably `d`, given the save() call above.
# Confirm that this overwrite is intended.
load("claims2.RData")

table(d$litig)
prop.table(table(d$litig))
barplot(prop.table(table(d$litig)), col = 2)
# levels()

# compute Gower's dissimilarity
diss_g <- as.matrix(daisy(d, metric = "gower"))
diss_g[1:5, 1:5]
range(diss_g)

# First (row, col) position holding the largest dissimilarity. The two
# observations are looked up from this result rather than from row numbers
# hard-coded after an earlier run.
max_pair <- which(diss_g == max(diss_g), arr.ind = TRUE)[1, ]
max_pair
d[max_pair[1], ]
d[max_pair[2], ]