# Introduction ------------------------------------------------------------
# Teaching script: computing dissimilarity matrices with stats::dist() and
# cluster::daisy(), first on purely numeric data (food spending), then on
# mixed numeric/categorical data (insurance claims, Gower's coefficient).
# Results are printed at top level on purpose (interactive walkthrough).
#--------------------------------------------------------------------------
library("ggplot2")
library("cluster")

## Example with simulated data:
# average number of times food-away-from-home is purchased from restaurants
# (monthly) and food-at-home expenditures (annual) for a set of 100 families
FoodS <- read.csv("food_spending.csv", header = TRUE)
head(FoodS)
dim(FoodS)
ggplot(FoodS, aes(FreqAway, Home)) +
  geom_point()

#--------------------------------------------------------------------------
### 1. Dissimilarity Matrix Calculation
#--------------------------------------------------------------------------
# distance matrix via the dist function
?dist
D. <- dist(FoodS) # Euclidean distance (the default method)
str(D.)
head(D.)

# coercion to a full matrix and selection of the first 5 rows/columns
Dmat <- as.matrix(D.)
Dmat[1:5, 1:5]
range(Dmat)
# View(Dmat)
summary(Dmat)

# function in package "cluster"
?daisy
dist1 <- daisy(FoodS) # apply daisy with default options:
                      # Euclidean distance; no standardization; no weights
str(dist1) # dissimilarity object
as.matrix(dist1)[1:5, 1:5]

# Column-wise variance and mean: the variables live on very different
# scales, which motivates the standardization below.
# vapply() is used instead of apply() so the data frame is not coerced
# to a matrix first.
vapply(FoodS, var, numeric(1))
vapply(FoodS, mean, numeric(1))

#--------------------------------------------------------------------------
# Standardization
#--------------------------------------------------------------------------
# via function daisy
dist2 <- daisy(FoodS, stand = TRUE)
as.matrix(dist2)[1:5, 1:5]

# What does "stand = TRUE" do?
# Measurements are standardized for each variable
# by subtracting the variable's mean value
# and dividing by the variable's mean absolute deviation.
# Manual replication of that standardization, one column at a time.
# The result matrix is sized from the data (not a hard-coded 2 columns) so
# the loop below can never index past the last column.
FoodS2 <- matrix(NA_real_, nrow = nrow(FoodS), ncol = ncol(FoodS))
for (j in seq_len(ncol(FoodS))) {
  centered <- FoodS[, j] - mean(FoodS[, j], na.rm = TRUE)
  # na.rm = TRUE here as well, otherwise a single NA would turn the whole
  # scaling factor (and hence the whole column) into NA
  FoodS2[, j] <- centered / mean(abs(centered), na.rm = TRUE)
}
head(FoodS2)
as.matrix(dist(FoodS2))
# Optional sanity check (kept disabled, as in the original script).
# Distances are floating point, so compare with a tolerance, not `==`:
# stopifnot(isTRUE(all.equal(as.vector(dist(FoodS2)), as.vector(dist2))))

# Exercise:
# 1. Obtain the matrix of Manhattan distances for the FoodS data
# 2. Next, use function scale() to standardize data and
# 3. compute again the matrix of Manhattan distances
# 4. Compare with maximum
Md <- as.matrix(dist(FoodS, method = "manhattan"))
hist(Md)
scFoodS <- scale(FoodS) # centered, scaled matrix (x - mu) / sd
Md2 <- as.matrix(dist(scFoodS, method = "manhattan"))
hist(Md2)

#--------------------------------------------------------------------------
# Mixed-data distance
#--------------------------------------------------------------------------
# simulated claims data set
# "litig":        litigation / dispute (categorical)
# "soft_injury":  soft-tissue injury (categorical)
# "emergency_tr": emergency treatment provided (categorical)
# "NumTreat":     number of medical treatments (numeric)
d <- read.csv(file = "claims.csv", header = TRUE)
head(d)
summary(d$NumTreat)

# re-code the categorical variables as factors so that daisy() treats them
# as nominal rather than numeric
d$litig <- as.factor(d$litig)
d$soft_injury <- as.factor(d$soft_injury)
d$emergency_tr <- as.factor(d$emergency_tr)
head(d)
str(d)
summary(d)
# save(d, file = "claims2.RData")
# load("claims2.RData")

# explore the variables...
litig_counts <- table(d$litig)
litig_counts
prop.table(litig_counts)
barplot(prop.table(litig_counts), ylim = c(0, 1), main = "variabile: Litig")

library(cluster)
# compute Gower's dissimilarity
diss <- as.matrix(daisy(d, metric = "gower"))

# explore the result
diss[1:5, 1:5]
range(diss)

# All (row, col) positions attaining the maximum dissimilarity; the matrix
# is symmetric, so every pair shows up twice.
max_pairs <- which(diss == max(diss), arr.ind = TRUE)
max_pairs
max_pairs[1, ]
# Show the two most dissimilar observations. The row indices are taken from
# the computed result instead of being hard-coded, so this stays correct if
# the input data changes.
d[unname(max_pairs[1, ]), ]