# Introduction
#------------------------------------------------------------

library("ggplot2")
library("cluster") 

## Example with simulated data:
# monthly average number of food-away-from-home purchases at restaurants
# and annual food-at-home expenditure, for a set of 100 families

FoodS <- read.csv(file = "food_spending.csv", header = TRUE)

# Quick look at the first rows and at the size of the data set
head(FoodS)
dim(FoodS)

# Scatter plot of the two variables
ggplot(data = FoodS, mapping = aes(x = FreqAway, y = Home)) +
  geom_point()

#------------------------------------------------------------
### 1. Dissimilarity Matrix Calculation
#------------------------------------------------------------

# Pairwise distances between rows with dist()
?dist
D. <- dist(x = FoodS) # default method: Euclidean distance
str(D.)
head(D.)
# Coerce the dist object to a full square matrix and
# inspect its first 5 rows/columns
Dmat <- as.matrix(D.)
Dmat[seq_len(5), seq_len(5)]
range(Dmat)
# View(Dmat)
summary(Dmat) # on a matrix, summary() reports one summary per column

# Same computation with daisy() from package "cluster"
?daisy
dist1 <- daisy(FoodS) # apply daisy with default options
# Euclidean distance;
# no standardization; no weights
str(dist1) # dissimilarity object

as.matrix(dist1)[1:5, 1:5]

# Per-variable variance and mean. The two variables are on very different
# scales, which motivates the standardization that follows.
# vapply() works column by column and guarantees exactly one numeric value
# per variable; apply() would first coerce the whole data frame to a matrix.
vapply(FoodS, var, numeric(1))
vapply(FoodS, mean, numeric(1))

#------------------------------------------------------------
# Standardization
#------------------------------------------------------------
# via function daisy
dist2 <- daisy(FoodS, stand = TRUE)
as.matrix(dist2)[1:5, 1:5]

# What does "stand = TRUE" do?
# Each variable is standardized by subtracting its mean
# and dividing by its mean absolute deviation.

# Reproduce that standardization by hand, one column at a time.
# The result matrix is sized from the data rather than assuming two columns.
FoodS2 <- matrix(NA_real_, nrow = nrow(FoodS), ncol = ncol(FoodS))
for (j in seq_len(ncol(FoodS))) {
  centered <- FoodS[, j] - mean(FoodS[, j], na.rm = TRUE)
  # na.rm = TRUE here as well, so that a single missing value does not
  # turn the whole standardized column into NA
  FoodS2[, j] <- centered / mean(abs(centered), na.rm = TRUE)
}

head(FoodS2)
# Same 5 x 5 corner as printed for dist2, for a direct comparison
as.matrix(dist(FoodS2))[1:5, 1:5]

# Optional check. Compare with a tolerance: exact `==` on doubles is brittle.
# stopifnot(isTRUE(all.equal(as.vector(dist(FoodS2)), as.vector(dist2))))

# Exercise:
# 1. Obtain the matrix of Manhattan distances for the FoodS data
# 2. Next, use function scale() to standardize the data and
# 3. compute the matrix of Manhattan distances again
# 4. Compare with maximum

# Manhattan distances on the raw data
Md <- as.matrix(dist(x = FoodS, method = "manhattan"))
hist(Md)
# Centered and scaled matrix: (x - mean) / sd for each column
scFoodS <- scale(x = FoodS)
# Manhattan distances on the standardized data
Md2 <- as.matrix(dist(x = scFoodS, method = "manhattan"))
hist(Md2)

#------------------------------------------------------------
# Mixed-data distance
#------------------------------------------------------------
# Simulated claims data set
# "litig": litigation / dispute (categorical)
# "soft_injury": soft-tissue injury (categorical)
# "emergency_tr": emergency treatment (categorical)
# "NumTreat": number of medical treatments (numeric)

d <- read.csv("claims.csv", header = TRUE)
head(d)
summary(d[["NumTreat"]])
# Recode the categorical variables as factors
d[["litig"]] <- as.factor(d[["litig"]])
d[["soft_injury"]] <- as.factor(d[["soft_injury"]])
d[["emergency_tr"]] <- as.factor(d[["emergency_tr"]])
head(d)
str(d)
summary(d)

# save(d, file = "claims2.RData")
# load("claims2.RData")

# Explore the variables: counts, proportions and a bar plot for "litig"
table(d[["litig"]])
prop.table(table(d[["litig"]]))
barplot(
  height = prop.table(table(d[["litig"]])),
  ylim = c(0, 1),
  main = "variabile: Litig"
)

library(cluster)
# Compute Gower's dissimilarity on the mixed (factor + numeric) data
# and coerce it to a full square matrix
diss <- as.matrix(daisy(d, metric = "gower"))

# explore the result
diss[1:5, 1:5]
range(diss)
# All (row, col) positions where the largest dissimilarity is attained.
# na.rm = TRUE so that an NA entry does not hide the maximum.
max_pairs <- which(diss == max(diss, na.rm = TRUE), arr.ind = TRUE)
max_pairs
# First such pair
max_pairs[1, ]
# Show the two observations of that pair. Using the computed indices
# instead of hard-coded row numbers keeps this correct if the data change.
d[unname(max_pairs[1, ]), ]