#####################################################################
# Data import ----
#####################################################################
# Tab-separated text file with a header row, resolved against the current
# working directory.
originale <- read.table("Dataset_whiskey.txt", header = TRUE, sep = "\t")
str(originale)
# View() needs an interactive data viewer, so it is skipped in batch runs.
if (interactive()) {
  View(originale)
}

# Preliminary data exploration
plot(originale)
# The first two columns are dropped everywhere below.
# NOTE(review): presumably they are the `sample` and `type` identifier
# columns and every remaining column is numeric -- confirm against the file.
boxplot(originale[, -c(1:2)])

# Boxplot by category (example): variable B split by whiskey type
boxplot(originale$B ~ originale$type)

#####################################################################
# Correlation between variables ----
#####################################################################
# Install corrplot only when it is missing, instead of re-installing it on
# every run of the script.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)

# Correlation matrix of the retained (non-identifier) columns
Cr <- cor(originale[, -c(1:2)])

# Show the correlation matrix in a new graphics window.
# dev.new() is the platform-independent replacement for windows().
dev.new()
corrplot(Cr)

#####################################################################
# Hierarchical clustering ----
#####################################################################
dataset <- as.matrix(originale[, -c(1:2)])
Type <- originale$type
Sample <- originale$sample

#-- Per-variable standardisation (required: Euclidean distances would
#   otherwise be dominated by the variables with the largest scale)
dataNorm <- scale(dataset, center = TRUE, scale = TRUE)

#-- Hierarchical clustering with three linkage criteria on the same distances
Dist <- dist(dataNorm, method = "euclidean")
HcC <- hclust(Dist, method = "complete")
HcS <- hclust(Dist, method = "single")
HcA <- hclust(Dist, method = "average")

DendC <- as.dendrogram(HcC)
DendS <- as.dendrogram(HcS)
DendA <- as.dendrogram(HcA)

# The first dendrogram is drawn on the current device; the other two each
# get their own window so the three linkages can be compared side by side.
plot(DendC, main = "Complete")
dev.new()
plot(DendS, main = "Single")
dev.new()
plot(DendA, main = "Average")

#--- Choose the number of clusters: cut the complete-linkage tree into k = 2
Cut <- cutree(HcC, k = 2)
Newdata <- data.frame(originale, K2 = Cut)
if (interactive()) {
  View(Newdata)
}

#--- Explore the results: distribution of variable A within each cluster
Box <- boxplot(Newdata$A ~ Newdata$K2, main = "Variabile A")
# Rows assigned to cluster 1
Cluster1 <- Newdata[which(Newdata$K2 == 1), ]

###################################################################
# PCA ----
###################################################################
# Re-derived here so that this section can be run on its own.
dataset <- as.matrix(originale[, -c(1:2)])
Type <- originale$type
Sample <- originale$sample

# `scale.` is the real argument name of prcomp(); the previous `scale = TRUE`
# only worked through partial argument matching.
PCA <- prcomp(dataset, center = TRUE, scale. = TRUE)
varianze <- PCA$sdev^2
varianzecum <- cumsum(varianze / sum(varianze) * 100)

#-- Scree plot (reference line at variance = 1)
plot(varianze, pch = 16, type = "o")
abline(h = 1, col = "gray")

#-- Cumulative explained variance, in percent (reference lines at 50% and 75%)
plot(varianzecum, pch = 16, type = "o")
abline(h = c(50, 75), col = "gray")

#-- Loadings plot (PC1 vs PC2), labelled with the variable names
plot(PCA$rotation[, 1], PCA$rotation[, 2], pch = 16)
text(PCA$rotation[, 1], PCA$rotation[, 2],
     labels = colnames(dataset), cex = 0.8, pos = 3)
abline(h = 0, col = "gray")
abline(v = 0, col = "gray")

#-- Scores plot (PC1 vs PC2), labelled with the sample identifiers
plot(PCA$x[, 1], PCA$x[, 2], pch = 16)
text(PCA$x[, 1], PCA$x[, 2], labels = Sample, cex = 0.8, pos = 3)
abline(h = 0, col = "gray")
abline(v = 0, col = "gray")

#-- Scores plot using the type label itself as the plotting symbol
plot(PCA$x[, 1], PCA$x[, 2], type = "n")
text(PCA$x[, 1], PCA$x[, 2], labels = Type, cex = 0.8)
abline(h = 0, col = "gray")
abline(v = 0, col = "gray")

#-- Coloured scores plot: one colour per type level.
# The first two levels keep the original red/blue; further levels (which used
# to make `levels<-` fail) receive additional palette colours.
type_factor <- as.factor(Type)
n_types <- nlevels(type_factor)
base_colors <- c("red", "blue")
if (n_types <= length(base_colors)) {
  level_colors <- base_colors[seq_len(n_types)]
} else {
  level_colors <- c(
    base_colors,
    grDevices::hcl.colors(n_types - length(base_colors), palette = "Dark 3")
  )
}
Color <- level_colors[as.integer(type_factor)]

plot(PCA$x[, 1], PCA$x[, 2], type = "n")
points(PCA$x[, 1], PCA$x[, 2], col = Color, cex = 1.2, pch = 16)
abline(h = 0, col = "gray")
abline(v = 0, col = "gray")

#-- Biplot of the first two components
biplot(PCA, choices = c(1, 2))

#-- Projection of observations onto the fitted PCA space.
# Rows 1 and 20 are used as the example; indices beyond the data are dropped
# rather than raising "subscript out of bounds", and drop = FALSE keeps a
# matrix (required by predict()) even if only one row remains.
rows_to_project <- c(1, 20)
rows_to_project <- rows_to_project[rows_to_project <= nrow(dataset)]
newdata <- dataset[rows_to_project, , drop = FALSE]
New <- predict(PCA, newdata)

plot(PCA$x[, 1], PCA$x[, 2], type = "n")
points(PCA$x[, 1], PCA$x[, 2], col = Color, cex = 1.2, pch = 16)
abline(h = 0, col = "gray")
abline(v = 0, col = "gray")
# Projected observations are overlaid as large green diamonds.
points(New[, 1], New[, 2], col = "green", pch = 18, cex = 2)