########################################################################## # # QSAR # ########################################################################## library(pls) originale<-read.table("Dataset AquaticTox_parziale.txt",header=TRUE,sep="\t") View(originale) #Descrizione #These data were compiled and described by He and Jurs (2005). #The data set consists of 322 compounds that were experimentally assessed for toxicity. #The outcome is the negative log of activity (but is labled as "activity"). #The structures and outcomes were obtained from http://www.qsarworld.com/index.php. ## Tolgo righe con dati mancanti (NA) originale<-na.omit(originale) ### Si divide il dataset in Training e Test dataset (random) set.seed(7) Train<-originale[-sample(c(1:nrow(originale)),96),] set.seed(7) Test<-originale[sample(c(1:nrow(originale)),96),] ### Costruisco il modello, ad es. uso PLS come modello: prima bisogna dividere i predittori dalla risposta TrainP<-as.matrix(Train[,c(2:50)])# Predittori TrainR<-as.matrix(Train[,1]) # Risposta # Provo a costruire un modello con una cross validation a "segmenti" Model<-plsr(TrainR ~ TrainP, ncomp = 25, segments = 15, segment.type ="random", validation = "CV") #-- valuto il numero di componenti di interesse per il modello plot(RMSEP(Model), legendpos = "topright") # Per vedere i valori di RMSEP: RMSEP(Model) # 16 componenti ##--Quindi proseguo con la validazione esterna sul Test. TestP<-as.matrix(Test[,c(2:50)])# Predittori TestR<-as.matrix(Test[,1]) # Risposta PredizionePLS<-predict(Model, ncomp= 16 ,newdata=TestP) plot(TestR,PredizionePLS[,,1],pch=16,col="blue",xlab="measured",ylab="predicted",asp=1) abline(a=0,b=1) # retta y=x ##-- Calcolo R^2: summary(lm(PredizionePLS[,,1]~TestR)) # R^2 è 0.75