setwd('~/Dropbox/_teaching/Advanced Chemoinformatics/2017/data/') experiment = read.csv("IC50_filtered.csv",sep="\t") descriptors = read.csv("IC50_desc.csv") dataset = merge(experiment[,-2],descriptors,by.x="CMPD_CHEMBLID",by.y="Name") names(dataset)[2] = 'IC50' dataset = dataset[complete.cases(dataset),] dataset = dataset[dataset$IC50!=0,] remove = c() for(i in 3:dim(dataset)[2]){ if(var(dataset[i])<0.1){ print(names(dataset)[i]) remove = append(remove,i) } } dataset = dataset[,-remove] names(dataset)[2] = 'PIC50' dataset$PIC50 = log(dataset$PIC50) s = sample(1:3063,1000) dtrain = dataset[-s,] dtest = dataset[s,] library(leaps) # Function that returns Root Mean Squared Error rmse = function(m){ return(sqrt(mean(m$residuals^2))) } rmse2 <- function(error) { sqrt(mean(error^2)) } n = 200 model = regsubsets(PIC50~.,data = dtrain[-1],method="forward",nvmax=n) summary(model)$rsq summary(model)$adjr2 models = c() tersq = c() trrms = c() terms = c() for(i in 1:n){ #print(which(summary(model)$which[i,]==TRUE)) f = paste(names(which(summary(model)$which[i,]==TRUE))[-1],collapse="+") #print(f) m = lm(paste("PIC50~",f),data=dtrain) models[[i]] = m p = predict(m,newdata = dtest) tersq = append(tersq,cor(dtest$PIC50,p)^2) trrms = append(trrms,rmse(m)) terms = append(terms,rmse2(dtest$PIC50-p)) } plot(summary(model)$rsq) points(tersq,col=2) plot(dtrain$PIC50,predict(models[[60]])) ### library(randomForest) forest2 = randomForest(x=dtrain[-c(1,2)],y=dtrain$PIC50,xtest=dtest[-c(1,2)],ytest=dtest$PIC50,ntree=200) plot(forest2$rsq) points(forest2$test$rsq,col=2) plot(forest2$test$mse,col=2) points(forest2$mse) library(neuralnet) maxs <- apply(dataset[,-1], 2, max) mins <- apply(dataset[,-1], 2, min) scaled <- as.data.frame(scale(dataset[,-1], center = mins, scale = maxs - mins)) ntrain <- scaled[-s,] ntest <- scaled[s,] f <- as.formula(paste("PIC50 ~", paste(names(dataset)[-c(1,2)], collapse = " + "))) nn <- neuralnet(f,data=ntrain,hidden=c(5,3),linear.output=T) prtr <- compute(nn,ntrain[,2:690]) prte <- compute(nn,ntest[,2:690]) cor(prtr$net.result,ntrain$PIC50)^2 cor(prte$net.result,ntest$PIC50)^2