library(leaps) # Function that returns Root Mean Squared Error rmse = function(m){ return(sqrt(mean(m$residuals^2))) } rmse2 <- function(error) { sqrt(mean(error^2)) } # Function that returns Mean Absolute Error mae = function(m){ return(mean(abs(m$residuals))) } mae2 <- function(error) { mean(abs(error)) } descriptors = read.csv("mix_desc.csv") train = read.csv("mix_train.csv",sep=";") dtrain = merge(train[,c(1,3)],descriptors,by.x="NSC",by.y="Name") test = read.csv("mix_test.csv",sep=";") dtest = merge(test[,c(1,3)],descriptors,by.x="NSC",by.y="Name") dtrain = dtrain[complete.cases(dtrain),] dtest = dtest[complete.cases(dtest),] remove = c() for(i in 3:1445){ if(var(dtrain[i])<0.1){ print(names(dtrain)[i]) remove = append(remove,i) } } dtrain = dtrain[,-remove] n = 45 model = regsubsets(pKa~.,data = dtrain[-1],method="forward",nvmax=n) summary(model)$rsq summary(model)$adjr2 models = c() tersq = c() trrms = c() terms = c() for(i in 1:n){ #print(which(summary(model)$which[i,]==TRUE)) f = paste(names(which(summary(model)$which[i,]==TRUE))[-1],collapse="+") print(f) m = lm(paste("pKa~",f),data=dtrain) models[[i]] = m p = predict(m,newdata = dtest) tersq = append(tersq,cor(dtest$pKa,p)^2) trrms = append(trrms,rmse(m)) terms = append(terms,rmse2(dtest$pKa-p)) } plot(summary(model)$rsq) points(tersq,col=2) plot(trrms) points(terms,col=2) m = models[[10]] summary(m) plot(dtrain$pKa,predict(m)) points(dtest$pKa,predict(m,newdata=dtest),col=2) ############## #install.packages("randomForest") library(randomForest) forest = randomForest(pKa~.,data=dtrain[-1],importance=TRUE) forest plot(forest) forest = randomForest(pKa~.,data=dtrain[-1],ntree=100) forest plot(forest) plot(dtrain$pKa,predict(forest)) points(dtest$pKa,predict(forest,newdata=dtest),col=2) cor(dtrain$pKa,predict(forest))^2 cor(dtest$pKa,predict(forest,newdata=dtest))^2 rmse2(dtrain$pKa-predict(forest)) rmse2(dtest$pKa-predict(forest,newdata=dtest)) dtest = dtest[,-remove] forest2 = randomForest(x=dtrain[-c(1,2)],y=dtrain$pKa,xtest=dtest[-c(1,2)],ytest=dtest$pKa,ntree=100) plot(forest2$rsq) points(forest2$test$rsq,col=2) plot(forest2$test$mse,col=2) points(forest2$mse)