# Forward-selection regression of pKa on molecular descriptors.
#
# Workflow:
#   1. Read the descriptor table and the train/test tables, and join each
#      train/test table to the descriptors (NSC == Name).
#   2. Drop incomplete rows, then drop descriptor columns whose variance in
#      the training set is below 0.1.
#   3. Run forward subset selection (leaps::regsubsets) up to `n` variables.
#   4. Refit each selected subset with lm() and compute the squared
#      correlation between observed and predicted pKa on the test set.
#   5. Plot training R^2 (black) and test squared correlation (red) against
#      subset size.

library(leaps)

# ---- Load and merge data ----
descriptors <- read.csv("mix_desc.csv")
train <- read.csv("mix_train.csv", sep = ";")
test <- read.csv("mix_test.csv", sep = ";")

# Only columns 1 and 3 of the train/test tables are used (the NSC key and,
# presumably, the pKa response -- verify against the CSV layout).
dtrain <- merge(train[, c(1, 3)], descriptors, by.x = "NSC", by.y = "Name")
dtest <- merge(test[, c(1, 3)], descriptors, by.x = "NSC", by.y = "Name")

dtrain <- dtrain[complete.cases(dtrain), ]
dtest <- dtest[complete.cases(dtest), ]

# ---- Remove low-variance descriptors ----
# The first two columns are skipped, as in the original fixed range that
# started at column 3; the upper bound now follows the actual column count
# instead of a hard-coded 1445.
descriptor_cols <- setdiff(seq_along(dtrain), 1:2)
descriptor_vars <- vapply(
  descriptor_cols,
  function(j) var(dtrain[[j]]),
  numeric(1)
)
if (anyNA(descriptor_vars)) {
  stop(
    "Cannot compute the variance of every descriptor column; ",
    "check for non-numeric columns or fewer than two complete rows.",
    call. = FALSE
  )
}

# Column indices (into dtrain as it is at this point) that get dropped.
remove <- descriptor_cols[descriptor_vars < 0.1]
for (col_name in names(dtrain)[remove]) {
  print(col_name)
}

# Select the columns to keep rather than negating `remove`: a negated empty
# index selects zero columns and would wipe the whole data frame.
dtrain <- dtrain[, setdiff(seq_along(dtrain), remove), drop = FALSE]

# ---- Forward subset selection ----
n <- 45
# Column 1 (NSC) is an identifier, so it is excluded from the predictors.
model <- regsubsets(pKa ~ ., data = dtrain[-1], method = "forward", nvmax = n)

# Summarise once; the summary is reused for reporting, refitting and plotting.
model_summary <- summary(model)
model_summary$rsq
model_summary$adjr2

# ---- Refit each subset and evaluate on the test set ----
# regsubsets may return fewer than `n` models, so iterate over the rows that
# actually exist instead of assuming 1:n.
n_models <- nrow(model_summary$which)
models <- vector("list", n_models)
trsq <- numeric(n_models)

for (i in seq_len(n_models)) {
  # print(which(model_summary$which[i, ]))
  # The first selected name is dropped ([-1]): it is the intercept column.
  selected <- names(which(model_summary$which[i, ]))[-1]
  f <- paste(selected, collapse = "+")
  print(f)

  m <- lm(paste("pKa~", f), data = dtrain)
  models[[i]] <- m

  p <- predict(m, newdata = dtest)
  trsq[i] <- cor(dtest$pKa, p)^2
}

# ---- Plot training vs. test fit ----
# The y-range covers both series so test points below the training curve are
# not clipped by the axis limits.
plot(
  model_summary$rsq,
  ylim = range(c(model_summary$rsq, trsq), finite = TRUE)
)
points(trsq, col = 2)