library(leaps)

# Function that returns Root Mean Squared Error
rmse = function(m){
  return(sqrt(mean(m$residuals^2)))
}
rmse2 <- function(error)
{
  sqrt(mean(error^2))
}

# Function that returns Mean Absolute Error
mae = function(m){
  return(mean(abs(m$residuals)))
}
mae2 <- function(error)
{
  mean(abs(error))
}

descriptors = read.csv("mix_desc.csv")
train = read.csv("mix_train.csv",sep=";")
dtrain = merge(train[,c(1,3)],descriptors,by.x="NSC",by.y="Name")
test = read.csv("mix_test.csv",sep=";")
dtest = merge(test[,c(1,3)],descriptors,by.x="NSC",by.y="Name")

dtrain = dtrain[complete.cases(dtrain),]
dtest = dtest[complete.cases(dtest),]

remove = c()
for(i in 3:1445){
  if(var(dtrain[i])<0.1){
    print(names(dtrain)[i])
    remove = append(remove,i)
  }
}
dtrain = dtrain[,-remove]

n = 45
model = regsubsets(pKa~.,data = dtrain[-1],method="forward",nvmax=n)
summary(model)$rsq
summary(model)$adjr2
models = c()
tersq = c()
trrms = c()
terms = c()
for(i in 1:n){
  #print(which(summary(model)$which[i,]==TRUE))
  f = paste(names(which(summary(model)$which[i,]==TRUE))[-1],collapse="+")
  print(f)
  m = lm(paste("pKa~",f),data=dtrain)
  models[[i]] = m
  p = predict(m,newdata = dtest)
  tersq = append(tersq,cor(dtest$pKa,p)^2)
  trrms = append(trrms,rmse(m))
  terms = append(terms,rmse2(dtest$pKa-p))
}

plot(summary(model)$rsq)
points(tersq,col=2)

plot(trrms)
points(terms,col=2)

m = models[[10]]
summary(m)
plot(dtrain$pKa,predict(m))
points(dtest$pKa,predict(m,newdata=dtest),col=2)


##############

#install.packages("randomForest")
library(randomForest)

forest = randomForest(pKa~.,data=dtrain[-1],importance=TRUE)
forest
plot(forest)
forest = randomForest(pKa~.,data=dtrain[-1],ntree=100)
forest
plot(forest)


plot(dtrain$pKa,predict(forest))
points(dtest$pKa,predict(forest,newdata=dtest),col=2)

cor(dtrain$pKa,predict(forest))^2
cor(dtest$pKa,predict(forest,newdata=dtest))^2

rmse2(dtrain$pKa-predict(forest))
rmse2(dtest$pKa-predict(forest,newdata=dtest))

dtest = dtest[,-remove]
forest2 = randomForest(x=dtrain[-c(1,2)],y=dtrain$pKa,xtest=dtest[-c(1,2)],ytest=dtest$pKa,ntree=100)

plot(forest2$rsq)
points(forest2$test$rsq,col=2)

plot(forest2$test$mse,col=2)
points(forest2$mse)