# Function that returns Root Mean Squared Error rmse = function(m) { return(sqrt(mean(m$residuals^2))) } rmse2 <- function(error) { sqrt(mean(error^2)) } # Function that returns Mean Absolute Error mae = function(m) { return(mean(abs(m$residuals))) } mae2 <- function(error) { mean(abs(error)) } ## USING: # model <- lm(y ~ x) # rmse(model$residuals) # root mean squared error # Cross validation function cv <- function(property, descriptors, df, k=5) { formula = sprintf("%s ~ %s",property,descriptors) df2 <- df[order(df[,property]),] statistics = list(train=list(r2=c(),rmse=c(),mae=c()),test=list(r2=c(),rmse=c(),mae=c()),both=list(r2=c(),rmse=c(),mae=c())) sets=list() for (i in 1:k) { sets[[i]] = as.numeric(rownames(df2[i,])) } for (i in k:nrow(df2)) { id = i %% k + 1 sets[[id]] = append(sets[[id]], as.numeric(rownames(df2[i,]))) } print("Cross validation:",quote=F) print("=================",quote=F) print(sprintf("Formula: %s",formula),quote=F) model = lm(formula,df) print(sprintf("R2 : %f",summary(model)$r.squared),quote=F) print(sprintf("RMSE : %f",rmse2(model$residuals)),quote=F) print(sprintf("MAE : %f",mae2(model$residuals)),quote=F) print("-------------------------------------------------------------------------------------------",quote=F) print(" | Train set | Test set | Both",quote=F) print("Fold | R2 RMSE MAE | R2 RMSE MAE | R2 RMSE MAE ",quote=F) print("-------------------------------------------------------------------------------------------",quote=F) p = plot(df[,property],predict(model),xlab="experimental",ylab="predict") maxy = max(predict(model)) minx = min(df[,property]) step = (maxy-min(predict(model)))/16 models=list('train','test') for (i in 1:k) { model = lm(formula, data = df2[-sets[[i]],]) models[['train']][[i]] = model p = abline(lm(df2[-sets[[i]],property]~predict(model)), col=i+1) p = points(df2[-sets[[i]],property],predict(model),col = i+1,pch=2) p = points(df2[sets[[i]],property],predict(model,df2[sets[[i]],]),col = i+1,pch=3) p = legend(minx,y=(maxy-(i-1)*step),sprintf('Fold %i',i),col=i+1,pch = 2, pt.bg = "white", lty = 1) r2_train = summary(model)$r.squared rmse_train = rmse2(model$residuals) mae_train = mae2(model$residuals) pred = predict(model,df2[sets[[i]],]) res = pred - df2[sets[[i]],property] r2_test = cor(pred, df2[sets[[i]],property])^2 rmse_test = rmse2(res) mae_test = mae2(res) r2 = cor(c(predict(model),pred),c(df2[-sets[[i]],property],df2[sets[[i]],property]))^2 rmse = rmse2(c(model$residuals,res)) mae = mae2(c(model$residuals,res)) print(sprintf("%4i | %f %f %f | %f %f %f | %f %f %f",i,r2_train,rmse_train,mae_train,r2_test,rmse_test,mae_test,r2,rmse,mae),quote=F) statistics$train$r2 = append(statistics$train$r2,r2_train) statistics$train$rmse = append(statistics$train$rmse,rmse_train) statistics$train$mae = append(statistics$train$mae,mae_train) statistics$test$r2 = append(statistics$test$r2,r2_test) statistics$test$rmse = append(statistics$test$rmse,rmse_test) statistics$test$mae = append(statistics$test$mae,mae_test) statistics$both$r2 = append(statistics$both$r2,r2) statistics$both$rmse = append(statistics$both$rmse,rmse) statistics$both$mae = append(statistics$both$mae,mae) } print("-------------------------------------------------------------------------------------------",quote=F) print(sprintf("Avrg | %f %f %f | %f %f %f | %f %f %f",mean(statistics$train$r2),mean(statistics$train$rmse), mean(statistics$train$mae),mean(statistics$test$r2),mean(statistics$test$rmse),mean(statistics$test$mae), mean(statistics$both$r2),mean(statistics$both$rmse),mean(statistics$both$mae)),quote=F) print(sprintf("Ext | %f %f %f | %f %f %f | %f %f %f",min(statistics$train$r2),max(statistics$train$rmse),max(statistics$train$mae), min(statistics$test$r2),max(statistics$test$rmse),max(statistics$test$mae), min(statistics$both$r2),max(statistics$both$rmse),max(statistics$both$mae)),quote=F) print("-------------------------------------------------------------------------------------------",quote=F) return(list(sets=sets,models=models,statistics=statistics,plot=p)) } ## USING test = cv("pKa", "qH + qO + qOd:acid", df = data,k=5) # test = cv("pKa", "qH + qO + qOd", df = exercise1,k=5)