# Setting up a working directory
setwd("~/ADSA 2019")
# Go to "Session", then "Working Directory", and click on "Choose Directory...".
# Choose a folder on your computer where you have saved the data and where you want to save output (e.g., plots) from R.

# Importing the data
dat=read.csv(file="Exercise #2 data file.csv")

# Double-checking the variables in the data
head(dat)

#####CROSS VALIDATION WITH THE CARET PACKAGE#####

# Installing the "caret" package
#install.packages("caret")
# The "caret" package utilizes a number of other packages.
# R will keep asking you to install those other packages, particularly if your R version has not been updated.
# So, installation of the "caret" package is much easier with newer R versions (3.5 or above).

# Loading the "caret" package
library(caret)

########################### Evaluating a simple linear regression model including only DMI ###########################

######THE HOLD-OUT METHOD######
# Traditionally, cross validation is applied by splitting the data into two sets: training and test.
# The training and test datasets are used for model development and evaluation, respectively.
# This first method is called the "hold-out" method, where a single random split is made.

# Splitting the data using the "createDataPartition" function, allocating 70% for training (model development) and 30% for testing (model evaluation)
set.seed(123) # Set the pseudo-random number generator to a specific sequence, just so we all get the same answer.
# In practice, you do not need to set the seed.
partition=createDataPartition(dat$CH4, p=0.7, list=FALSE) # So 70:30 is your partition rule
TrainingData=dat[partition,]
TestData=dat[-partition,]

# Developing a simple linear regression (SLR) model with DMI as the predictor (independent) variable
SLR_Hold_Out=lm(data=TrainingData, CH4~DMI)
summary(SLR_Hold_Out)

# Evaluating the model on the test data by calculating the root mean square error (RMSE)
preds=predict(SLR_Hold_Out, newdata=TestData)
resid=TestData$CH4 - preds
RMSE=sqrt(mean(resid^2))
RMSE

# Calculating the mean absolute error (MAE)
MAE=mean(abs(resid))
MAE

#####K-FOLD CROSS VALIDATION#####
### 10-fold (k=10) cross validation (90% of the data for training and 10% for testing, over 10 folds) ###
set.seed(536)
SplitRule=trainControl(method="cv", number=10) # Defining the number of folds (=10) for the cross validation (CV)
KF10_SLR=train(CH4~DMI, data=dat, trControl=SplitRule, method="lm") # Executing the 10-fold CV for a linear model (method="lm") including only DMI

# Viewing the overall RMSE, R-squared, and MAE (the averages of the 10 individual evaluations)
KF10_SLR$results

# Viewing the RMSE, R-squared, and MAE values of each individual fold
KF10_SLR$resample

# Obtaining the final model parameters (fitted with all the data)
KF10_SLR$finalModel
# You will get the same parameters if you fit the SLR model using all the data: summary(lm(data=dat, CH4~DMI))

# Plotting observed values against predicted values
PredCH4_KF10=predict(KF10_SLR, dat) # Obtaining the predicted values
plot(dat$CH4~PredCH4_KF10, cex=1.4, pch=18, ylim=c(150, 500), xlim=c(150, 500),
     ylab="Observed", xlab="Predicted",
     main="K=10 (RMSE = 40.1, R2 = 0.52, MAE = 31.7)", cex.lab=1.3)
Regline=lm(dat$CH4~PredCH4_KF10)
abline(Regline, lty=3, lwd=3.2, col="red") # Adding the regression line
abline(0, 1) # Adding the unity line
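# Optional extra (a minimal sketch, not part of the original exercise): the averages in
# KF10_SLR$results summarize considerable fold-to-fold variability, which you can inspect
# directly, since KF10_SLR$resample is a data frame with one row per fold.
hist(KF10_SLR$resample$RMSE, xlab="RMSE per fold", main="Fold-to-fold RMSE (10-fold CV)")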
######REPEATED K-FOLD CROSS VALIDATION#####
set.seed(536)
SplitRule=trainControl(method="repeatedcv", number=10, repeats=100) # Modifying the split rule: "repeatedcv" as the method, and specifying how many repeats to perform
KF10r_SLR=train(CH4~DMI, data=dat, trControl=SplitRule, method="lm") # Repeating the 10-fold CV 100 times for a linear model (method="lm") including only DMI

# Viewing the overall RMSE, R-squared, and MAE (the averages of the individual evaluations)
KF10r_SLR$results

# Viewing the RMSE, R-squared, and MAE values of each individual fold and repeat
KF10r_SLR$resample

# Obtaining the final model parameters
KF10r_SLR$finalModel # Same as before, because the final model uses all the data (n = 150)

#####LEAVE-ONE-OUT CROSS VALIDATION#####
set.seed(825)
SplitRule=trainControl(method="LOOCV")
LOO_SLR=train(CH4~DMI, data=dat, trControl=SplitRule, method="lm")
LOO_SLR$results
LOO_SLR$finalModel # Again, same as before, since the final model uses all the data

########################### Evaluating a multiple linear regression (MLR) model including DMI and EE ###########################
## Let's use K-fold (k=10) cross validation in the same way we applied it above for the simple linear regression model (KF10_SLR)
set.seed(789)
SplitRule=trainControl(method="cv", number=10)
KF10_MLR=train(CH4~DMI+EE, data=dat, trControl=SplitRule, method="lm") # Executing the 10-fold CV for a linear model (method="lm") including both DMI and EE
KF10_MLR$results
KF10_MLR$resample
KF10_MLR$finalModel

###### Comparing the performance of the simple (SLR) and multiple (MLR) linear regression models
model_list=list(SLR=KF10_SLR, MLR=KF10_MLR)
performance=resamples(model_list)
summary(performance)

#####NONPARAMETRIC BOOTSTRAP WITH THE BOOT PACKAGE#####
#install.packages("boot")
library(boot)

### Let's use bootstrapping to learn about the uncertainty associated with the R-squared (rsq) of the multiple linear regression model

### Creating a function to obtain rsq
rsq=function(formula, data, indices){
  d=data[indices,] # d = one bootstrap sample from the data
  fit=lm(formula, data=d)
  return(summary(fit)$r.squared)
}

# Bootstrapping with 1000 bootstrap samples using the boot function.
# The model is fitted and rsq is calculated for each bootstrap sample.
results=boot(data=dat, statistic=rsq, R=1000, formula=CH4~DMI+EE)

# Let's view the uncertainty of the rsq values
results
plot(results)

# Nonparametric bootstrap 95% CI
boot.ci(results, type="bca")

### In this second example, we calculate the standard errors and 95% confidence intervals of the parameter estimates
# using nonparametric bootstrapping with 1000 bootstrap samples.

### Again, we have to create a function to obtain the parameter estimates (regression coefficients) for each sample
bs=function(formula, data, indices){
  d=data[indices,] # d = one bootstrap sample from the data
  fit=lm(formula, data=d)
  return(coef(fit))
}

# Bootstrapping to obtain the parameter estimates of the model for every sample, using the function above
results2=boot(data=dat, statistic=bs, R=1000, formula=CH4~DMI+EE)

# Bias, standard errors, and the distributions of the parameter estimates
results2
plot(results2, index=1) # Intercept
plot(results2, index=2) # Slope (DMI)
plot(results2, index=3) # Slope (EE)

# Nonparametric bootstrap 95% CIs
boot.ci(results2, type="bca", index=1) # Intercept
boot.ci(results2, type="bca", index=2) # Slope (DMI)
boot.ci(results2, type="bca", index=3) # Slope (EE)
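# Optional sanity check (a minimal sketch, not part of the original exercise): the bootstrap
# standard errors printed by results2 can be compared with the analytical standard errors
# from an ordinary lm() fit, found in the "Std. Error" column of the coefficient table.
summary(lm(CH4~DMI+EE, data=dat))$coefficients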