# Setting up a working directory
setwd("~/ADSA 2019")
# Go to "Session", then "Working Directory", and click on "Choose Directory...".
# Choose a folder on your computer where you have saved the data and where you want to save output (e.g., plots) from R.

# Importing the data
dat=read.csv(file="Exercise #2 data file.csv")

# Double-checking the variables in the data
head(dat)

#####CROSS VALIDATION WITH THE CARET PACKAGE#####

# Installing the "caret" package
#install.packages("caret")
# The "caret" package utilizes a number of other packages.
# R will keep asking you to install those other packages, particularly if your R version has not been updated.
# So, installation of the "caret" package is much easier with newer R versions (3.5 or above).

# Loading the "caret" package
library(caret)

########################### Evaluating a simple linear regression model including only DMI ###########################

######THE HOLD-OUT METHOD######
# Traditionally, cross validation is applied by splitting the data into two sets: training and test.
# The training and test datasets are used for model development and evaluation, respectively.
# This first method is called the "hold-out" method, where a single random split is made.

# Splitting the data using the "createDataPartition" function, allocating 70% for training (model development) and 30% for testing (model evaluation)
set.seed(123) # Set the pseudo-random number generator to a specific sequence, just so we all get the same answer.
# In practice, you do not need to set the seed.
partition=createDataPartition(dat$CH4, p=0.7, list=FALSE) # So 70:30 is your partition rule
TrainingData=dat[partition,]
TestData=dat[-partition,]

# Developing a simple linear regression (SLR) model with DMI as the predictor (independent) variable
SLR_Hold_Out=lm(data=TrainingData, CH4~DMI)
summary(SLR_Hold_Out)

# Evaluating the model on the test data by calculating the root mean square error (RMSE)
preds=predict(SLR_Hold_Out, newdata=TestData)
resid=TestData$CH4 - preds
RMSE=sqrt(mean(resid^2))
RMSE

# Calculating the mean absolute error (MAE)
MAE=mean(abs(resid))
MAE

#####K-FOLD CROSS VALIDATION#####
### 10-fold (k=10) cross validation (90% of the data for training and 10% for testing, over 10 folds) ###
set.seed(536)
SplitRule=trainControl(method="cv", number=10) # Defining the number of folds (=10) for the cross validation (CV)
KF10_SLR=train(CH4~DMI, data=dat, trControl=SplitRule, method="lm") # Executing the 10-fold CV for a linear model (method="lm") including only DMI

# Viewing the overall RMSE, R-squared, and MAE (the averages of the 10 individual evaluations)
KF10_SLR$results

# Viewing the RMSE, R-squared, and MAE values of each individual fold
KF10_SLR$resample

# Obtaining the final model parameters (fitted with all the data)
KF10_SLR$finalModel
# You will get the same parameters if you fit the SLR model using all the data: summary(lm(data=dat, CH4~DMI))

# Plotting observed values against predicted values
PredCH4_KF10=predict(KF10_SLR, dat) # Obtaining the predicted values
plot(dat$CH4~PredCH4_KF10, cex=1.4, pch=18, ylim=c(150, 500), xlim=c(150, 500),
     ylab="Observed", xlab="Predicted",
     main="K=10 (RMSE = 40.1, R2 = 0.52, MAE = 31.7)", cex.lab=1.3)
Regline=lm(dat$CH4~PredCH4_KF10)
abline(Regline, lty=3, lwd=3.2, col="red") # Adding the regression line
abline(0, 1) # Adding the unity line
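# Optional extra (a minimal sketch, not part of the original exercise): the averages in
# KF10_SLR$results summarize considerable fold-to-fold variability, which you can inspect
# directly, since KF10_SLR$resample is a data frame with one row per fold.
hist(KF10_SLR$resample$RMSE, xlab="RMSE per fold", main="Fold-to-fold RMSE (10-fold CV)")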
######REPEATED K-FOLD CROSS VALIDATION#####
set.seed(536)
SplitRule=trainControl(method="repeatedcv", number=10, repeats=100) # Modifying the split rule: "repeatedcv" as the method, and specifying how many repeats to perform
KF10r_SLR=train(CH4~DMI, data=dat, trControl=SplitRule, method="lm") # Repeating the 10-fold CV 100 times for a linear model (method="lm") including only DMI

# Viewing the overall RMSE, R-squared, and MAE (the averages of the individual evaluations)
KF10r_SLR$results

# Viewing the RMSE, R-squared, and MAE values of each individual fold and repeat
KF10r_SLR$resample

# Obtaining the final model parameters
KF10r_SLR$finalModel # Same as before, because the final model uses all the data (n = 150)

#####LEAVE-ONE-OUT CROSS VALIDATION#####
set.seed(825)
SplitRule=trainControl(method="LOOCV")
LOO_SLR=train(CH4~DMI, data=dat, trControl=SplitRule, method="lm")
LOO_SLR$results
LOO_SLR$finalModel # Again, same as before, since the final model uses all the data

########################### Evaluating a multiple linear regression (MLR) model including DMI and EE ###########################
## Let's use K-fold (k=10) cross validation in the same way we applied it above for the simple linear regression model (KF10_SLR)
set.seed(789)
SplitRule=trainControl(method="cv", number=10)
KF10_MLR=train(CH4~DMI+EE, data=dat, trControl=SplitRule, method="lm") # Executing the 10-fold CV for a linear model (method="lm") including both DMI and EE
KF10_MLR$results
KF10_MLR$resample
KF10_MLR$finalModel

###### Comparing the performance of the simple (SLR) and multiple (MLR) linear regression models
model_list=list(SLR=KF10_SLR, MLR=KF10_MLR)
performance=resamples(model_list)
summary(performance)

#####NONPARAMETRIC BOOTSTRAP WITH THE BOOT PACKAGE#####
#install.packages("boot")
library(boot)

### Let's use bootstrapping to learn about the uncertainty associated with the R-squared (rsq) of the multiple linear regression model

### Creating a function to obtain rsq
rsq=function(formula, data, indices){
  d=data[indices,] # d = one bootstrap sample from the data
  fit=lm(formula, data=d)
  return(summary(fit)$r.squared)
}

# Bootstrapping with 1000 bootstrap samples using the boot function.
# The model is fitted and rsq is calculated for each bootstrap sample.
results=boot(data=dat, statistic=rsq, R=1000, formula=CH4~DMI+EE)

# Let's view the uncertainty of the rsq values
results
plot(results)

# Nonparametric bootstrap 95% CI
boot.ci(results, type="bca")

### In this second example, we calculate the standard errors and 95% confidence intervals of the parameter estimates
# using nonparametric bootstrapping with 1000 bootstrap samples.

### Again, we have to create a function to obtain the parameter estimates (regression coefficients) for each sample
bs=function(formula, data, indices){
  d=data[indices,] # d = one bootstrap sample from the data
  fit=lm(formula, data=d)
  return(coef(fit))
}

# Bootstrapping to obtain the parameter estimates of the model for every sample, using the function above
results2=boot(data=dat, statistic=bs, R=1000, formula=CH4~DMI+EE)

# Bias, standard errors, and the distributions of the parameter estimates
results2
plot(results2, index=1) # Intercept
plot(results2, index=2) # Slope (DMI)
plot(results2, index=3) # Slope (EE)

# Nonparametric bootstrap 95% CIs
boot.ci(results2, type="bca", index=1) # Intercept
boot.ci(results2, type="bca", index=2) # Slope (DMI)
boot.ci(results2, type="bca", index=3) # Slope (EE)
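# Optional sanity check (a minimal sketch, not part of the original exercise): the bootstrap
# standard errors printed by results2 can be compared with the analytical standard errors
# from an ordinary lm() fit, found in the "Std. Error" column of the coefficient table.
summary(lm(CH4~DMI+EE, data=dat))$coefficients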