################################# NANP Nutrition Models Workshop #################################

# Automated model selection (AMS): Part II (exercises) 
# Dr. Veridiana L. Daley Email: veridi7@vt.edu

# Objective: Apply the AMS approach as a tool for the selection of prediction models.
# The dataset used in this lesson can be downloaded from the NANP website (Dairy digestion, https://animalnutrition.org/modeling-database)

# Tips:
      # Use # to make comments (green letters).
      # In the script the codes are in black and blue.
      # In the console the codes are in blue and results in black.
      # Executing code (two options): 
        #1. Select the code and press the Ctrl+Enter.
        #2. Select the code and press the Run toolbar button (on the top right side of this script).
      # Clear the Console press Ctrl+L
      # Restart the RStudio session  Ctrl+Shift+F10

################################ Before the Workshop #############################################

# 1. Install the packages (only run the code below if you DID NOT install the libraries previously)
# install.packages("nameofthepackage")
install.packages("readr")
install.packages("tidyverse")
install.packages("psych")
install.packages("reshape2")
install.packages("gglot2")
install.packages("png")
install.packages("parallel")
install.packages("lme4")
install.packages("MuMIn")
install.packages("sjPlot")
install.packages("epiR")
install.packages("sjlabelled")
install.packages("sjmisc")
install.packages("sjstats")
install.packages("ggeffects")
install.packages("sjPlot")
install.packages("rlang")

# If you want to unload a package because of an error, use the code below and then install the package again
# detach("package:nameofthepackage")

# If you are a new RStudio user, please read the R manual on the NANP website
# Link: https://animalnutrition.org/sites/default/files/2018-06/Installing%20and%20using%20R%20updated%207.26.2017.pdf


################################  Lesson 4 #########################################################


################ 1. Import Dataset ################

# Clean up your environment on RStudio.
    rm(list = ls(all = TRUE)) # Delete all R object
    graphics.off() # Delete all open graphics 

# Set your working directory
    
    # Option A: Set your working directory: Ctrl+Shift+H > Go to USB drive > Choose the folder "Lesson4"> Select Open.
    # Option B: Go to Session > Set Working Directory > Choose directory > Computer > Removable disk (USB drive) > Lesson4 > Open.
    # Example:  
    setwd(" ") # *Fill out with your Working Directory information, you can copy it and paste from Console
 
# Read the data (CSV file) 
    library(readr)   # Load the library necessary to open the data.CSV file (make sure that this library was installed previously, and run the code twice to make sure that it is in blue color on the Console)
    
    # Option A: Load the dataset named "data" (from your USB drive)
    data <- read_csv("Exercise #3 data file.csv") # 658 observations and 26 variables (see it at the "Global Environment" on the top right of the your screen))
    
    # Option B: Go to "Global Environment" > Import Dataset > From Text (readr) > File > data.CSV (USB drive) > Open > Import

################ 2.Data Cleaning ###################

# Show the names of all variables 
    ls(data)   

# Summary of the dataset
    summary(data)
    
# Rename an observation with misspelling case
    View(data)
    data$An_BW <- ifelse(data$An_BW == 58300, 583, data$An_BW) 

# Remove studies (PubIDS) with missing BW data
    data <- subset(data, data$PubID != 2 & data$PubID != 4)
 
# Remove variables  
    ls(data) 
    data$X1 <- NULL 
    data$TrtName <- NULL    
    data$Obs_MilkLacp <- NULL
    
# Add new variables to a data frame
    data$An_BWm <- data$An_BW ^ 0.75 # calculate metabolic body weight 
    
# Create a copy of the database and name it as "d" 
    d <- data # See two data.frame on the Global Environment 
    
# Check column classes and changing it to "numeric" if necessary
    class(d$An_BW) # one variable
    sapply(d, class) # all variables
    
    # If necessary to change the variable class to numeric, use the code example: data$variablename <- as.numeric(data$variablename)
    # If necessary to change the variable class to character, use the code example: data$variablename <- as.character(data$variablename)

################### 3.Plots and Remove Outliers ###################
    library(reshape2) # Run the library
    library(ggplot2)  # Run the library
    
    # Plot 1: Plot of data to indentify outliers
    mlt <- d[c("PubID", "TID", "An_DMIn", "An_BW", "An_LactDays", "Dt_ADF")] # fill out with all variables of the dataset 
    mlt <- melt(mlt, id="TID")
    p1 <- ggplot(mlt, aes(x=value))+geom_density(alpha = 0.2)+facet_wrap(~variable, ncol=6, scales="free") + theme(strip.text = element_text(size = 20, color = "white", hjust = 0.5,vjust = 0.5, face = "bold"),                                                                                                              strip.background = element_rect( fill = "deepskyblue4", color = NA )) + theme(axis.text=element_text(size=15), axis.title=element_text(size=15,face="bold"))
    p1
    
    # Plot 2: Histogram DMI in kg/day
    p2 <- ggplot(d, aes(An_DMIn)) + geom_histogram(color="red", fill="red", alpha = .5, position = "identity") + theme(axis.text=element_text(size=18), axis.title=element_text(size=18,face="bold"))
    p2
    
    # Plot 3: Histogram of dry matter intake (An_DMIn, kg/d) and days in milk (An_LactDays) 
    p3 <- ggplot(d, aes(x = An_DMIn, fill = as.factor(An_LactDays))) + geom_histogram() + theme(axis.text=element_text(size=18), axis.title=element_text(size=18,face="bold")) + theme(legend.text = element_text(colour="black", size=18),legend.title = element_text(color = "blue", size = 18)) +  labs(fill = "Days in Milk (d)")
    p3

    # Save plot
    library(png)
    ggsave(p1, file="Figure 1.jpeg", height= 11, width= 20, dpi= 400)
    ggsave(p2, file="Figure 2.jpeg", height= 11, width= 20, dpi= 400)
    ggsave(p3, file="Figure 3.jpeg", height= 11, width= 20, dpi= 400)
    
    # Remove remove outliers using the standard deviations  (do it to all candidate variables)
    # Variable DMI
    summary(d$An_DMIn)
    
    upper <- mean(d$An_DMIn) + (3*sd(d$An_DMIn))  # cutoff 3 standard deviations from the mean (apply the sd function)
    upper
    lower <- mean(d$An_DMIn) - (3*sd(d$An_DMIn))  # cutoff 3 standard deviations from the mean (apply the sd function)
    lower
    
    d <- d[which(d$An_DMIn > lower & d$An_DMIn < upper), ]
    summary(d$An_DMIn)

################### 4. Verify the final dataset  ################### 

    ls(d) # 649 observations and 24 variables

    # Make sure if there are missing values (NAs)
    summary(d)
    
    # Remove missing values, if necessary  # Code example: d <- d [!is.na(dataset$nameofvariable), ] 
    d <- d[!is.na(d$Obs_MilkPrtp), ] # d= 645 observations and 24 variables 
    summary(d)

#################################################################################################
#                                   Automated model selection 
################################################################################################# 
 
# Clean up your Console Ctrl+L 
graphics.off() # Delete all open graphics 
    
    
################### 1. Parallel Computing in RStudio ###########
library(parallel) # Enable multiple computations to take place at the same time.
                  # 
   # Find out how many cores your processor has
    num_cores <- (detectCores())		# All cores
    num_cores
    
   # Calculate the number of cores to use
    num_cores <- (detectCores() - 1)	
    num_cores
    
    # Initiate cluster
    cl <- makeCluster(3)	
    
    # Load packages on the cluster named cl
    clusterEvalQ(cl, library(lme4)) 
    clusterEvalQ(cl, library(MuMIn))
    
    # Load the dataset named d on the cluster
    clusterExport(cl, "d")
    
###################  2. Describe the global mixed model using all potential predictor variables ###########
    
    library(lme4) # run the library
    library(MuMIn) # run the library
    
    # Fit the Global Model (include all independent variables that potentially affect the DMI) 

    m1_global <- lmer(An_DMIn ~ Obs_MilkProd + An_LactDays + An_BW + Dt_Forage + Dt_St + Dt_FA +
                        (1|PubID), data=d, weights = sqrt(N_study), REML=FALSE) # Linear mixed model fit by maximum likelihood
    summary(m1_global)
    warnings()
   
################### 3. Derive a Set of Candidate Models ###################

    # Use the "dredge" function to generate models using combinations of the terms in the global model. 
    # The function will also calculate AICc values and rank models according to it. 
    # Note: the small size of the sample necessitates the use of AICc
    
    # Set up "NA action"
    options(na.action = "na.fail") 
    
    # Set up required independent variables
    ReqVars <- c("An_BW") # fixed = ReqVars
    
    # Use the dredge function to fit models
    allmods <- pdredge(m1_global, cluster = cl, rank = "AICc", trace=2, m.lim=c(0, 20))
    #allmods <- pdredge(m1_global, cluster = cl, rank = "AICc", fixed = "An_BW", trace=2, m.lim=c(0, 20))
    warnings()
    
    # Return the "NA action"
    options(na.action = "na.omit") 
    
    # Open the set of candidate models
    View(allmods)

    # Stop cluster after finish
    stopCluster(cl)     
        
################### 4. Collecting Results  ###################    

    # Create two datasets of candidate models       
    allmods <- allmods # all candidate models are collected
    Best20 <- head(allmods,20) # 20 models with the lowest AICc values are collected
    
    # Save the datasets in your working directory (computer)
    save(allmods, file="allmods df.Rda")
    save(Best20, file="Best20 df.Rda")

    # Save the dataset as .CSV n your working directory (computer)
    write.csv(allmods, file = "allmods.csv")
    write.csv(Best20, file = "Best20.csv")

    # Load the models from rda file
     # load(file = "allmods.rda")
     # head(allmods,20)

################### 5. Comparisons  ###################

    library(lme4) # Run the library
    library(MuMIn) # Run the library

    # Save coefficients in a table
    avgBest20 <- coefTable(Best20)

    # Models  delta < 5 (fitted ML as in the global model)
    avgmod <- summary(model.avg(get.models(Best20, subset=delta<5)))
    avgmod 

    # Collect and compare models using ANOVA
    basmod <- m1_global
    m1 <-get.models(Best20, subset = 1)[[1]]
    m2 <-get.models(Best20, subset = 2)[[1]]
    m3 <-get.models(Best20, subset = 3)[[1]]
    m4 <-get.models(Best20, subset = 4)[[1]]
    m5 <-get.models(Best20, subset = 5)[[1]]

    summary(m1) 
    summary(m2)
    summary(m3) 
    summary(m4)
    summary(m5) 

    anova(basmod, m1)	#  
    anova(basmod, m2)	#
    anova(basmod, m3)	# 
    anova(basmod, m4)	# 
    anova(basmod, m5)	# 

##################### 6. Collect Automatically the Models in a Table #####################################
    
    library(sjlabelled)
    library(sjmisc)
    library(sjstats)
    library(ggeffects)
    library(rlang)
    library(sjPlot)
    

    tab_model(m1, m2, m3, m4, show.se = TRUE, show.aic = TRUE, show.aicc = TRUE, show.ci = FALSE,
              show.r2 = FALSE, collapse.ci = FALSE,  show.std = FALSE, show.stat = TRUE, show.icc = FALSE,
              string.se = "SE", string.p = "P", digits = 3, dv.labels = c("Model 1", "Model 2","Model 3","Model 4"),
              col.order = c("est","se", "ci", "p"))
    
################### 7. Variance inflation factor (VIF) ###################

  # Reference: NANP Nutrition Models Workshop. 2017 ADSAŽ Annual. R.R. White, 2017.  
  # Link: https://animalnutrition.org/sites/default/files/2018-04/Lesson%206%20exercise%20R%20code.R

  # Function to return variance inflation factors
  VIF <- function (fit) {
  v <- vcov(fit)
  nam <- names(fixef(fit))
  ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)"))
  if (ns > 0) {
    v <- v[-(1:ns), -(1:ns), drop = FALSE]
    nam <- nam[-(1:ns)]
  }
  d <- diag(v)^0.5
  v <- diag(solve(v/(d %o% d)))
  names(v) <- nam
  return(v)
  }

  #Check VIF of the candidate models
  VIF(m1)
  VIF(m2)
  VIF(m3)
  VIF(m4)


##################### 8. Model Evaluation  #####################################
  
  library(epiR)
  library(MuMIn)

  # Run the function
  fitstat <- function(m) {
  m <- m
  o <- attributes(m)$frame[,1]
  p<-fitted(m)
  rand <- ranef(m)$PubID
  PubID <- attributes(ranef(m)$PubID)$row.names
  temp <- data.frame(PubID, rand)
  new <- merge(attributes(m)$frame, temp, by="PubID")
  p2 <- fitted(m)-new$X.Intercept.
  meano <- mean(o, na.rm=TRUE)
  meanp <- mean(p, na.rm=TRUE)
  res=o-p
  resad = o-p2
  res2=res^2
  resad2 = resad^2
  rm=sqrt(mean(res2, na.rm=TRUE));
  rmad=sqrt(mean(resad2, na.rm=TRUE))
  uss=sum(res2, na.rm=TRUE);
  lo <- ifelse(is.na(o)==FALSE, 1, 0)
  n=sum(lo);
  meanO=mean(o, na.rm=TRUE);
  mb=sum(res, na.rm=TRUE)/n;
  sse <- anova(lm(res~p))[2,2];
  msb <- mb^2;
  mspe <- rm^2;
  msre <- sse/n;
  msslope <- mspe-msre-msb;
  mean <- msb/mspe*100;
  slope <- msslope/mspe*100;
  residual <- msre/mspe*100;
  check <- mean+slope+residual
  rsr <- rmad/sd(o, na.rm=TRUE)
  ccc <- epi.ccc(o,p)$rho.c[1]
  cccad <- epi.ccc(o,p2)$rho.c[1]
  rmp = rm/meanp*100
  rmpad = rmad/mean(p2, na.rm=TRUE)*100
  mb <- mean(res, na.rm=TRUE)
  sb <- coef(lm(res~p))[2]
  aicc <- AICc(m)
  Shats <- attributes(VarCorr(m)$PubID)$stddev
  Shate <- attributes(VarCorr(m))$sc
  Shats_e <- Shats/Shate
  output <- format(c(meano, meanp, rm, rmad, rmp, rmpad, mean, slope, residual, check, mb, sb, rsr,ccc[,1],cccad[,1], aicc, Shats, Shate, Shats_e ), scientific=FALSE)
  labels <- c("Observed Mean", "Predicted Mean", "RMSE, units", "Unadjusted RMSE, units", "RMSE, % mean", "Unadjusted RMSE, % mean", "Mean Bias, % MSE", "Slope Bias, % MSE", "Residual Error", "Error Check", "Mean Bias", "Slope Bias", "RSR", "CCC", "Unadjusted CCC", "AICc", "Sigma Hat Study", "Sigma Hat Error", "Ratio of Sigma Hat Study/Error")
  out <- data.frame(labels,output)
  temp <- NULL
  new <- NULL
  return(out)
  }
  # Call Results
  fitstat(m1) # CCC=   Unadjusted RMSE, % mean= 
  fitstat(m2) # CCC=   Unadjusted RMSE, % mean=
  fitstat(m3) # CCC=   Unadjusted RMSE, % mean=
  fitstat(m4) # CCC=   Unadjusted RMSE, % mean=

  # Save Results 
  format(digits = 3)
  OutFitStm1 <- as.data.frame(fitstat(m1)) 
  OutFitStm2 <- as.data.frame(fitstat(m2)) 
  OutFitStm3 <- as.data.frame(fitstat(m3)) 
  OutFitStm4 <- as.data.frame(fitstat(m4))

  # Export File
  write.csv(OutFitStm1, file = "OutFitSt_Eqn1.csv")
  write.csv(OutFitStm2, file = "OutFitSt_Eqn2.csv")
  write.csv(OutFitStm3, file = "OutFitSt_Eqn3.csv")
  write.csv(OutFitStm4, file = "OutFitSt_Eqn4.csv")

  # Plots
  library(ggplot2)
  
  # Transform PubID as a factor
  d$PubID <- as.factor(d$PubID) 
  
  # Collect the observed value (o), the predicted value (p), and the residual (res)
  d$o <- d$An_DMIn
  d$p <- fitted(m1) 
  d$res <- d$o - d$p
  
  # Plot observed versus predicted values
  p4 <- ggplot(d, aes(x = p, y= o, colour = PubID)) + geom_point() + theme_set(theme_gray(base_size=18)) + xlab("Predicted DMI (kg/d)") + ylab("Observed DMI (kg/d)") + theme(legend.text = element_text(colour="black", size=18)) + theme(legend.text = element_text(colour="black", size=18)) + geom_abline(intercept = 0, slope = 1, color="black", size=0.5)
  p4
  
  # Residuals
  p5 <- ggplot(d, aes(x = p, y= res)) + geom_point() + theme_set(theme_gray(base_size=18)) + xlab("Predicted DMI (kg/d)") + ylab("Observed - Predicted DMI (kg/d)") + theme(legend.text = element_text(colour="black", size=18)) + theme(legend.text = element_text(colour="black", size=18)) + geom_hline(yintercept=0) +  theme(legend.position = "bottom")
  p5
  
  # Save plot
  library(png)
  ggsave(p4, file="Figure 4.jpeg", height= 11, width= 20, dpi= 400)
  ggsave(p5, file="Figure 5.jpeg", height= 11, width= 20, dpi= 400)
  
# Use the dataset to develop empirical models to predict the intake using different variables in the global model
# Use interactions or powers of the predictor variables.