################################# NANP Nutrition Models Workshop ################################# # Automated model selection (AMS): Part II (exercises) # Dr. Veridiana L. Daley Email: veridi7@vt.edu # Objective: Apply the AMS approach as a tool for the selection of prediction models. # The dataset used in this lesson can be downloaded from the NANP website (Dairy digestion, https://animalnutrition.org/modeling-database) # Tips: # Use # to make comments (green letters). # In the script the codes are in black and blue. # In the console the codes are in blue and results in black. # Executing code (two options): #1. Select the code and press the Ctrl+Enter. #2. Select the code and press the Run toolbar button (on the top right side of this script). # Clear the Console press Ctrl+L # Restart the RStudio session Ctrl+Shift+F10 ################################ Before the Workshop ############################################# # 1. Install the packages (only run the code below if you DID NOT install the libraries previously) # install.packages("nameofthepackage") install.packages("readr") install.packages("tidyverse") install.packages("psych") install.packages("reshape2") install.packages("gglot2") install.packages("png") install.packages("parallel") install.packages("lme4") install.packages("MuMIn") install.packages("sjPlot") install.packages("epiR") install.packages("sjlabelled") install.packages("sjmisc") install.packages("sjstats") install.packages("ggeffects") install.packages("sjPlot") install.packages("rlang") # If you want to unload a package because of an error, use the code below and then install the package again # detach("package:nameofthepackage") # If you are a new RStudio user, please read the R manual on the NANP website # Link: https://animalnutrition.org/sites/default/files/2018-06/Installing%20and%20using%20R%20updated%207.26.2017.pdf ################################ Lesson 4 ######################################################### ################ 1. Import Dataset ################ # Clean up your environment on RStudio. rm(list = ls(all = TRUE)) # Delete all R object graphics.off() # Delete all open graphics # Set your working directory # Option A: Set your working directory: Ctrl+Shift+H > Go to USB drive > Choose the folder "Lesson4"> Select Open. # Option B: Go to Session > Set Working Directory > Choose directory > Computer > Removable disk (USB drive) > Lesson4 > Open. # Example: setwd(" ") # *Fill out with your Working Directory information, you can copy it and paste from Console # Read the data (CSV file) library(readr) # Load the library necessary to open the data.CSV file (make sure that this library was installed previously, and run the code twice to make sure that it is in blue color on the Console) # Option A: Load the dataset named "data" (from your USB drive) data <- read_csv("Exercise #3 data file.csv") # 658 observations and 26 variables (see it at the "Global Environment" on the top right of the your screen)) # Option B: Go to "Global Environment" > Import Dataset > From Text (readr) > File > data.CSV (USB drive) > Open > Import ################ 2.Data Cleaning ################### # Show the names of all variables ls(data) # Summary of the dataset summary(data) # Rename an observation with misspelling case View(data) data$An_BW <- ifelse(data$An_BW == 58300, 583, data$An_BW) # Remove studies (PubIDS) with missing BW data data <- subset(data, data$PubID != 2 & data$PubID != 4) # Remove variables ls(data) data$X1 <- NULL data$TrtName <- NULL data$Obs_MilkLacp <- NULL # Add new variables to a data frame data$An_BWm <- data$An_BW ^ 0.75 # calculate metabolic body weight # Create a copy of the database and name it as "d" d <- data # See two data.frame on the Global Environment # Check column classes and changing it to "numeric" if necessary class(d$An_BW) # one variable sapply(d, class) # all variables # If necessary to change the variable class to numeric, use the code example: data$variablename <- as.numeric(data$variablename) # If necessary to change the variable class to character, use the code example: data$variablename <- as.character(data$variablename) ################### 3.Plots and Remove Outliers ################### library(reshape2) # Run the library library(ggplot2) # Run the library # Plot 1: Plot of data to indentify outliers mlt <- d[c("PubID", "TID", "An_DMIn", "An_BW", "An_LactDays", "Dt_ADF")] # fill out with all variables of the dataset mlt <- melt(mlt, id="TID") p1 <- ggplot(mlt, aes(x=value))+geom_density(alpha = 0.2)+facet_wrap(~variable, ncol=6, scales="free") + theme(strip.text = element_text(size = 20, color = "white", hjust = 0.5,vjust = 0.5, face = "bold"), strip.background = element_rect( fill = "deepskyblue4", color = NA )) + theme(axis.text=element_text(size=15), axis.title=element_text(size=15,face="bold")) p1 # Plot 2: Histogram DMI in kg/day p2 <- ggplot(d, aes(An_DMIn)) + geom_histogram(color="red", fill="red", alpha = .5, position = "identity") + theme(axis.text=element_text(size=18), axis.title=element_text(size=18,face="bold")) p2 # Plot 3: Histogram of dry matter intake (An_DMIn, kg/d) and days in milk (An_LactDays) p3 <- ggplot(d, aes(x = An_DMIn, fill = as.factor(An_LactDays))) + geom_histogram() + theme(axis.text=element_text(size=18), axis.title=element_text(size=18,face="bold")) + theme(legend.text = element_text(colour="black", size=18),legend.title = element_text(color = "blue", size = 18)) + labs(fill = "Days in Milk (d)") p3 # Save plot library(png) ggsave(p1, file="Figure 1.jpeg", height= 11, width= 20, dpi= 400) ggsave(p2, file="Figure 2.jpeg", height= 11, width= 20, dpi= 400) ggsave(p3, file="Figure 3.jpeg", height= 11, width= 20, dpi= 400) # Remove remove outliers using the standard deviations (do it to all candidate variables) # Variable DMI summary(d$An_DMIn) upper <- mean(d$An_DMIn) + (3*sd(d$An_DMIn)) # cutoff 3 standard deviations from the mean (apply the sd function) upper lower <- mean(d$An_DMIn) - (3*sd(d$An_DMIn)) # cutoff 3 standard deviations from the mean (apply the sd function) lower d <- d[which(d$An_DMIn > lower & d$An_DMIn < upper), ] summary(d$An_DMIn) ################### 4. Verify the final dataset ################### ls(d) # 649 observations and 24 variables # Make sure if there are missing values (NAs) summary(d) # Remove missing values, if necessary # Code example: d <- d [!is.na(dataset$nameofvariable), ] d <- d[!is.na(d$Obs_MilkPrtp), ] # d= 645 observations and 24 variables summary(d) ################################################################################################# # Automated model selection ################################################################################################# # Clean up your Console Ctrl+L graphics.off() # Delete all open graphics ################### 1. Parallel Computing in RStudio ########### library(parallel) # Enable multiple computations to take place at the same time. # # Find out how many cores your processor has num_cores <- (detectCores()) # All cores num_cores # Calculate the number of cores to use num_cores <- (detectCores() - 1) num_cores # Initiate cluster cl <- makeCluster(3) # Load packages on the cluster named cl clusterEvalQ(cl, library(lme4)) clusterEvalQ(cl, library(MuMIn)) # Load the dataset named d on the cluster clusterExport(cl, "d") ################### 2. Describe the global mixed model using all potential predictor variables ########### library(lme4) # run the library library(MuMIn) # run the library # Fit the Global Model (include all independent variables that potentially affect the DMI) m1_global <- lmer(An_DMIn ~ Obs_MilkProd + An_LactDays + An_BW + Dt_Forage + Dt_St + Dt_FA + (1|PubID), data=d, weights = sqrt(N_study), REML=FALSE) # Linear mixed model fit by maximum likelihood summary(m1_global) warnings() ################### 3. Derive a Set of Candidate Models ################### # Use the "dredge" function to generate models using combinations of the terms in the global model. # The function will also calculate AICc values and rank models according to it. # Note: the small size of the sample necessitates the use of AICc # Set up "NA action" options(na.action = "na.fail") # Set up required independent variables ReqVars <- c("An_BW") # fixed = ReqVars # Use the dredge function to fit models allmods <- pdredge(m1_global, cluster = cl, rank = "AICc", trace=2, m.lim=c(0, 20)) #allmods <- pdredge(m1_global, cluster = cl, rank = "AICc", fixed = "An_BW", trace=2, m.lim=c(0, 20)) warnings() # Return the "NA action" options(na.action = "na.omit") # Open the set of candidate models View(allmods) # Stop cluster after finish stopCluster(cl) ################### 4. Collecting Results ################### # Create two datasets of candidate models allmods <- allmods # all candidate models are collected Best20 <- head(allmods,20) # 20 models with the lowest AICc values are collected # Save the datasets in your working directory (computer) save(allmods, file="allmods df.Rda") save(Best20, file="Best20 df.Rda") # Save the dataset as .CSV n your working directory (computer) write.csv(allmods, file = "allmods.csv") write.csv(Best20, file = "Best20.csv") # Load the models from rda file # load(file = "allmods.rda") # head(allmods,20) ################### 5. Comparisons ################### library(lme4) # Run the library library(MuMIn) # Run the library # Save coefficients in a table avgBest20 <- coefTable(Best20) # Models delta < 5 (fitted ML as in the global model) avgmod <- summary(model.avg(get.models(Best20, subset=delta<5))) avgmod # Collect and compare models using ANOVA basmod <- m1_global m1 <-get.models(Best20, subset = 1)[[1]] m2 <-get.models(Best20, subset = 2)[[1]] m3 <-get.models(Best20, subset = 3)[[1]] m4 <-get.models(Best20, subset = 4)[[1]] m5 <-get.models(Best20, subset = 5)[[1]] summary(m1) summary(m2) summary(m3) summary(m4) summary(m5) anova(basmod, m1) # anova(basmod, m2) # anova(basmod, m3) # anova(basmod, m4) # anova(basmod, m5) # ##################### 6. Collect Automatically the Models in a Table ##################################### library(sjlabelled) library(sjmisc) library(sjstats) library(ggeffects) library(rlang) library(sjPlot) tab_model(m1, m2, m3, m4, show.se = TRUE, show.aic = TRUE, show.aicc = TRUE, show.ci = FALSE, show.r2 = FALSE, collapse.ci = FALSE, show.std = FALSE, show.stat = TRUE, show.icc = FALSE, string.se = "SE", string.p = "P", digits = 3, dv.labels = c("Model 1", "Model 2","Model 3","Model 4"), col.order = c("est","se", "ci", "p")) ################### 7. Variance inflation factor (VIF) ################### # Reference: NANP Nutrition Models Workshop. 2017 ADSAŽ Annual. R.R. White, 2017. # Link: https://animalnutrition.org/sites/default/files/2018-04/Lesson%206%20exercise%20R%20code.R # Function to return variance inflation factors VIF <- function (fit) { v <- vcov(fit) nam <- names(fixef(fit)) ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)")) if (ns > 0) { v <- v[-(1:ns), -(1:ns), drop = FALSE] nam <- nam[-(1:ns)] } d <- diag(v)^0.5 v <- diag(solve(v/(d %o% d))) names(v) <- nam return(v) } #Check VIF of the candidate models VIF(m1) VIF(m2) VIF(m3) VIF(m4) ##################### 8. Model Evaluation ##################################### library(epiR) library(MuMIn) # Run the function fitstat <- function(m) { m <- m o <- attributes(m)$frame[,1] p<-fitted(m) rand <- ranef(m)$PubID PubID <- attributes(ranef(m)$PubID)$row.names temp <- data.frame(PubID, rand) new <- merge(attributes(m)$frame, temp, by="PubID") p2 <- fitted(m)-new$X.Intercept. meano <- mean(o, na.rm=TRUE) meanp <- mean(p, na.rm=TRUE) res=o-p resad = o-p2 res2=res^2 resad2 = resad^2 rm=sqrt(mean(res2, na.rm=TRUE)); rmad=sqrt(mean(resad2, na.rm=TRUE)) uss=sum(res2, na.rm=TRUE); lo <- ifelse(is.na(o)==FALSE, 1, 0) n=sum(lo); meanO=mean(o, na.rm=TRUE); mb=sum(res, na.rm=TRUE)/n; sse <- anova(lm(res~p))[2,2]; msb <- mb^2; mspe <- rm^2; msre <- sse/n; msslope <- mspe-msre-msb; mean <- msb/mspe*100; slope <- msslope/mspe*100; residual <- msre/mspe*100; check <- mean+slope+residual rsr <- rmad/sd(o, na.rm=TRUE) ccc <- epi.ccc(o,p)$rho.c[1] cccad <- epi.ccc(o,p2)$rho.c[1] rmp = rm/meanp*100 rmpad = rmad/mean(p2, na.rm=TRUE)*100 mb <- mean(res, na.rm=TRUE) sb <- coef(lm(res~p))[2] aicc <- AICc(m) Shats <- attributes(VarCorr(m)$PubID)$stddev Shate <- attributes(VarCorr(m))$sc Shats_e <- Shats/Shate output <- format(c(meano, meanp, rm, rmad, rmp, rmpad, mean, slope, residual, check, mb, sb, rsr,ccc[,1],cccad[,1], aicc, Shats, Shate, Shats_e ), scientific=FALSE) labels <- c("Observed Mean", "Predicted Mean", "RMSE, units", "Unadjusted RMSE, units", "RMSE, % mean", "Unadjusted RMSE, % mean", "Mean Bias, % MSE", "Slope Bias, % MSE", "Residual Error", "Error Check", "Mean Bias", "Slope Bias", "RSR", "CCC", "Unadjusted CCC", "AICc", "Sigma Hat Study", "Sigma Hat Error", "Ratio of Sigma Hat Study/Error") out <- data.frame(labels,output) temp <- NULL new <- NULL return(out) } # Call Results fitstat(m1) # CCC= Unadjusted RMSE, % mean= fitstat(m2) # CCC= Unadjusted RMSE, % mean= fitstat(m3) # CCC= Unadjusted RMSE, % mean= fitstat(m4) # CCC= Unadjusted RMSE, % mean= # Save Results format(digits = 3) OutFitStm1 <- as.data.frame(fitstat(m1)) OutFitStm2 <- as.data.frame(fitstat(m2)) OutFitStm3 <- as.data.frame(fitstat(m3)) OutFitStm4 <- as.data.frame(fitstat(m4)) # Export File write.csv(OutFitStm1, file = "OutFitSt_Eqn1.csv") write.csv(OutFitStm2, file = "OutFitSt_Eqn2.csv") write.csv(OutFitStm3, file = "OutFitSt_Eqn3.csv") write.csv(OutFitStm4, file = "OutFitSt_Eqn4.csv") # Plots library(ggplot2) # Transform PubID as a factor d$PubID <- as.factor(d$PubID) # Collect the observed value (o), the predicted value (p), and the residual (res) d$o <- d$An_DMIn d$p <- fitted(m1) d$res <- d$o - d$p # Plot observed versus predicted values p4 <- ggplot(d, aes(x = p, y= o, colour = PubID)) + geom_point() + theme_set(theme_gray(base_size=18)) + xlab("Predicted DMI (kg/d)") + ylab("Observed DMI (kg/d)") + theme(legend.text = element_text(colour="black", size=18)) + theme(legend.text = element_text(colour="black", size=18)) + geom_abline(intercept = 0, slope = 1, color="black", size=0.5) p4 # Residuals p5 <- ggplot(d, aes(x = p, y= res)) + geom_point() + theme_set(theme_gray(base_size=18)) + xlab("Predicted DMI (kg/d)") + ylab("Observed - Predicted DMI (kg/d)") + theme(legend.text = element_text(colour="black", size=18)) + theme(legend.text = element_text(colour="black", size=18)) + geom_hline(yintercept=0) + theme(legend.position = "bottom") p5 # Save plot library(png) ggsave(p4, file="Figure 4.jpeg", height= 11, width= 20, dpi= 400) ggsave(p5, file="Figure 5.jpeg", height= 11, width= 20, dpi= 400) # Use the dataset to develop empirical models to predict the intake using different variables in the global model # Use interactions or powers of the predictor variables.