#R Code for ADSA Example 1 #Updated 6.3.2018 #With Questions Contact: # Robin White # rrwhite@vt.edu # 509-701-9290 #___________________________________________Open Packages_______________________________________________ library(ggplot2) library(reshape2) library(plyr) library(lme4) library(lmerTest) #______________________________________Open Data From Working Directory___________________________________ # what is your working directory? getwd() # Move the data file to this location before running the next command # You can also change the working directory using setwd("path/to/folder") d <- data.frame(read.csv("exercise #3 data file.csv")) #_______________________________________Visualize The Data______________________________________________ d1 <- d[c("TrtID" ,"Acmolp","Propmolp","Butmolp","BW","DaysInMilk","dietDM","dietCP","dietNDF", "dietADF","dietAsh","dietStarch","dietFat","dietNFC", "dietForageNDF","dietForage","DMIntake","MilkProd","MilkFatpct","MilkProtpct")] mlt <- melt(d1, id="TrtID") ggplot(mlt, aes(x=value))+geom_density()+facet_wrap(~variable, ncol=7, scales="free") #_______________________________________Correct Outliers_________________________________________________ summary(d$BW) d[d$BW>1000,] # The weird value is in study #6 d[d$Study==6,] # Look at the BW of the other animals in the trtID #Assume this is an extra 5 d[d$TrtID==24,"BW"] <- 515 d[d$Study == 6,] #_______________________________________Correct SEM Errors_______________________________________ SEM <- tapply(d$MeanRumenpHSEM, d$Model, mean, na.rm=TRUE)[2] SEF <- tapply(d$MeanRumenpHSEM, d$Model, mean, na.rm=TRUE)[1] d$StatSEM <- ifelse(d$Model=="M", d$MeanRumenpHSEM/SEM, d$MeanRumenpHSEM/SEF) d$Trunc <- ifelse(d$StatSEM 0) { v <- v[-(1:ns), -(1:ns), drop = FALSE] nam <- nam[-(1:ns)] } d <- diag(v)^0.5 v <- diag(solve(v/(d %o% d))) names(v) <- nam return(v) } #Check VIF of model 1 VIF(m1) #Check VIF of model 2 VIF(m2) # Remove dietNFC with VIF of 127 m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietAsh + dietStarch + dietFat + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m1) # Remove dietNDF with p-value = 0.862441 m1 <- lmer(MeanRumpH ~ BW + dietCP + dietAsh + dietStarch + dietFat + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m1) # Remove dietFat with p-value = 0.7549 m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietAsh + dietStarch + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m1) # Remove Ash with p-value = 0.907 m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietStarch + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m1) # Remove Starch with p-value = 0.907 m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m1) # Remove NDF with p-value = 0.699744 m1 <- lmer(MeanRumpH ~ BW + dietCP + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m1) #__________________________________________Re-test Dropped Parameters_________________________________________________ #Test DM again m2 <- lmer(MeanRumpH ~ BW + dietCP + dietDM + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m2) #Test Fat again m2 <- lmer(MeanRumpH ~ BW + dietCP +dietFat + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m2) #Test ADF again m2 <- lmer(MeanRumpH ~ BW + dietCP +dietADF + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m2) # Drop BW with a p-value of 0.77978 m2 <- lmer(MeanRumpH ~ dietCP +dietADF + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m2) # Drop ADF with a p-value of 0.12732 m2 <- lmer(MeanRumpH ~ dietCP + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m2) # Drop CP with a p-value of 0.113 m2 <- lmer(MeanRumpH ~ dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt) summary(m2) VIF(m1) VIF(m2) #__________________________________________Check Model Fit_________________________________________________ #Generate an equation to estimate model fit statistics RMSE <- function(m) { m <- m o <- attributes(m)$frame[,1] p<-fitted(m) rand <- ranef(m)$Author Author <- attributes(ranef(m)$Author)$row.names temp <- data.frame(Author, rand) new <- merge(attributes(m)$frame, temp, by="Author") p2 <- fitted(m)-new$X.Intercept. meano <- mean(o, na.rm=TRUE) meanp <- mean(p, na.rm=TRUE) res=o-p resad = o-p2 res2=res^2 resad2 = resad^2 rm=sqrt(mean(res2, na.rm=TRUE)); rmad=sqrt(mean(resad2, na.rm=TRUE)) uss=sum(res2, na.rm=TRUE); lo <- ifelse(is.na(o)==FALSE, 1, 0) n=sum(lo); meanO=mean(o, na.rm=TRUE); mb=sum(res, na.rm=TRUE)/n; sse <- anova(lm(res~p))[2,2]; msb <- mb^2; mspe <- rm^2; msre <- sse/n; msslope <- mspe-msre-msb; mean <- msb/mspe*100; slope <- msslope/mspe*100; residual <- msre/mspe*100; check <- mean+slope+residual rsr <- rmad/sd(o, na.rm=TRUE) ccc <- epi.ccc(o,p)$rho.c[1] cccad <- epi.ccc(o,p2)$rho.c[1] rmp = rm/meano*100 rmpad = rmad/mean(p2, na.rm=TRUE)*100 mb <- mean(res, na.rm=TRUE) sb <- coef(lm(res~p))[2] aicc <- AICc(m) Shats <- attributes(VarCorr(m)$Author)$stddev Shate <- attributes(VarCorr(m))$sc Shats_e <- Shats/Shate output <- format(c(n,meano, meanp, rm, rmad, rmp, rmpad, mean, slope, residual, check, mb, sb, rsr,ccc[,1],cccad[,1], aicc, Shats, Shate, Shats_e ), scientific=FALSE) labels <- c("N", "Observed Mean", "Predicted Mean", "RMSE, units", "Unadjusted RMSE, units", "RMSE, % mean", "Unadjusted RMSE, % mean", "Mean Bias, % MSE", "Slope Bias, % MSE", "Residual Error", "Error Check", "Mean Bias", "Slope Bias", "RSR", "CCC", "Unadjusted CCC", "AICc", "Sigma Hat Study", "Sigma Hat Error", "Ratio of Sigma Hat Study/Error") out <- data.frame(labels,output) temp <- NULL new <- NULL return(out) } #Open additional required packages library(epiR) library(MuMIn) #Output fit statistics on m1 RMSE(m1) #Output fit statistics on m2 RMSE(m2) # ________________ But we are testing on the training data _________________________ # ________________ Let's try cross validation schemes ______________________________ library(caret) # K-folds CV k = 10 df2 <- d[complete.cases(d$MeanRumpH),] values <- df2$MeanRumpH variables <- df2[,c("TrtID" ,"Acmolp","Propmolp","Butmolp","BW","DaysInMilk","dietDM","dietCP","dietNDF", "dietADF","dietAsh","dietStarch","dietFat","dietNFC", "dietForageNDF","dietForage","DMIntake","MilkProd","MilkFatpct","MilkProtpct")] folds <- createFolds(values, k = k) output <- data.frame(fold = NA, m1_rmse = NA, m2_rmse = NA) for(i in 1:length(folds)) { train <- df2[-unlist(folds[i]),] test <- df2[unlist(folds[i]),] m1 <- lmer(MeanRumpH ~ BW + dietCP + dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt) m2 <- lmer(MeanRumpH ~ dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt) m1_rmse = sqrt(mean((test$MeanRumpH - predict(m1, test, allow.new.levels = TRUE))^2, na.rm = T)) m2_rmse = sqrt(mean((test$MeanRumpH - predict(m2, test, allow.new.levels = TRUE))^2, na.rm = T)) row = data.frame(fold = i, m1_rmse, m2_rmse) output[i,] <- row } print(colMeans(output[,2:3], na.rm = T)) # Leave one out CV df2 <- d[complete.cases(d$MeanRumpH),] values <- df2$MeanRumpH variables <- df2[,c("TrtID" ,"Acmolp","Propmolp","Butmolp","BW","DaysInMilk","dietDM","dietCP","dietNDF", "dietADF","dietAsh","dietStarch","dietFat","dietNFC", "dietForageNDF","dietForage","DMIntake","MilkProd","MilkFatpct","MilkProtpct")] folds <- nrow(df2) output <- data.frame(fold = NA, m1_rmse = NA, m2_rmse = NA) for(i in 1:folds) { train <- df2[-i,] test <- df2[i,] m1 <- lmer(MeanRumpH ~ BW + dietCP + dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt) m2 <- lmer(MeanRumpH ~ dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt) m1_rmse = sqrt(mean((test$MeanRumpH - predict(m1, test, allow.new.levels = TRUE))^2, na.rm = T)) m2_rmse = sqrt(mean((test$MeanRumpH - predict(m2, test, allow.new.levels = TRUE))^2, na.rm = T)) row = data.frame(fold = i, m1_rmse, m2_rmse) output[i,] <- row } print(colMeans(output[,2:3], na.rm = T))