#R Code for ADSA Example 1
#Updated 6.3.2018
#With Questions Contact:
#	Robin White
#	rrwhite@vt.edu
#	509-701-9290


#___________________________________________Open Packages_______________________________________________
library(ggplot2)
library(reshape2)
library(plyr)
library(lme4)
library(lmerTest)


#______________________________________Open Data From Working Directory___________________________________

# what is your working directory?
getwd()

# Move the data file to this location before running the next command
# You can also change the working directory using setwd("path/to/folder")
d <- data.frame(read.csv("exercise #3 data file.csv"))

#_______________________________________Visualize The Data______________________________________________
d1 <- d[c("TrtID" ,"Acmolp","Propmolp","Butmolp","BW","DaysInMilk","dietDM","dietCP","dietNDF",
          "dietADF","dietAsh","dietStarch","dietFat","dietNFC",
          "dietForageNDF","dietForage","DMIntake","MilkProd","MilkFatpct","MilkProtpct")]


mlt <- melt(d1, id="TrtID")
ggplot(mlt, aes(x=value))+geom_density()+facet_wrap(~variable, ncol=7, scales="free")

#_______________________________________Correct Outliers_________________________________________________
summary(d$BW)
d[d$BW>1000,] # The weird value is in study #6
d[d$Study==6,] # Look at the BW of the other animals in the trtID

#Assume this is an extra 5
d[d$TrtID==24,"BW"] <- 515
d[d$Study == 6,]

#_______________________________________Correct SEM Errors_______________________________________
SEM <- tapply(d$MeanRumenpHSEM, d$Model, mean, na.rm=TRUE)[2]
SEF <- tapply(d$MeanRumenpHSEM, d$Model, mean, na.rm=TRUE)[1]
d$StatSEM <- ifelse(d$Model=="M",  d$MeanRumenpHSEM/SEM, d$MeanRumenpHSEM/SEF)
d$Trunc <- ifelse(d$StatSEM<mean(d$StatSEM, na.rm=TRUE)*0.5,mean(d$StatSEM, na.rm=TRUE)*0.25,d$StatSEM)
d$MeanRumpH_wt <- 1/d$Trunc
pct <- ifelse(d$StatSEM<mean(d$StatSEM, na.rm=TRUE)*0.5,1,0)
nonna <- ifelse(is.na(d$StatSEM)==TRUE, 0, 1)
sum(pct, na.rm=TRUE)/sum(nonna, na.rm=TRUE)

#Reduce cutoff to prevent excessive curtailing
d$Trunc <- ifelse(d$StatSEM<mean(d$StatSEM, na.rm=TRUE)*0.4,mean(d$StatSEM, na.rm=TRUE)*0.4,d$StatSEM)
d$MeanRumpH_wt <- 1/d$Trunc
pct <- ifelse(d$StatSEM<mean(d$StatSEM, na.rm=TRUE)*0.4,1,0)
nonna <- ifelse(is.na(d$StatSEM)==TRUE, 0, 1)
sum(pct, na.rm=TRUE)/sum(nonna, na.rm=TRUE)


#______________________________________Making a summary table________________________________________
d2 <- d[,sapply(d, is.factor)==F]


bettersum <- function(d) {
  grid <- data.frame(id=NA, n=NA, mean=NA, sd=NA, min=NA, max=NA)
  for(i in 1:length(names(d))) {
    id <- names(d)[i]
    n <- ifelse(is.na(d[,i])==FALSE, 1,0)
    n <- sum(n)
    Mean <- mean(d[,i], na.rm=TRUE)
    SD <- sd(d[,i], na.rm=TRUE)
    min <- min(d[,i], na.rm=TRUE)
    max <- max(d[,i], na.rm=TRUE)
    row <- data.frame(id,n, Mean, SD, min, max)
    grid[i,] <- row
  }
  grid$id <- names(d)
  return(grid)
}
bettersum(d2)


#__________________________________________Derive Model, Phase 1_________________________________________________
#Include all variables
m1 <- lmer(MeanRumpH ~ BW + DaysInMilk +  dietDM + dietCP + dietNDF + dietADF +
             dietAsh + dietStarch + dietFat + dietNFC + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)

# Remove dietDM with p-value of 0.7659
m1 <- lmer(MeanRumpH ~ BW + DaysInMilk + dietCP + dietNDF + dietADF +
             dietAsh + dietStarch + dietFat + dietNFC + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove DaysInMilk with p-value of 0.616887
m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietADF +
             dietAsh + dietStarch + dietFat + dietNFC + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove dietADF with p-value of 0.573144
m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF +
             dietAsh + dietStarch + dietFat + dietNFC + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)

#__________________________________________Re-test Dropped Parameters_________________________________________________
#Test DM again
m2 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietDM +
             dietAsh + dietStarch + dietFat + dietNFC + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)
# Remove dietForageNDF with p-value of 0.54982
m2 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietDM +
             dietAsh + dietStarch + dietFat + dietNFC + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)
#Test DIM again
m2 <- lmer(MeanRumpH ~ BW + DaysInMilk + dietCP + dietNDF + dietDM +
             dietAsh + dietStarch + dietFat + dietNFC + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

#Test ADF again
m2 <- lmer(MeanRumpH ~ BW + dietADF + dietCP + dietNDF + dietDM +
             dietAsh + dietStarch + dietFat + dietNFC + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

m2 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF + dietDM +
             dietAsh + dietStarch + dietFat + dietNFC + (1|Author), data = d, weights = MeanRumpH_wt)


#__________________________________________Check VIF_________________________________________________

#Make a function to return variance inflation factors
VIF <- function (fit) {
  v <- vcov(fit)
  nam <- names(fixef(fit))
  ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)"))
  if (ns > 0) {
    v <- v[-(1:ns), -(1:ns), drop = FALSE]
    nam <- nam[-(1:ns)]
  }
  d <- diag(v)^0.5
  v <- diag(solve(v/(d %o% d)))
  names(v) <- nam
  return(v)
}

#Check VIF of model 1
VIF(m1)

#Check VIF of model 2
VIF(m2)

# Remove dietNFC with VIF of 127
m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF +
             dietAsh + dietStarch + dietFat + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove dietNDF with p-value = 0.862441
m1 <- lmer(MeanRumpH ~ BW + dietCP +
             dietAsh + dietStarch + dietFat + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove dietFat with p-value = 0.7549
m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF +
             dietAsh + dietStarch + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove Ash with p-value = 0.907
m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF +
             dietStarch + dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove Starch with p-value = 0.907
m1 <- lmer(MeanRumpH ~ BW + dietCP + dietNDF +
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)
# Remove NDF with p-value = 0.699744
m1 <- lmer(MeanRumpH ~ BW + dietCP +
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m1)

#__________________________________________Re-test Dropped Parameters_________________________________________________
#Test DM again
m2 <- lmer(MeanRumpH ~ BW + dietCP + dietDM + 
                 dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

#Test Fat again
m2 <- lmer(MeanRumpH ~ BW + dietCP +dietFat + 
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

#Test ADF again
m2 <- lmer(MeanRumpH ~ BW + dietCP +dietADF + 
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

# Drop BW with a p-value of 0.77978
m2 <- lmer(MeanRumpH ~  dietCP +dietADF + 
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

# Drop ADF with a p-value of 0.12732
m2 <- lmer(MeanRumpH ~  dietCP + 
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

# Drop CP with a p-value of 0.113
m2 <- lmer(MeanRumpH ~  
             dietForageNDF + (1|Author), data = d, weights = MeanRumpH_wt)
summary(m2)

VIF(m1)
VIF(m2)


#__________________________________________Check Model Fit_________________________________________________

#Generate an equation to estimate model fit statistics
RMSE <- function(m) {
  m <- m
  o <- attributes(m)$frame[,1]
  p<-fitted(m)
  rand <- ranef(m)$Author
  Author <- attributes(ranef(m)$Author)$row.names
  temp <- data.frame(Author, rand)
  new <- merge(attributes(m)$frame, temp, by="Author")
  p2 <- fitted(m)-new$X.Intercept.
  meano <- mean(o, na.rm=TRUE)
  meanp <- mean(p, na.rm=TRUE)
  res=o-p
  resad = o-p2
  res2=res^2
  resad2 = resad^2
  rm=sqrt(mean(res2, na.rm=TRUE));
  rmad=sqrt(mean(resad2, na.rm=TRUE))
  uss=sum(res2, na.rm=TRUE);
  lo <- ifelse(is.na(o)==FALSE, 1, 0)
  n=sum(lo);
  meanO=mean(o, na.rm=TRUE);
  mb=sum(res, na.rm=TRUE)/n;
  sse <- anova(lm(res~p))[2,2];
  msb <- mb^2;
  mspe <- rm^2;
  msre <- sse/n;
  msslope <- mspe-msre-msb;
  mean <- msb/mspe*100;
  slope <- msslope/mspe*100;
  residual <- msre/mspe*100;
  check <- mean+slope+residual
  rsr <- rmad/sd(o, na.rm=TRUE)
  ccc <- epi.ccc(o,p)$rho.c[1]
  cccad <- epi.ccc(o,p2)$rho.c[1]
  rmp = rm/meano*100
  rmpad = rmad/mean(p2, na.rm=TRUE)*100
  mb <- mean(res, na.rm=TRUE)
  sb <- coef(lm(res~p))[2]
  aicc <- AICc(m)
  Shats <- attributes(VarCorr(m)$Author)$stddev
  Shate <- attributes(VarCorr(m))$sc
  Shats_e <- Shats/Shate
  output <- format(c(n,meano, meanp, rm, rmad, rmp, rmpad, mean, slope, residual, check, mb, sb, rsr,ccc[,1],cccad[,1], aicc, Shats, Shate, Shats_e ), scientific=FALSE)
  labels <- c("N", "Observed Mean", "Predicted Mean", "RMSE, units", "Unadjusted RMSE, units", "RMSE, % mean", "Unadjusted RMSE, % mean", "Mean Bias, % MSE", "Slope Bias, % MSE", "Residual Error", "Error Check", "Mean Bias", "Slope Bias", "RSR", "CCC", "Unadjusted CCC", "AICc", "Sigma Hat Study", "Sigma Hat Error", "Ratio of Sigma Hat Study/Error")
  out <- data.frame(labels,output)
  temp <- NULL
  new <- NULL
  return(out)
}

#Open additional required packages
library(epiR)
library(MuMIn)


#Output fit statistics on m1
RMSE(m1)

#Output fit statistics on m2
RMSE(m2)

# ________________ But we are testing on the training data _________________________
# ________________ Let's try cross validation schemes ______________________________
library(caret)


# K-folds CV
k = 10

df2 <- d[complete.cases(d$MeanRumpH),]

values <- df2$MeanRumpH
variables <- df2[,c("TrtID" ,"Acmolp","Propmolp","Butmolp","BW","DaysInMilk","dietDM","dietCP","dietNDF",
                    "dietADF","dietAsh","dietStarch","dietFat","dietNFC",
                    "dietForageNDF","dietForage","DMIntake","MilkProd","MilkFatpct","MilkProtpct")]

folds <- createFolds(values, k = k)
output <- data.frame(fold = NA, m1_rmse = NA, m2_rmse = NA)

for(i in 1:length(folds)) {
  train <- df2[-unlist(folds[i]),]
  test <- df2[unlist(folds[i]),]
  m1 <- lmer(MeanRumpH ~ BW + dietCP +
         dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt)
  m2 <- lmer(MeanRumpH ~  
         dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt)
  m1_rmse = sqrt(mean((test$MeanRumpH - predict(m1, test, allow.new.levels = TRUE))^2, na.rm = T))
  m2_rmse = sqrt(mean((test$MeanRumpH - predict(m2, test, allow.new.levels = TRUE))^2, na.rm = T))
  row = data.frame(fold = i, m1_rmse, m2_rmse)
  output[i,] <- row
}

print(colMeans(output[,2:3], na.rm = T))


# Leave one out CV
df2 <- d[complete.cases(d$MeanRumpH),]

values <- df2$MeanRumpH
variables <- df2[,c("TrtID" ,"Acmolp","Propmolp","Butmolp","BW","DaysInMilk","dietDM","dietCP","dietNDF",
                    "dietADF","dietAsh","dietStarch","dietFat","dietNFC",
                    "dietForageNDF","dietForage","DMIntake","MilkProd","MilkFatpct","MilkProtpct")]

folds <- nrow(df2)
output <- data.frame(fold = NA, m1_rmse = NA, m2_rmse = NA)

for(i in 1:folds) {
  train <- df2[-i,]
  test <- df2[i,]
  m1 <- lmer(MeanRumpH ~ BW + dietCP +
               dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt)
  m2 <- lmer(MeanRumpH ~  
               dietForageNDF + (1|Author), data = train, weights = MeanRumpH_wt)
  m1_rmse = sqrt(mean((test$MeanRumpH - predict(m1, test, allow.new.levels = TRUE))^2, na.rm = T))
  m2_rmse = sqrt(mean((test$MeanRumpH - predict(m2, test, allow.new.levels = TRUE))^2, na.rm = T))
  row = data.frame(fold = i, m1_rmse, m2_rmse)
  output[i,] <- row
}

print(colMeans(output[,2:3], na.rm = T))