#####################
# GS using Bayesian GE models
#####################
# Loading the phenotypes
pheno <- readRDS("pheno2")
head(pheno)
dim(pheno)

# removing the parents
pheno <- droplevels.data.frame(pheno[pheno$type == "sc",])
head(pheno)
tail(pheno)
dim(pheno)

# adjusting models per enviroment
library(breedR)

LN <- remlf90(fixed = MPSadj ~ gid, random = ~ rep, method = "em", data = pheno[pheno$N == "low",])
mean(pheno[pheno$N == "low", "MPSadj"])

IN <- remlf90(fixed = MPSadj ~ gid, random = ~ rep, method = "em", data = pheno[pheno$N == "ideal",])
mean(pheno[pheno$N == "ideal", "MPSadj"])

# joint the data but adding the mean  by env
phenoGE <- data.frame(env = c(rep("LN", dim(LN$fixed[[1]][[1]])[1]), rep("IN", dim(IN$fixed[[1]][[1]])[1])), gid = c(rownames(LN$fixed[[1]][[1]]), rownames(IN$fixed[[1]][[1]])), MPS = c((LN$fixed[[1]][[1]][,1] + mean(pheno[pheno$N == "low", "MPSadj"])), (IN$fixed[[1]][[1]][,1] + mean(pheno[pheno$N == "ideal", "MPSadj"]))))

head(phenoGE)                      
tail(phenoGE)
str(phenoGE)                      

# loading the marker file
M <- readRDS("M")
dim(M)
# selecting just the hybrids
match(levels(pheno$gid), rownames(M))
M <- M[match(levels(pheno$gid), rownames(M)),]
dim(M)
head(M[,1:6])
tail(M[,1:6])

all(phenoGE$gid %in% rownames(M))
all(rownames(M) %in% phenoGE$gid)


#######################################
#===== Multi-Environment models ======#
#######################################
# Packages installation
# install.packages("devtools")
# devtools::install_github('italo-granato/BGGE')
# install.packages("MCMCpack")
# devtools::install_github('QuantGen/MTM')

library(BGGE)
args(getK)
# getK - function to create kernels for different genomic GE models


##################################
# Main genotypic effect model (MM)
##################################
#It assumes that genetic effects across environments are constant
?BGGE
K.MMGB <- getK(Y = phenoGE, X = M, kernel = "GB", model = "MM")
str(K.MMGB)


# Prediction
# BGGE - function that makes predictions
args(BGGE)

(ne <- as.vector(table(phenoGE$env))) # number of genotypes by environmens

# running using the whole data
fitMMGB <- BGGE(y = phenoGE[,3], K = K.MMGB, ne = ne, ite = 2000, burn = 200, thin = 5)
names(fitMMGB)


# PCA of kernel
dim(K.MMGB$G$Kernel)
K.MMGB$G$Kernel[1:5,1:5]
eig.result <- eigen(K.MMGB$G$Kernel)
lambda <- eig.result$values
length(lambda)
plot(lambda/sum(lambda), ylab = "Fraction Explained")
PCA <- eig.result$vectors
dim(PCA)
PCA[1:5, 1:5]
XF <- cbind(rep(1, nrow(PCA)), PCA[,1])
rownames(XF) <- colnames(K.MMGB$G$Kernel) 
colnames(XF) <- c("Bo", "PCA1")
class(XF)
head(XF)

# running again but adding a fixed effect 
fitMMGB <- BGGE(y = phenoGE[,3],  XF = XF,  K = K.MMGB, ne = ne, ite = 2000, burn = 200, thin = 5)

fitMMGB$yHat
fitMMGB$varE

#######

# definig paremeters to simulate missing values
IDs <- levels(phenoGE$gid) #unique genotypes
IDy <- as.vector(phenoGE$gid) #vector of genotypes
nFolds <- 5 # n of groups
replicates <- 10 # boostrap method
env <- as.vector(unique(phenoGE$env))


########################################################################################
# Cross validation 1 (CV1) - Some genotypes were not observed in any environment
########################################################################################

# Simulating missing values
mfold <- matrix(NA, ncol = replicates, nrow = length(IDs))
sets1 <- matrix(NA, nrow = nrow(phenoGE), ncol = replicates)

set.seed(29121983)
for (s in 1:replicates) {
  mfold[, s] <- sample(rep(seq(1:nFolds), length.out = length(IDs)))
  for (i in 1:nrow(phenoGE)) {
    sets1[i, s] <- mfold[which(IDs == phenoGE$gid[i]), s]
  }
}

colnames(sets1) <- paste("rep", 1:ncol(sets1), sep = "")

sets1

# Cheking missing for genotypes
yNA <- phenoGE
yNA[,3][sets1[,1] == 1] <- NA
library(reshape2)
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS") #transforming to env in columns
head(tmp, 10) # showing the 10 first


########################################################################################
# Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other
########################################################################################

# Simulating missing values
sets2 <- matrix(NA, nrow = nrow(phenoGE), ncol = replicates)

set.seed(29121983)
for (s in 1:replicates) {
  sets2[, s] <- sample(rep(seq(1:nFolds), length.out = 2*length(IDs)))
}

colnames(sets2) <- paste("rep", 1:ncol(sets1), sep = "")

#Checking missing values
yNA <- phenoGE
yNA[,3][sets2[,1] == 1] <- NA
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS")
head(tmp, 10)


CV <- list(sets1, sets2)
names(CV) <- c("CV1", "CV2")


validation <- names(CV)
# Cross validation 1 (CV1) - Some genotypes were not observed in any environment
# Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other
kernels <- c("GK", "GB")
# GK = gaussian Kernell
# GB = VanRaden (2008)
models <- c("MM", "MDs", "MDe")
# Main genotypic effect model (MM) - #It assumes that genetic effects across environments are constant
# Single variance genotype × environment deviation model (MDs) -  Incorporates a random deviation effect of GE interaction
# Environment-specific variance deviation model (MDe) - Genetic effect splitted into two components: the common effect for all environments (across environment effects) and a random deviation effect for each environment (environment-specific effects)
rep <- colnames(sets1)


# loading foreach and doFuture
library(foreach)
library(doFuture)
registerDoFuture()
plan(multiprocess)
availableCores()

system.time(
results <- foreach(v = validation, .combine = "rbind") %:% foreach(i = rep, .combine = "rbind") %:% foreach(k = kernels, .combine = "rbind") %:% foreach(m = models, .combine = "rbind") %:% foreach(e = env, .combine = "rbind") %dopar% {

  yNA <- phenoGE
  
  G <- getK(Y = yNA, X = M, kernel = k, model = m)
  
  ne <- as.vector(table(yNA$env))
  
  yNA[, 3][CV[[v]][, i] == 1] <- NA
  
  fit <- BGGE(y = yNA[, 3], K = G, ne = ne, ite = 1000, burn = 100, thin = 5, verbose = FALSE)
  
  fm <- fit$yHat

output <- data.frame(
  cross = v,
  rep = i,
  kernels = k,
  models = m,
  env = e,
  Ac = cor(phenoGE[CV[[v]][, i] == 1 & phenoGE$env == e ,3], fm[CV[[v]][, i] == 1 & phenoGE$env == e])
)

return(output)

}
)

results
str(results)

###############################################
# Lets's compare the accuracy across models, environments, and CV methods
###############################################

library(ggplot2)
library(reshape2)

# CV vs Model
r1 <- melt(tapply(results$Ac, list(results$cross, results$models), mean))
colnames(r1) <- c("CV", "Model", "Accuracy")
r2 <- melt(tapply(results$Ac, list(results$cross, results$models), sd))
colnames(r2) <- c("CV", "Model", "SD")

r <- merge(r1, r2)

limits <- aes(ymax = r$Accuracy + r$SD,
              ymin = r$Accuracy - r$SD)

p1 <- ggplot(data = r, aes(x = factor(CV), y = Accuracy,
                                fill = factor(Model)))
p1 + geom_bar(stat = "identity",
               position = position_dodge(0.9)) + 
  
  geom_errorbar(limits, position = position_dodge(0.9),
                width = 0.25) +
  
  labs(x = "Cross-validation", y = "Accuracy") +
  scale_fill_discrete(name = "Model")


# Kernell vs model
r1 <- melt(tapply(results$Ac, list(results$kernels, results$models), mean))
colnames(r1) <- c("Kernell", "Model", "Accuracy")
r2 <- melt(tapply(results$Ac, list(results$kernels, results$models), sd))
colnames(r2) <- c("Kernell", "Model", "SD")

r <- merge(r1, r2)

limits <- aes(ymax = r$Accuracy + r$SD,
              ymin = r$Accuracy - r$SD)

p1 <- ggplot(data = r, aes(x = factor(Kernell), y = Accuracy,
                           fill = factor(Model)))
p1 + geom_bar(stat = "identity",
              position = position_dodge(0.9)) + 
  
  geom_errorbar(limits, position = position_dodge(0.9),
                width = 0.25) +
  
  labs(x = "Kernell", y = "Accuracy") +
  scale_fill_discrete(name = "Model")


# Model vs Env
r1 <- melt(tapply(results$Ac, list(results$env, results$models), mean))
colnames(r1) <- c("Env", "Model", "Accuracy")
r2 <- melt(tapply(results$Ac, list(results$env, results$models), sd))
colnames(r2) <- c("Env", "Model", "SD")

r <- merge(r1, r2)

limits <- aes(ymax = r$Accuracy + r$SD,
              ymin = r$Accuracy - r$SD)

p1 <- ggplot(data = r, aes(x = factor(Env), y = Accuracy,
                           fill = factor(Model)))
p1 + geom_bar(stat = "identity",
              position = position_dodge(0.9)) + 
  
  geom_errorbar(limits, position = position_dodge(0.9),
                width = 0.25) +
  
  labs(x = "Env", y = "Accuracy") +
  scale_fill_discrete(name = "Model")