#####################
# GS using GE models
#####################
# Loading the phenotypes
pheno <- readRDS("pheno2")
head(pheno)
dim(pheno)

# removing the parents
pheno <- droplevels.data.frame(pheno[pheno$type == "sc",])
head(pheno)
tail(pheno)
dim(pheno)

# adjusting models per enviroment
library(breedR)

LN <- remlf90(fixed = MPSadj2 ~ gid, random = ~rep, method = "em", data = pheno[pheno$N == "low",])

IN <- remlf90(fixed = MPSadj2 ~ gid, random = ~rep, method = "em", data = pheno[pheno$N == "ideal",])

# joint the data by env
phenoGE <- data.frame(env = c(rep("LN", dim(LN$fixed[[1]][[1]])[1]), rep("IN", dim(IN$fixed[[1]][[1]])[1])), gid = c(rownames(LN$fixed[[1]][[1]]), rownames(IN$fixed[[1]][[1]])), MPS = c(LN$fixed[[1]][[1]][,1], IN$fixed[[1]][[1]][,1]))

head(phenoGE)                      
tail(phenoGE)
str(phenoGE)                      

# loading the marker file
M <- readRDS("M")
dim(M)
# selecting just the hybrids
match(levels(pheno$gid), rownames(M))
M <- M[match(levels(pheno$gid), rownames(M)),]
dim(M)
head(M[,1:6])
tail(M[,1:6])

all(phenoGE$gid %in% rownames(M))
all(rownames(M) %in% phenoGE$gid)


#######################################
#===== Multi-Environment models ======#
#######################################
# Packages installation
# install.packages("devtools")
# devtools::install_github('italo-granato/BGGE')
# install.packages("MCMCpack")
# devtools::install_github('QuantGen/MTM')

library(BGGE)
args(getK)
# getK - function to create kernels for different genomic GE models


##################################
# Main genotypic effect model (MM)
##################################
#It assumes that genetic effects across environments are constant

K.MMGB <- getK(Y = phenoGE, X = M, kernel = "GB", model = "MM")
str(K.MMGB)

# Prediction
# BGGE - function that makes predictions
args(BGGE)

(ne <- as.vector(table(phenoGE$env))) # number of genotypes by environmens

# running using the whole data
fitMMGB <- BGGE(y = phenoGE$MPS, K = K.MMGB, ne = ne, ite = 500, burn = 200, thin = 5)
names(fitMMGB)

# Cross validation 1 (CV1) - Some genotypes were not observed in any environment

# Simulating missing values
IDs <- levels(phenoGE$gid) #unique genotypes
IDy <- as.vector(phenoGE$gid) #vector of genotypes
nFolds <- 5 # n of groups
partitions <- 1
env <- as.vector(unique(phenoGE$env))

mfold <- matrix(NA, ncol = partitions, nrow = length(IDs))
sets1 <- matrix(NA, nrow = nrow(phenoGE), ncol = partitions)

set.seed(29121983)
for (s in 1:partitions) {
  mfold[, s] <- sample(1:nFolds, size = length(IDs), replace = TRUE)
  for (i in 1:nrow(phenoGE)) {
    sets1[i, s] <- mfold[which(IDs == phenoGE$gid[i]), s]
  }
}

# Cheking missing for genotypes
yNA <- phenoGE
yNA$MPS[sets1[,1] == 1] <- NA

library(reshape2)
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS") #transforming to env in columns
head(tmp, 10) # showing the 10 first

rggMM1 <- matrix(0, length(env), nFolds) # Matrix of results NULL
for(i in 1:nFolds){
  yNA <- phenoGE
  yNA$MPS[sets1[,1] == i] <- NA
  
  fitMMGB1 <- BGGE(y = yNA$MPS, K = K.MMGB, ne = ne, ite = 500, burn = 200, thin = 5, verbose = FALSE)
  
  fm <- fitMMGB1$yHat
  
  for(g in 1:length(env)){
    tst1 <- sets1[,1] == i & phenoGE$env == env[g]
    rggMM1[g, i] <- cor(phenoGE$MPS[tst1], fm[tst1])
  }
}

rggMM1
rowMeans(rggMM1)

# Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other

# Simulating missing values
sets2 <- matrix(NA, nrow = nrow(phenoGE), ncol = partitions)

set.seed(29121983)
for (s in 1:partitions) {
  for (i in IDs) {
    tmp = which(IDy == i)
    ni = length(tmp)
    tmpFold <- sample(1:nFolds, size = ni, replace = ni * nFolds)
    sets2[tmp, s] <- tmpFold
  }
}

#Checking missing values
yNA <- phenoGE
yNA$MPS[sets2[,1] == 1] <- NA
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS")
head(tmp, 10)

# Prediction
rggMM2 <- matrix(0, length(env), nFolds)
for(i in 1:nFolds){
  yNA <- phenoGE
  yNA$MPS[sets2[,1] == i] <- NA
  
  fitMMGB2 <- BGGE(y = yNA$MPS, K = K.MMGB, ne = ne, ite = 500, burn = 200, thin = 5)
  
  fm <- fitMMGB2$yHat
  
  for(g in 1:length(env)){
    tst1 <- sets2[,1] == i & phenoGE$env == env[g]
    rggMM2[g, i] <- cor(phenoGE$MPS[tst1], fm[tst1])
  }
}

rggMM2
rowMeans(rggMM2)


##############################################################
# Single variance genotype × environment deviation model (MDs)
##############################################################
# Incorporates a random deviation effect of GE interaction

K.MDsGB <- getK(Y = phenoGE, X = M, kernel = "GB", model = "MDs")
str(K.MDsGB)

(ne <- as.vector(table(phenoGE$env))) # number of genotypes by environmens

# Cross validation 1 (CV1) - Some genotypes were not observed in any environment

# Simulating missing values
IDs <- levels(phenoGE$gid) #unique genotypes
IDy <- as.vector(phenoGE$gid) #vector of genotypes
nFolds <- 5 # n of groups
partitions <- 1
env <- as.vector(unique(phenoGE$env))

mfold <- matrix(NA, ncol = partitions, nrow = length(IDs))
sets1 <- matrix(NA, nrow = nrow(phenoGE), ncol = partitions)

set.seed(29121983)
for (s in 1:partitions) {
  mfold[, s] <- sample(1:nFolds, size = length(IDs), replace = TRUE)
  for (i in 1:nrow(phenoGE)) {
    sets1[i, s] <- mfold[which(IDs == phenoGE$gid[i]), s]
  }
}

# Cheking missing for genotypes
yNA <- phenoGE
yNA$MPS[sets1[,1] == 1] <- NA

library(reshape2)
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS") #transforming to env in columns
head(tmp, 10) # showing the 10 first

rggMDs1 <- matrix(0, length(env), nFolds) # Matrix of results NULL
for(i in 1:nFolds){
  yNA <- phenoGE
  yNA$MPS[sets1[,1] == i] <- NA
  
  fitMDsGB1 <- BGGE(y = yNA$MPS, K = K.MDsGB, ne = ne, ite = 500, burn = 200, thin = 5, verbose = FALSE)
  
  fm <- fitMDsGB1$yHat
  
  for(g in 1:length(env)){
    tst1 <- sets1[,1] == i & phenoGE$env == env[g]
    rggMDs1[g, i] <- cor(phenoGE$MPS[tst1], fm[tst1])
  }
}

rggMDs1
rowMeans(rggMDs1)

# Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other

# Simulating missing values
sets2 <- matrix(NA, nrow = nrow(phenoGE), ncol = partitions)

set.seed(29121983)
for (s in 1:partitions) {
  for (i in IDs) {
    tmp = which(IDy == i)
    ni = length(tmp)
    tmpFold <- sample(1:nFolds, size = ni, replace = ni * nFolds)
    sets2[tmp, s] <- tmpFold
  }
}

#Checking missing values
yNA <- phenoGE
yNA$MPS[sets2[,1] == 1] <- NA
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS")
head(tmp, 10)

# Prediction
rggMDs2 <- matrix(0, length(env), nFolds)
for(i in 1:nFolds){
  yNA <- phenoGE
  yNA$MPS[sets2[,1] == i] <- NA
  
  fitMDsGB2 <- BGGE(y = yNA$MPS, K = K.MDsGB, ne = ne, ite = 500, burn = 200, thin = 5)
  
  fm <- fitMDsGB2$yHat
  
  for(g in 1:length(env)){
    tst1 <- sets2[,1] == i & phenoGE$env == env[g]
    rggMDs2[g, i] <- cor(phenoGE$MPS[tst1], fm[tst1])
  }
}

rggMDs2
rowMeans(rggMDs2)


#######################################################
# Environment-specific variance deviation model (MDe)
#######################################################
#Genetic effect splitted into two components: 
#the common effect for all environments (across environment effects) and a random deviation effect for each environment (environment-specific effects)

K.MDeGB <- getK(Y = phenoGE, X = M, kernel = "GB", model = "MDe")
str(K.MDeGB)

(ne <- as.vector(table(phenoGE$env))) # number of genotypes by environmens


# Simulating missing values
IDs <- levels(phenoGE$gid) #unique genotypes
IDy <- as.vector(phenoGE$gid) #vector of genotypes
nFolds <- 5 # n of groups
partitions <- 1
env <- as.vector(unique(phenoGE$env))

mfold <- matrix(NA, ncol = partitions, nrow = length(IDs))
sets1 <- matrix(NA, nrow = nrow(phenoGE), ncol = partitions)

set.seed(29121983)
for (s in 1:partitions) {
  mfold[, s] <- sample(1:nFolds, size = length(IDs), replace = TRUE)
  for (i in 1:nrow(phenoGE)) {
    sets1[i, s] <- mfold[which(IDs == phenoGE$gid[i]), s]
  }
}

# Cheking missing for genotypes
yNA <- phenoGE
yNA$MPS[sets1[,1] == 1] <- NA

library(reshape2)
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS") #transforming to env in columns
head(tmp, 10) # showing the 10 first

rggMDe1 <- matrix(0, length(env), nFolds) # Matrix of results NULL
for(i in 1:nFolds){
  yNA <- phenoGE
  yNA$MPS[sets1[,1] == i] <- NA
  
  fitMDeGB1 <- BGGE(y = yNA$MPS, K = K.MDeGB, ne = ne, ite = 500, burn = 200, thin = 5, verbose = FALSE)
  
  fm <- fitMDeGB1$yHat
  
  for(g in 1:length(env)){
    tst1 <- sets1[,1] == i & phenoGE$env == env[g]
    rggMDe1[g, i] <- cor(phenoGE$MPS[tst1], fm[tst1])
  }
}

rggMDe1
rowMeans(rggMDe1)

# Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other

# Simulating missing values
sets2 <- matrix(NA, nrow = nrow(phenoGE), ncol = partitions)

set.seed(29121983)
for (s in 1:partitions) {
  for (i in IDs) {
    tmp = which(IDy == i)
    ni = length(tmp)
    tmpFold <- sample(1:nFolds, size = ni, replace = ni * nFolds)
    sets2[tmp, s] <- tmpFold
  }
}

#Checking missing values
yNA <- phenoGE
yNA$MPS[sets2[,1] == 1] <- NA
tmp <- acast(data = yNA, gid ~ env, value.var = "MPS")
head(tmp, 10)

# Prediction
rggMDe2 <- matrix(0, length(env), nFolds)
for(i in 1:nFolds){
  yNA <- phenoGE
  yNA$MPS[sets2[,1] == i] <- NA
  
  fitMDeGB2 <- BGGE(y = yNA$MPS, K = K.MDeGB, ne = ne, ite = 500, burn = 200, thin = 5)
  
  fm <- fitMDeGB2$yHat
  
  for(g in 1:length(env)){
    tst1 <- sets2[,1] == i & phenoGE$env == env[g]
    rggMDe2[g, i] <- cor(phenoGE$MPS[tst1], fm[tst1])
  }
}

rggMDe2
rowMeans(rggMDe2)


##############################################
# comparing the GE, environments, and cross-validation models
###############################################

output <- data.frame(model = rep(c("MM", "MDs", "MDe"), each = 2), env = rep(env, 3), CV = rep(c("CV1", "CV2"),3), Ac = c(rowMeans(rggMM1), rowMeans(rggMM2), rowMeans(rggMDs1), rowMeans(rggMDs2), rowMeans(rggMDe1), rowMeans(rggMDe2)))

par(mfrow = c(1,3))
barplot(tapply(output$Ac, output$CV, mean), col = c("blue", "lightblue"), cex.names = 0.75, main = "Cross-validation", ylim = c(0, 0.6))
barplot(tapply(output$Ac, output$model, mean), col = c("darkred", "red", "orange"), cex.names = 0.75, main = "GE model", ylim = c(0, 0.6))
barplot(tapply(output$Ac, output$env, mean), col = c("darkgreen", "lightgreen"), cex.names = 0.75, main = "N-level", ylim = c(0, 0.6))

##############################################