##################### # GS using Bayesian GE models ##################### # Loading the phenotypes pheno <- readRDS("pheno2") head(pheno) dim(pheno) # removing the parents pheno <- droplevels.data.frame(pheno[pheno$type == "sc",]) head(pheno) tail(pheno) dim(pheno) # adjusting models per enviroment library(breedR) LN <- remlf90(fixed = MPSadj ~ gid, random = ~ rep, method = "em", data = pheno[pheno$N == "low",]) mean(pheno[pheno$N == "low", "MPSadj"]) IN <- remlf90(fixed = MPSadj ~ gid, random = ~ rep, method = "em", data = pheno[pheno$N == "ideal",]) mean(pheno[pheno$N == "ideal", "MPSadj"]) # joint the data but adding the mean by env phenoGE <- data.frame(env = c(rep("LN", dim(LN$fixed[[1]][[1]])[1]), rep("IN", dim(IN$fixed[[1]][[1]])[1])), gid = c(rownames(LN$fixed[[1]][[1]]), rownames(IN$fixed[[1]][[1]])), MPS = c((LN$fixed[[1]][[1]][,1] + mean(pheno[pheno$N == "low", "MPSadj"])), (IN$fixed[[1]][[1]][,1] + mean(pheno[pheno$N == "ideal", "MPSadj"])))) head(phenoGE) tail(phenoGE) str(phenoGE) # loading the marker file M <- readRDS("M") dim(M) # selecting just the hybrids match(levels(pheno$gid), rownames(M)) M <- M[match(levels(pheno$gid), rownames(M)),] dim(M) head(M[,1:6]) tail(M[,1:6]) all(phenoGE$gid %in% rownames(M)) all(rownames(M) %in% phenoGE$gid) ####################################### #===== Multi-Environment models ======# ####################################### # Packages installation # install.packages("devtools") # devtools::install_github('italo-granato/BGGE') # install.packages("MCMCpack") # devtools::install_github('QuantGen/MTM') library(BGGE) args(getK) # getK - function to create kernels for different genomic GE models ################################## # Main genotypic effect model (MM) ################################## #It assumes that genetic effects across environments are constant ?BGGE K.MMGB <- getK(Y = phenoGE, X = M, kernel = "GB", model = "MM") str(K.MMGB) # Prediction # BGGE - function that makes predictions args(BGGE) (ne <- as.vector(table(phenoGE$env))) # number of genotypes by environmens # running using the whole data fitMMGB <- BGGE(y = phenoGE[,3], K = K.MMGB, ne = ne, ite = 2000, burn = 200, thin = 5) names(fitMMGB) # PCA of kernel dim(K.MMGB$G$Kernel) K.MMGB$G$Kernel[1:5,1:5] eig.result <- eigen(K.MMGB$G$Kernel) lambda <- eig.result$values length(lambda) plot(lambda/sum(lambda), ylab = "Fraction Explained") PCA <- eig.result$vectors dim(PCA) PCA[1:5, 1:5] XF <- cbind(rep(1, nrow(PCA)), PCA[,1]) rownames(XF) <- colnames(K.MMGB$G$Kernel) colnames(XF) <- c("Bo", "PCA1") class(XF) head(XF) # running again but adding a fixed effect fitMMGB <- BGGE(y = phenoGE[,3], XF = XF, K = K.MMGB, ne = ne, ite = 2000, burn = 200, thin = 5) fitMMGB$yHat fitMMGB$varE ####### # definig paremeters to simulate missing values IDs <- levels(phenoGE$gid) #unique genotypes IDy <- as.vector(phenoGE$gid) #vector of genotypes nFolds <- 5 # n of groups replicates <- 10 # boostrap method env <- as.vector(unique(phenoGE$env)) ######################################################################################## # Cross validation 1 (CV1) - Some genotypes were not observed in any environment ######################################################################################## # Simulating missing values mfold <- matrix(NA, ncol = replicates, nrow = length(IDs)) sets1 <- matrix(NA, nrow = nrow(phenoGE), ncol = replicates) set.seed(29121983) for (s in 1:replicates) { mfold[, s] <- sample(rep(seq(1:nFolds), length.out = length(IDs))) for (i in 1:nrow(phenoGE)) { sets1[i, s] <- mfold[which(IDs == phenoGE$gid[i]), s] } } colnames(sets1) <- paste("rep", 1:ncol(sets1), sep = "") sets1 # Cheking missing for genotypes yNA <- phenoGE yNA[,3][sets1[,1] == 1] <- NA library(reshape2) tmp <- acast(data = yNA, gid ~ env, value.var = "MPS") #transforming to env in columns head(tmp, 10) # showing the 10 first ######################################################################################## # Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other ######################################################################################## # Simulating missing values sets2 <- matrix(NA, nrow = nrow(phenoGE), ncol = replicates) set.seed(29121983) for (s in 1:replicates) { sets2[, s] <- sample(rep(seq(1:nFolds), length.out = 2*length(IDs))) } colnames(sets2) <- paste("rep", 1:ncol(sets1), sep = "") #Checking missing values yNA <- phenoGE yNA[,3][sets2[,1] == 1] <- NA tmp <- acast(data = yNA, gid ~ env, value.var = "MPS") head(tmp, 10) CV <- list(sets1, sets2) names(CV) <- c("CV1", "CV2") validation <- names(CV) # Cross validation 1 (CV1) - Some genotypes were not observed in any environment # Cross validation 2 (CV2) - Genotypes were tested in some environments but not in other kernels <- c("GK", "GB") # GK = gaussian Kernell # GB = VanRaden (2008) models <- c("MM", "MDs", "MDe") # Main genotypic effect model (MM) - #It assumes that genetic effects across environments are constant # Single variance genotype × environment deviation model (MDs) - Incorporates a random deviation effect of GE interaction # Environment-specific variance deviation model (MDe) - Genetic effect splitted into two components: the common effect for all environments (across environment effects) and a random deviation effect for each environment (environment-specific effects) rep <- colnames(sets1) # loading foreach and doFuture library(foreach) library(doFuture) registerDoFuture() plan(multiprocess) availableCores() system.time( results <- foreach(v = validation, .combine = "rbind") %:% foreach(i = rep, .combine = "rbind") %:% foreach(k = kernels, .combine = "rbind") %:% foreach(m = models, .combine = "rbind") %:% foreach(e = env, .combine = "rbind") %dopar% { yNA <- phenoGE G <- getK(Y = yNA, X = M, kernel = k, model = m) ne <- as.vector(table(yNA$env)) yNA[, 3][CV[[v]][, i] == 1] <- NA fit <- BGGE(y = yNA[, 3], K = G, ne = ne, ite = 1000, burn = 100, thin = 5, verbose = FALSE) fm <- fit$yHat output <- data.frame( cross = v, rep = i, kernels = k, models = m, env = e, Ac = cor(phenoGE[CV[[v]][, i] == 1 & phenoGE$env == e ,3], fm[CV[[v]][, i] == 1 & phenoGE$env == e]) ) return(output) } ) results str(results) ############################################### # Lets's compare the accuracy across models, environments, and CV methods ############################################### library(ggplot2) library(reshape2) # CV vs Model r1 <- melt(tapply(results$Ac, list(results$cross, results$models), mean)) colnames(r1) <- c("CV", "Model", "Accuracy") r2 <- melt(tapply(results$Ac, list(results$cross, results$models), sd)) colnames(r2) <- c("CV", "Model", "SD") r <- merge(r1, r2) limits <- aes(ymax = r$Accuracy + r$SD, ymin = r$Accuracy - r$SD) p1 <- ggplot(data = r, aes(x = factor(CV), y = Accuracy, fill = factor(Model))) p1 + geom_bar(stat = "identity", position = position_dodge(0.9)) + geom_errorbar(limits, position = position_dodge(0.9), width = 0.25) + labs(x = "Cross-validation", y = "Accuracy") + scale_fill_discrete(name = "Model") # Kernell vs model r1 <- melt(tapply(results$Ac, list(results$kernels, results$models), mean)) colnames(r1) <- c("Kernell", "Model", "Accuracy") r2 <- melt(tapply(results$Ac, list(results$kernels, results$models), sd)) colnames(r2) <- c("Kernell", "Model", "SD") r <- merge(r1, r2) limits <- aes(ymax = r$Accuracy + r$SD, ymin = r$Accuracy - r$SD) p1 <- ggplot(data = r, aes(x = factor(Kernell), y = Accuracy, fill = factor(Model))) p1 + geom_bar(stat = "identity", position = position_dodge(0.9)) + geom_errorbar(limits, position = position_dodge(0.9), width = 0.25) + labs(x = "Kernell", y = "Accuracy") + scale_fill_discrete(name = "Model") # Model vs Env r1 <- melt(tapply(results$Ac, list(results$env, results$models), mean)) colnames(r1) <- c("Env", "Model", "Accuracy") r2 <- melt(tapply(results$Ac, list(results$env, results$models), sd)) colnames(r2) <- c("Env", "Model", "SD") r <- merge(r1, r2) limits <- aes(ymax = r$Accuracy + r$SD, ymin = r$Accuracy - r$SD) p1 <- ggplot(data = r, aes(x = factor(Env), y = Accuracy, fill = factor(Model))) p1 + geom_bar(stat = "identity", position = position_dodge(0.9)) + geom_errorbar(limits, position = position_dodge(0.9), width = 0.25) + labs(x = "Env", y = "Accuracy") + scale_fill_discrete(name = "Model")