################################# # GS using mixed models equations ################################# # loading data Y <- readRDS("phenoGS") head(Y) str(Y) #identifying the five best hybrids (top5 <- droplevels(Y[order(Y[,"BLUE"], decreasing = TRUE)[1:5],"gid"])) # loading the marker file M <- readRDS("M") dim(M) # selecting just the hybrids match(Y$gid, rownames(M)) M <- M[match(Y$gid, rownames(M)),] dim(M) head(M[,1:6]) tail(M[,1:6]) all(Y$gid == rownames(M)) # building the Ga matrix #source("https://bioconductor.org/biocLite.R") #biocLite("impute") #devtools::install_github(repo = 'italo-granato/snpReady', ref = 'dev') library(snpReady) Ga <- G.matrix(M, method = "VanRaden", format = "wide", plot = F)$Ga colnames(Ga) <- rownames(Ga) <- Y$gid dim(Ga) Ga[1:6,1:6] # creating the Za for additive effects Za <- model.matrix(~ -1 + gid, data = Y) colnames(Za) <- colnames(Ga) dim(Za) Za[1:6, 1:6] # building the Gd matrix Gd <- G.matrix(M, method = "VanRaden", format = "wide", plot = F)$Gd colnames(Gd) <- rownames(Gd) <- Y$gid dim(Gd) Gd[1:6,1:6] # creating the Zd for additive effects Zd <- model.matrix(~ -1 + gid, data = Y) colnames(Zd) <- colnames(Gd) dim(Zd) Zd[1:6, 1:6] #################### # GS using A model #################### #install.packages('devtools') #devtools::install_github('famuvie/breedR') library(breedR) nfold <- 5 # trainnig and validation sizes replicates <- 5 # repeat this procedure 5 times n <- dim(Y)[1] # number of individuals set.seed(29121983) # to obtain the same sampling output <- data.frame() # create an empty file to storage results GEBV <- data.frame() GEBV_check <- data.frame() for (r in 1:replicates) { cat("Processing the replicate", r, "\n") ## test sets sets <- sample(rep(1:nfold, length.out = n)) # different orders of the same vector for (k in 1:nfold) { cat("Processing the fold", k, "\n") itest = which(sets == k) itrain = which(sets != k) YGS <- Y YGS[itest, "BLUE"] <- NA # it creates the validation set sol <- remlf90(fixed = BLUE ~ 1, generic = list(A = list(Za, Ga)), data = YGS, method = "em") ## predicted values for the test set blups <- sol$ranef[[1]][[1]] rownames(blups) <- colnames(Ga) top5pred <- rownames(blups[order(blups[,1], decreasing = TRUE)[1:5],1:2]) predicted.blups <- as.matrix(blups[itest,]) observed.blups <- Y[itest, "BLUE"] output <- rbind(output, data.frame( rep = r, fold = k, PA = cor(predicted.blups[,1], observed.blups), CS = sum(top5pred %in% top5)/length(top5), hm = sol$var[1,1]/sum(sol$var[,1]), LL = logLik(sol)[1], AIC = (-2*logLik(sol) + 2*dim(sol$var)[1])[1]) ) GEBV <- rbind(GEBV, data.frame( gid = rownames(blups[itest,]), gebv = predicted.blups[,1])) GEBV_check <- rbind(GEBV_check, data.frame( gid = rownames(blups), gebv = blups[,1])) } } output # estimate the mean (result <- apply(output, 2, mean)) # standard deviation (sd.PA <- round(sd(tapply(output$PA, output$rep, mean)), 2)) dim(output) dim(GEBV) head(GEBV) # lets organize the breeding values and pick the mean GEBV2 <- as.matrix(tapply(GEBV_check$gebv, GEBV_check$gid, mean)) head(GEBV2) GEBV2 <- GEBV2[(match(rownames(M), rownames(GEBV2))),] GEBV3 <- as.matrix(GEBV2) colnames(GEBV3) <- "GEBV" head(GEBV3) dim(GEBV3) ################### ## markers effect and RR-BLUP ################### # fuction to estimate the variance vpq <- function(x){ p <- sum(x)/(length(x)*2) pq <- p * (1 - p) return(pq) } vg <- 2*sum(apply(M, 2, vpq)) # and finally the marker effects a <- t(M) %*% solve(Ga) %*% GEBV3 / vg all(rownames(a) == colnames(M)) colnames(a) <- "marker effect" head(a) dim(a) # correlation betwwen GBLUP and RR-BLUP rrblup.gbv <- M %*% a colnames(rrblup.gbv) <- "RR-BLUP" head(rrblup.gbv) (pho <- round(cor(GEBV3, rrblup.gbv),2)) joint <- data.frame(gid = Y$gid, GBLUP = GEBV3, RRBLUP = rrblup.gbv) head(joint) library(plotly) (p <- plot_ly(joint, x = ~GEBV, y = ~RR.BLUP, type = 'scatter', mode = 'markers', text = ~paste('gid: ', gid))) #################### # GS using A + D model #################### #install.packages('devtools') #devtools::install_github('famuvie/breedR') # library(breedR) nfold <- 5 # trainnig and validation sizes replicates <- 5 # repeat this procedure 5 times n <- dim(Y)[1] # number of individuals set.seed(29121983) # to obtain the same sampling output <- data.frame() # create an empty file to storage results GEBV <- data.frame() for (r in 1:replicates) { cat("Processing the replicate", r, "\n") ## test sets sets <- sample(rep(1:nfold, length.out = n)) # different orders of the same vector for (k in 1:nfold) { cat("Processing the fold", k, "\n") itest = which(sets == k) itrain = which(sets != k) YGS <- Y YGS[itest, "BLUE"] <- NA # it creates the validation set sol <- remlf90(fixed = BLUE ~ 1, generic = list(A = list(Za, Ga), D = list(Zd, precision = Gd)), data = YGS, method = "em") ## predicted values for the test set blupsa <- sol$ranef[[1]][[1]] blupsd <- sol$ranef[[2]][[1]] blups <- blupsa + blupsd rownames(blups) <- colnames(Ga) top5pred <- rownames(blups[order(blups[,1], decreasing = TRUE)[1:5],1:2]) predicted.blups <- as.matrix(blups[itest,]) observed.blups <- Y[itest, "BLUE"] output <- rbind(output, data.frame( rep = r, fold = k, PA = cor(predicted.blups[,1], observed.blups), CS = sum(top5pred %in% top5)/length(top5), hm = sol$var[1,1]/sum(sol$var[,1]), LL = logLik(sol)[1], AIC = (-2*logLik(sol) + 2*dim(sol$var)[1])[1]) ) GEBV = rbind(GEBV, data.frame( gid = rownames(blups[itest,]), gebv = predicted.blups[,1])) } } sol$var output # estimate the mean (result <- apply(output, 2, mean)) # standard deviation (sd.PA <- round(sd(tapply(output$PA, output$rep, mean)), 2)) ############### # GS using OTS, BLUPS, and weights ############### #install.packages('devtools') #devtools::install_github('famuvie/breedR') library(breedR) OTS <- readRDS("OTS") names(OTS) output2 <- data.frame() for (i in 1:length(OTS)) { cat("Processing OTS", i, "\n") itrain <- OTS[[i]] itest <- Y$gid[!(Y$gid) %in% itrain] YGS <- Y YGS[Y$gid %in% itest, c("dBLUP")] <- NA sol <- remlf90(fixed = dBLUP ~ 1, generic = list(A = list(Za, Ga)), data = YGS, method = "em", weights = YGS[, "w"]) ## predicted values for the test set blups <- sol$ranef[[1]][[1]] rownames(blups) <- colnames(Ga) blups$gid <- rownames(blups) top5pred <- rownames(blups[order(blups[,1], decreasing = TRUE)[1:5],1:2]) predicted.blups <- blups[blups$gid %in% itest,] observed.blups <- Y[itest, "BLUE"] output2 <- rbind(output2, data.frame( OTS = names(OTS)[i], PA = cor(predicted.blups[,1], observed.blups), CS = sum(top5pred %in% top5)/length(top5), hm = sol$var[1,1]/sum(sol$var[,1]), LL = logLik(sol)[1], AIC = (-2*logLik(sol) + 2*dim(sol$var)[1])[1]) ) } library(knitr) kable(output2) library(plotly) (p2 <- plot_ly(output2, x = ~OTS, y = ~ PA, type = "bar")) ######################################### # GS multi-trait + Ga + new way to fold ######################################### # loading data pheno <- readRDS("pheno") head(pheno) tail(pheno) pheno <- droplevels.data.frame(pheno[pheno$type == "sc",]) # correlation among traits round(cor(pheno[,7:9]),2) library("PerformanceAnalytics") chart.Correlation(as.matrix(pheno[,7:9]), histogram = TRUE, pch = 19) # selecting just MPS and NAE, and adjusting for normality require(bestNormalize) pheno$MPSadj <- bestNormalize(pheno$MPS, standardize = FALSE, allow_orderNorm = TRUE, out_of_sample = FALSE)$x.t pheno$NAEadj <- bestNormalize(pheno$NAE, standardize = FALSE, allow_orderNorm = TRUE, out_of_sample = FALSE)$x.t head(pheno) # estimating the BLUES library(breedR) MPS <- remlf90(fixed = MPSadj ~ N + gid, random = ~ rep, method = "em", data = pheno) NAE <- remlf90(fixed = NAEadj ~ N + gid, random = ~ rep, method = "em", data = pheno) Y <- data.frame(gid = rownames(MPS$fixed$gid[[1]]), MPS = MPS$fixed$gid[[1]], NAE = NAE$fixed$gid[[1]])[,c(1,2,4)] colnames(Y)[2:3] <- c("MPS", "NAE") head(Y) # covariance matrices covg <- matrix(c(1, cov(Y[,2], Y[,3]),cov(Y[,2], Y[,3]),1), 2, 2) colnames(covg) <- rownames(covg) <- colnames(Y)[2:3] covg covr <- matrix(c(1, 0, 0, 1), 2, 2) colnames(covr) <- rownames(covr) <- colnames(Y)[2:3] covr # a new way (and best) to fold (fivefold with four replicates) fivefoldCV <- function(gid, seed = 29121983){ require(agricolae) gid = as.character(gid) replicates <- 4 folds <- 5 lb <- length(gid) s <- lb / folds odds <- seq(from = 1, to = s, by = 2) dvby3 <- (s / 3) - trunc(s / 3) while (!s %in% odds | dvby3 == 0) { lb <- (lb - 1) s <- lb / folds # must be odd and not divisible by three dvby3 <- (s / 3) - trunc(s / 3) # Thus, it must be != 0 } rest <- length(gid) - lb if (rest == 0) { book <- design.alpha(trt = gid, r = replicates, s = s, k = folds, seed = seed)$book book <- data.frame(replicate = book[,5], fold = book[,2], gid = book[,4]) rownames(book) <- NULL return(book) } if (rest > 0) { book <- design.alpha(trt = gid[1:lb], r = replicates, s = s, k = folds, seed = seed)$book book <- data.frame(replicate = book[,5], fold = book[,2], gid = book[,4]) book2 <- data.frame(replicate = rep(1:replicates, each = rest), fold = c(sample(1:folds, rest, replace = TRUE), sample(1:folds, rest, replace = TRUE), sample(1:folds, rest, replace = TRUE), sample(1:folds, rest, replace = TRUE)), gid = c(gid[(lb + 1):length(gid)], gid[(lb + 1):length(gid)], gid[(lb + 1):length(gid)], gid[(lb + 1):length(gid)])) book3 <- rbind(book, book2) rownames(book3) <- NULL non.ortho <- rest/length(gid)*100 cat(rest, "individual were not assigned in the main scheme, that represent", non.ortho, "%") output <- list(CV = book3, size = length(gid), rest = rest, non.ortho = non.ortho) return(output) } } library(agricolae) crossvalidation <- fivefoldCV(Y$gid) head(crossvalidation[[1]], 12) (replicates <- length(unique(crossvalidation$CV$replicate))) (nfold <- length(unique(crossvalidation$CV$fold))) output <- data.frame() GEBV <- data.frame() for (r in 1:replicates) { cat("Processing the replicate", r, "\n") for (k in 1:nfold) { cat("Processing the fold", k, "\n") itest <- as.character(crossvalidation$CV[crossvalidation$CV$replicate == r & crossvalidation$CV$fold == k,]$gid) gid <- as.character(crossvalidation$CV[crossvalidation$CV$replicate == r, "gid"]) itrain <- gid[!gid %in% itest] YGS <- Y YGS[itest, c(2, 3)] <- NA sol <- remlf90(fixed = cbind(YGS[,2], YGS[,3]) ~ 1, generic = list(A = list(Za, Ga, var.ini = covg)), data = YGS, method = "em", var.ini = list(covr)) ## predicted values for the test set blupsA <- as.matrix(sol$ranef[[1]][[1]]) rownames(blupsA) <- colnames(Ga) predicted.blupsA <- as.matrix(blupsA[itest,]) observed.blupsA <- Y[itest, 2] blupsB <- as.matrix(sol$ranef[[1]][[2]]) rownames(blupsB) <- colnames(Ga) predicted.blupsB <- as.matrix(blupsB[itest,]) observed.blupsB <- Y[itest, 3] output <- rbind(output, data.frame( rep = r, fold = k, PA.A = cor(predicted.blupsA[,1], observed.blupsA), PA.B = cor(predicted.blupsB[,1], observed.blupsB) ) ) GEBV = rbind(GEBV, data.frame( gid = rownames(blupsB[itest,]), gebvA = predicted.blupsA[,1], gebvB = predicted.blupsB[,1])) } } output # mean of predicting ability (result <- round(apply(output[,-c(1:2)], 2, mean), 2)) # standard deviations for A and B (sd.PA.A <- round(sd(tapply(output$PA.A, output$rep, mean)), 2)) (sd.PA.B <- round(sd(tapply(output$PA.B, output$rep, mean)), 2)) # Estimating the average of the GEBV dim(GEBV) head(GEBV) GEBV2 <- cbind(tapply(GEBV$gebvA, GEBV$gid, mean), tapply(GEBV$gebvB, GEBV$gid, mean)) head(GEBV2) GEBV2 <- GEBV2[(match(rownames(M), rownames(GEBV2))),] GEBV3 <- as.matrix(GEBV2) colnames(GEBV3) <- colnames(Y)[2:3] head(GEBV3) # the new correlation between the traits cor(GEBV3) chart.Correlation(as.matrix(GEBV3), histogram = TRUE, pch = 19) ######################################## # components of variance and parameters sol$var (hA <- sol$var$A[1,1] / sum(sol$var$A[1,1], sol$var$Residual[1,1])) (hB <- sol$var$A[2,2] / sum(sol$var$A[2,2], sol$var$Residual[2,2])) (corAB <- sol$var$A[1,2] / sqrt(sol$var$A[1,1]*sol$var$A[2,2])) # example of one replicate head(YGS, 20)