########################################################################### # GS by Bayes models ##################################### # loading data Y <- readRDS("phenoGS") head(Y) str(Y) #identifying the five best hybrids (top5 <- droplevels(Y[order(Y[,"BLUE"], decreasing = TRUE)[1:5],"gid"])) # loading the marker file M <- readRDS("M") dim(M) # selecting just the hybrids match(Y$gid, rownames(M)) M <- M[match(Y$gid, rownames(M)),] dim(M) head(M[,1:6]) tail(M[,1:6]) all(Y$gid == rownames(M)) ## supply necessary libraries library(BGLR) # Bayesian Generalized Linear Regression library(reshape2) # cast() & melt() library(plyr) # extended apply family library(ggplot2) # gramar graphics options(digits = 4) # printing option! sessionInfo() # useful infos **Reproducible Research** # centering the SNP matrix snpc <- scale(M, center = TRUE, scale = TRUE) ## genomic relationship matrix G <- tcrossprod(snpc) / ncol(snpc) ## linear predictor for G blup eta <- list(list(K = G, model = 'RKHS')) ## fitting the model gblup <- BGLR(y = Y[,2], ETA = eta, nIter = 5000, burnIn = 1000, thin = 5, saveAt = 'gblup_', verbose = T) ## predictions yHat <- gblup$yHat yHat # predicted vs observed p <- ggplot(mapping = aes(x = Y[,2], y = gblup$yHat)) + geom_point() + geom_smooth(method = 'lm', se = TRUE) library(plotly) ggplotly(p) ## godness of fit and related statistics gblup$fit gblup$varE # residual variance var(Y[,2]) # phnotypic variance # genomic heritability (h <- var(Y[,2]) - gblup$varE) / var(Y[,2]) ## variance components gblup$ETA[[1]]$varU gblup$ETA[[1]]$varU / var(Y[,2]) ## markov chains list.files() ## residual variance s2e <- scan('gblup_varE.dat') # all the gibbs resamplings within the chain but using the thin = 5 ## posterior distribution - densidade posteriori ggplot(mapping = aes(x = s2e)) + geom_density() ## trace ggplot(mapping = aes(x = 1:length(s2e), y = s2e)) + geom_line() ## 'genomic' variance s2g <- scan('gblup_ETA_1_varU.dat') ## posterior distribution - densidade posteriori ggplot(mapping = aes(x = s2g)) + geom_density() ## trace ggplot(mapping = aes(x = 1:length(s2g), y = s2g)) + geom_line() ########################################################################### ## cross validation by bootstrap ## Creating a Testing set ## do it equal all the times set.seed(29121983) ## dimensions n <- nrow(snpc) # number of genotypes p <- ncol(snpc) # number of snps f <- .2 # 20% will used for testing s <- f * n # number of testing genotypes ## copy phenotypes to avoid confusion Y_copy <- Y[,2] ## which individual goes to each set tst <- sample(1:n, size = s, replace = FALSE) tst ## replace values as missing Y_copy[tst] <- NA head(Y_copy, 10) ## fiting the model again... but, now w/ trt & tst sets gblup2 <- BGLR(y = Y_copy, ETA = eta, nIter = 5000, burnIn = 1000, thin = 5, saveAt = 'gblup2_', verbose = FALSE) q <- ggplot(mapping = aes(x = Y[,2], y = gblup2$yHat)) + geom_point(colour = factor(ifelse(is.na(Y_copy), 2, 1))) + geom_smooth(method = 'lm', se = TRUE) ggplotly(q) ## correlation in 'training' and 'testing' sets cor(gblup2$yHat[ tst], Y[,2][ tst]) # tst cor(gblup2$yHat[-tst], Y[,2][-tst]) # trt ########################################################################### ## repeating the validation process ## do cross validation process by bootstrap # r number of runs # y phenotype # G Kinship # f = proportion for test dcv <- function(r, y, G, f) { s <- length(y) * f ycopy <- y tst <- sample(1:length(y), size = s, replace = FALSE) ycopy[tst] <- NA eta <- list(list(K = G, model = 'RKHS')) fit <- BGLR(y = Y_copy, ETA = eta, nIter = 1E4, burnIn = 2E3, thin = 5, saveAt = 'gblup2_', verbose = FALSE) rhotst <- cor(fit$yHat[ tst], y[ tst]) rhotrt <- cor(fit$yHat[-tst], y[-tst]) return(c(tst = rhotst, trt = rhotrt)) } ## run several rounds of cross validation cvs <- adply(1:50, 1, dcv, y = Y[,2], G = G, f = .2) ## overal trend in cross validation head(cvs) apply(cvs[,-1], 2, range) boxplot(cvs$tst, col = "red") ###################### ## other priors ##################### ##### Using foreach to test diferent models # define factor and their levels models <- c('BRR', 'BL', 'BayesA', 'BayesB', "BayesC") # loading foreach and doFuture library(foreach) library(doFuture) registerDoFuture() plan(multiprocess) availableCores() results <- foreach(i = models, .combine = "c") %dopar% { # estimating different BV sol <- BGLR(y = Y[,2], ETA = list(snps = list(X = M, model = i)), nIter = 10000, burnIn = 2000, thin = 5, verbose = FALSE) output <- list( BV = sol$yHat) return(output) } results <- data.frame(results) colnames(results) <- models head(results) # correlation among breeding values over different priors round(cor(results), 3) # estimating different SNP effects results2 <- foreach(i = models, .combine = "c") %dopar% { sol <- BGLR(y = Y[,2], ETA = list(snps = list(X = M, model = i)), nIter = 10000, burnIn = 2000, thin = 5, verbose = FALSE) output <- list( snpe = sol$ETA$snps$b) return(output) } results2 <- data.frame(results2) colnames(results2) <- models head(results2) # correlation among SNP effects over different priors round(cor(results2), 3)