########################## # Lab 2 - Population structure based on markers ########################## # loading the files genotypes <- readRDS("M") genotypes[1:14, 1:10] # considering just the parents genotypes <- genotypes[1:14,] dim(genotypes) # this parantes repret two sources of germplasm N and P group <- as.factor(rep(c("P", "N"), each = 7)) group # svd decomposition - by individuals svdG <- svd(genotypes, nu = 14, nv = 216) plot(cumsum((svdG$d[1:20])^2/sum(svdG$d^2)), ylab = "proportion accumulated", xlab = "number of individuals", col = "red") # obtainig the eigenvectors and eigenvalues pcsG <- genotypes %*% svdG$v rownames(pcsG) <- rownames(genotypes) dim(pcsG) pcsG[1:14,1:6] # creating a vector of color based on the subgroups color <- rep(0, length(group)) for (i in 1:length(group)){ if (group[i] == "P") {color[i] <- "red"} if (group[i] == "N") {color[i] <- "blue"} } color # PCA graphs # proportion explained by the first componentes axispcs <- paste((round(svdG$d[1:14]^2/sum(svdG$d^2)*100))[1:3], "%", sep = "") axispcs # 3D graph library(scatterplot3d) scatterplot3d(pcsG[,1], pcsG[,2], pcsG[,3], xlab = axispcs[1], ylab = axispcs[2], zlab = axispcs[3], axis = TRUE, color = color, highlight.3d = FALSE, box = TRUE, angle = 50) # 2D graph plot(x = pcsG[,1], y = pcsG[,2], xlab = axispcs[1], ylab = axispcs[2], col = color, main = "PC 1 vs PC 2") legend("topright", pch = "o", col = c("red", "blue"), c("P", "N"), bty ="p", cex =.7, box.col = "grey") ################################ # prepare hierarchical clusters ############################### hc <- hclust(dist(genotypes)) # very simple dendrogram plot(hc) # using dendrogram objects hcd <- as.dendrogram(hc) # alternative way to get a dendrogram plot(hcd, type = "triangle") library(ape) plot(as.phylo(hc), type = "fan", tip.color = color) plot(as.phylo(hc), type = "unrooted", tip.color = color) ########################## # signatures of selection ########################## # FST method M <- t(genotypes) # calculate frequencies X <- matrix(NA, nrow(M), 2) colnames(X) <- c("P","N") # P X[,1] <- apply(M[,1:7],1,function(x) sum(x)/(length(x)*2)) # N X[,2] <- apply(M[,8:14],1,function(x) sum(x)/(length(x)*2)) # FST meansX <- rowMeans(X) # average allele frequency across populations alleleVar <- meansX * (1 - meansX) # p*q variance meanDevX <- X - meansX # deviation of each population from mean FST <- meanDevX^2 / alleleVar # deviation squared divided by var rownames(FST) <- colnames(genotypes) head(FST) # creting the graphs install.packages("lokern") library(lokern) smoothP = lokerns(FST[,1], n.out = 21) #26 points, each one with 10 SNP smoothN = lokerns(FST[,2], n.out = 21) par(mfrow=c(1,2)) plot(FST[,1], type = "l",xlab = "SNP",ylab = "Fst",col = "gray", main = "P") lines(smoothP$est, type = "l", col = "red",lwd = 1) plot(FST[,2], type = "l",xlab = "SNP",ylab = "Fst",col = "gray", main = "N") lines(smoothN$est,type = "l", col = "red",lwd = 1) dev.off() ####################### # opening the black box ####################### # Quality control #source("https://bioconductor.org/biocLite.R") #biocLite("impute") #devtools::install_github(repo = 'italo-granato/snpReady', ref = 'dev') library(snpReady) args(popgen) # datasets genotypes[1:14,1:6] group #estimating pop gen parameters, for individuals, markers and subgroups population <- popgen(genotypes, subgroups = group, plot = TRUE) # whole population population$whole$Markers[1:14,] population$whole$Genotypes population$whole$Population population$whole$Variability # within groups (P example) population$bygroup$P$Markers[1:14,] population$bygroup$P$Genotypes population$bygroup$P$Population population$bygroup$P$Variability population$bygroup$P$exclusive population$bygroup$P$fixed # Fst population$bygroup$F.stats$Genotypes population$bygroup$F.stats$Markers[1:14,] # Also, there are many graphs in your folder... ############################# ############################