##########################
# Lab 2 - Population structure based on markers
##########################

# loading the files
genotypes <- readRDS("M")
genotypes[1:14, 1:10]  

# considering just the parents
genotypes <- genotypes[1:14,]
dim(genotypes)

# this parantes repret two sources of germplasm N and P
group <- as.factor(rep(c("P", "N"), each = 7))
group

# svd decomposition - by individuals
svdG <- svd(genotypes, nu = 14, nv = 216)
plot(cumsum((svdG$d[1:20])^2/sum(svdG$d^2)), ylab = "proportion accumulated", xlab = "number of individuals", col = "red")

# obtainig the eigenvectors and eigenvalues
pcsG <- genotypes %*% svdG$v
rownames(pcsG) <- rownames(genotypes)
dim(pcsG)
pcsG[1:14,1:6]

# creating a vector of color based on the subgroups
color <- rep(0, length(group))
for (i in 1:length(group)){
  if (group[i] == "P") {color[i] <- "red"}
  if (group[i] == "N") {color[i] <- "blue"}
}
color


# PCA graphs
# proportion explained by the first componentes
axispcs <- paste((round(svdG$d[1:14]^2/sum(svdG$d^2)*100))[1:3], "%", sep = "")
axispcs

# 3D graph 
library(scatterplot3d)
scatterplot3d(pcsG[,1], pcsG[,2], pcsG[,3], xlab = axispcs[1], ylab = axispcs[2], zlab = axispcs[3], axis = TRUE, color = color, highlight.3d = FALSE, box = TRUE, angle = 50)

# 2D graph
plot(x = pcsG[,1], y = pcsG[,2], xlab = axispcs[1], ylab = axispcs[2], col = color, main = "PC 1 vs PC 2")
legend("topright", pch = "o", col = c("red", "blue"), c("P", "N"), bty ="p", cex =.7, box.col = "grey")


################################ 
# prepare hierarchical clusters
###############################

hc <- hclust(dist(genotypes))
# very simple dendrogram
plot(hc)

# using dendrogram objects
hcd <- as.dendrogram(hc)
# alternative way to get a dendrogram
plot(hcd, type = "triangle")

library(ape)
plot(as.phylo(hc), type = "fan", tip.color = color)
plot(as.phylo(hc), type = "unrooted", tip.color = color)

##########################
# signatures of selection 
##########################
# FST method

M <- t(genotypes)

# calculate frequencies
X <- matrix(NA, nrow(M), 2)
colnames(X) <- c("P","N")

# P
X[,1] <- apply(M[,1:7],1,function(x) sum(x)/(length(x)*2))

# N
X[,2] <- apply(M[,8:14],1,function(x) sum(x)/(length(x)*2))

# FST
meansX <- rowMeans(X) # average allele frequency across populations
alleleVar <- meansX * (1 - meansX) # p*q variance
meanDevX <- X - meansX # deviation of each population from mean
FST <- meanDevX^2 / alleleVar # deviation squared divided by var
rownames(FST) <- colnames(genotypes)
head(FST)

# creting the graphs
install.packages("lokern")
library(lokern)
smoothP = lokerns(FST[,1], n.out = 21) #26 points, each one with 10 SNP
smoothN = lokerns(FST[,2], n.out = 21)

par(mfrow=c(1,2))
plot(FST[,1], type = "l",xlab = "SNP",ylab = "Fst",col = "gray", main = "P")
lines(smoothP$est, type = "l", col = "red",lwd = 1)

plot(FST[,2], type = "l",xlab = "SNP",ylab = "Fst",col = "gray", main = "N")
lines(smoothN$est,type = "l", col = "red",lwd = 1)
dev.off()


#######################
# opening the black box
#######################

# Quality control
#source("https://bioconductor.org/biocLite.R")
#biocLite("impute")

#devtools::install_github(repo = 'italo-granato/snpReady', ref = 'dev')
library(snpReady)

args(popgen)

# datasets 
genotypes[1:14,1:6]
group

#estimating pop gen parameters, for individuals, markers and subgroups
population <- popgen(genotypes, subgroups = group, plot = TRUE)

# whole population
population$whole$Markers[1:14,]
population$whole$Genotypes
population$whole$Population
population$whole$Variability

# within groups (P example)
population$bygroup$P$Markers[1:14,]
population$bygroup$P$Genotypes
population$bygroup$P$Population
population$bygroup$P$Variability
population$bygroup$P$exclusive
population$bygroup$P$fixed


# Fst
population$bygroup$F.stats$Genotypes
population$bygroup$F.stats$Markers[1:14,]


# Also, there are many graphs in your folder...

#############################
############################