##########################
# Lab 1 - Quality control
##########################

#############
# Phenotypes
#############

# loading the phenotyping file
pheno <- read.table("pheno.txt", header = T, na.strings = "NA")
head(pheno) # show the first six rows
tail(pheno) # show the last six rows
str(pheno) # present the flies' structure

# adjusting the factors
pheno$gid <- as.factor(pheno$gid) # transform it into a factor
pheno$female <- as.factor(pheno$female)
pheno$male <- as.factor(pheno$male)
pheno$rep <- as.factor(pheno$rep)
str(pheno)

#saving the newest file
saveRDS(pheno, "pheno") # this kind of file is much better for R

# loading again...
pheno <- readRDS("pheno")
head(pheno)

# correlation among traits
round(cor(pheno[,7:9]),2)
#install.packages("PerformanceAnalytics")
library("PerformanceAnalytics")
chart.Correlation(as.matrix(pheno[,7:9]), histogram = TRUE, pch = 1)

# identifying outliers
boxplot(pheno$MPS, col = "red")
#install.packages("lme4")
library(lme4)
fit <- lm(MPS ~ 1, data = pheno)
#install.packages("car")
library(car)
outlierTest(fit) # Bonferroni p‐value forthe  most extreme observations
outlier.MPS <- boxplot.stats(pheno$MPS)$out # outlier values
outlier.MPS

# removing the outliers 
pheno2 <- pheno[!pheno$MPS %in% outlier.MPS,]
head(pheno2)
dim(pheno)
dim(pheno2)

# testing for normality
# First lets check using patterns
shapiro.test(rnorm(length(pheno2$SRA))) # normal distribution
shapiro.test(runif(length(pheno2$SRA))) # uniform distribution
# then, 
shapiro.test(pheno2$SRA)


# install.packages("bestNormalize")
require(bestNormalize)
MPSadj <- bestNormalize(pheno2$MPS, standardize = FALSE, allow_orderNorm = TRUE, out_of_sample = FALSE)
MPSadj$chosen_transform
shapiro.test(MPSadj$x.t)
pheno2$MPSadj <- MPSadj$x.t

par(mfrow = c(1,2)) # organize the plot window in 1 row and 2 col
hist(pheno2$MPS, col = "red", main = "MPS", xlab = "MPS")
hist(pheno2$MPSadj, col = "blue", main = "Adjusted MPS", xlab = "Adjusted MPS")


# What about the residuals?
#install.packages("lme4")
#library(lme4)
fit <- lm(MPS ~ 1, data = pheno2)
fit2 <- lm(MPS ~ type + N + rep, data = pheno2)
fit3 <- lm(MPSadj ~ type + N + rep, data = pheno2)

# Quartile‐Quartile(Q‐Q) normality plot for residuals
par(mfrow = c(1,3))
qqnorm(resid(fit))
qqline(resid(fit), col = "red")
qqnorm(resid(fit2))
qqline(resid(fit2), col = "blue")
qqnorm(resid(fit3))
qqline(resid(fit3), col = "green")

#double check for the sceneario
shapiro.test(fit3$residuals)

# saving the newest pheno file
str(pheno2)
saveRDS(pheno2, "pheno2")


##############
# Markers
##############

# loading the files
geno <- read.table("geno.txt", header = TRUE, row.names = 1)
geno[1:8, 1:8] # first 8 rows and col
tail(geno[, 1:8])
dim(geno) # gives the dimensions of the dataset

map <- read.table("map.txt", header = TRUE)
rownames(map) <- map$marker # replace the rownames
head(map)
tail(map)
dim(map)
str(map)
map$chr <- as.factor(map$chr)
str(map)

all(colnames(geno) == rownames(map)) # check if they are equal

# Quality control
#source("https://bioconductor.org/biocLite.R")
#biocLite("impute")
#devtools::install_github(repo = 'italo-granato/snpReady', ref = 'dev')
library(snpReady)

args(raw.data)

QC <- raw.data(data = as.matrix(geno), frame = "wide", hapmap = map, maf = 0.05, call.rate = 0.90, base = TRUE, imput = TRUE, imput.type = "knni", outfile = "012", plot = TRUE)


# The report of the quality control approaches
QC$report

# get the newest dataset of markers
M <- QC$M.clean
dim(M)
M[1:10, 1:10]

# and the hapmap
hapmap <- QC$Hapmap
dim(hapmap)
head(hapmap)

###################

# LD decay and prunning
M2 <- M
M2[M2 == "1"] <- "AB"
M2[M2 == "2"] <- "AA"
M2[M2 == "0"] <- "BB"
M2[1:8,1:8]

map2 <- hapmap[,2:3]
colnames(map2) <- c("chr","pos")
map2$chr <- as.numeric(map2$chr) 
head(map2)

#install.packages("synbreed")
library(synbreed)

gpdata <- create.gpData(geno = M2,map = map2, reorderMap = F, map.unit = "bp")
data <- codeGeno(gpData = gpdata,impute = F,maf = NULL, nmiss = NULL, label.heter = "AB")

# estimating the LD between markers
LDs <- pairwiseLD(data, chr = c(1:10), type = "data.frame", cores = 2)
head(LDs$chr_1)

# plot the LD decay by chromossome
par(mfrow = c(5,2))
for (i in 1:length(names(LDs))) {
  LDDist(LDs, chr = i) 
}
dev.off()

# which one is high correlated on chromossome 1?
LDs$chr_1[LDs$chr_1$r2 > 0.99,]

# doing the same but for all chromossomes - threshold of 0.99
prune <- c()
# identify all markers in high LD 
for (i in 1:length(names(LDs))) {
  markers <- LDs[[i]][LDs[[i]]$r2 > 0.99,]$marker2
  prune <- c(prune, markers)
}

# number and whhat markers will be eliminated
length(prune)
prune

M3 <- M[,!colnames(M) %in% prune]
dim(M3)
M3[1:6, 1:17]
saveRDS(M3, "M")

# and the hapmap
hapmap2 <- hapmap[!rownames(hapmap) %in% prune,]
dim(hapmap2)
head(hapmap2)
saveRDS(hapmap2, "hapmap")

# double check
all(colnames(M3) == rownames(hapmap2))