########################## # Lab 1 - Quality control ########################## ############# # Phenotypes ############# # loading the phenotyping file pheno <- read.table("pheno.txt", header = T, na.strings = "NA") head(pheno) # show the first six rows tail(pheno) # show the last six rows str(pheno) # present the flies' structure # adjusting the factors pheno$gid <- as.factor(pheno$gid) # transform it into a factor pheno$female <- as.factor(pheno$female) pheno$male <- as.factor(pheno$male) pheno$rep <- as.factor(pheno$rep) str(pheno) #saving the newest file saveRDS(pheno, "pheno") # this kind of file is much better for R # loading again... pheno <- readRDS("pheno") head(pheno) # correlation among traits round(cor(pheno[,7:9]),2) #install.packages("PerformanceAnalytics") library("PerformanceAnalytics") chart.Correlation(as.matrix(pheno[,7:9]), histogram = TRUE, pch = 1) # identifying outliers boxplot(pheno$MPS, col = "red") #install.packages("lme4") library(lme4) fit <- lm(MPS ~ 1, data = pheno) #install.packages("car") library(car) outlierTest(fit) # Bonferroni p‐value forthe most extreme observations outlier.MPS <- boxplot.stats(pheno$MPS)$out # outlier values outlier.MPS # removing the outliers pheno2 <- pheno[!pheno$MPS %in% outlier.MPS,] head(pheno2) dim(pheno) dim(pheno2) # testing for normality # First lets check using patterns shapiro.test(rnorm(length(pheno2$SRA))) # normal distribution shapiro.test(runif(length(pheno2$SRA))) # uniform distribution # then, shapiro.test(pheno2$SRA) # install.packages("bestNormalize") require(bestNormalize) MPSadj <- bestNormalize(pheno2$MPS, standardize = FALSE, allow_orderNorm = TRUE, out_of_sample = FALSE) MPSadj$chosen_transform shapiro.test(MPSadj$x.t) pheno2$MPSadj <- MPSadj$x.t par(mfrow = c(1,2)) # organize the plot window in 1 row and 2 col hist(pheno2$MPS, col = "red", main = "MPS", xlab = "MPS") hist(pheno2$MPSadj, col = "blue", main = "Adjusted MPS", xlab = "Adjusted MPS") # What about the residuals? #install.packages("lme4") #library(lme4) fit <- lm(MPS ~ 1, data = pheno2) fit2 <- lm(MPS ~ type + N + rep, data = pheno2) fit3 <- lm(MPSadj ~ type + N + rep, data = pheno2) # Quartile‐Quartile(Q‐Q) normality plot for residuals par(mfrow = c(1,3)) qqnorm(resid(fit)) qqline(resid(fit), col = "red") qqnorm(resid(fit2)) qqline(resid(fit2), col = "blue") qqnorm(resid(fit3)) qqline(resid(fit3), col = "green") #double check for the sceneario shapiro.test(fit3$residuals) # saving the newest pheno file str(pheno2) saveRDS(pheno2, "pheno2") ############## # Markers ############## # loading the files geno <- read.table("geno.txt", header = TRUE, row.names = 1) geno[1:8, 1:8] # first 8 rows and col tail(geno[, 1:8]) dim(geno) # gives the dimensions of the dataset map <- read.table("map.txt", header = TRUE) rownames(map) <- map$marker # replace the rownames head(map) tail(map) dim(map) str(map) map$chr <- as.factor(map$chr) str(map) all(colnames(geno) == rownames(map)) # check if they are equal # Quality control #source("https://bioconductor.org/biocLite.R") #biocLite("impute") #devtools::install_github(repo = 'italo-granato/snpReady', ref = 'dev') library(snpReady) args(raw.data) QC <- raw.data(data = as.matrix(geno), frame = "wide", hapmap = map, maf = 0.05, call.rate = 0.90, base = TRUE, imput = TRUE, imput.type = "knni", outfile = "012", plot = TRUE) # The report of the quality control approaches QC$report # get the newest dataset of markers M <- QC$M.clean dim(M) M[1:10, 1:10] # and the hapmap hapmap <- QC$Hapmap dim(hapmap) head(hapmap) ################### # LD decay and prunning M2 <- M M2[M2 == "1"] <- "AB" M2[M2 == "2"] <- "AA" M2[M2 == "0"] <- "BB" M2[1:8,1:8] map2 <- hapmap[,2:3] colnames(map2) <- c("chr","pos") map2$chr <- as.numeric(map2$chr) head(map2) #install.packages("synbreed") library(synbreed) gpdata <- create.gpData(geno = M2,map = map2, reorderMap = F, map.unit = "bp") data <- codeGeno(gpData = gpdata,impute = F,maf = NULL, nmiss = NULL, label.heter = "AB") # estimating the LD between markers LDs <- pairwiseLD(data, chr = c(1:10), type = "data.frame", cores = 2) head(LDs$chr_1) # plot the LD decay by chromossome par(mfrow = c(5,2)) for (i in 1:length(names(LDs))) { LDDist(LDs, chr = i) } dev.off() # which one is high correlated on chromossome 1? LDs$chr_1[LDs$chr_1$r2 > 0.99,] # doing the same but for all chromossomes - threshold of 0.99 prune <- c() # identify all markers in high LD for (i in 1:length(names(LDs))) { markers <- LDs[[i]][LDs[[i]]$r2 > 0.99,]$marker2 prune <- c(prune, markers) } # number and whhat markers will be eliminated length(prune) prune M3 <- M[,!colnames(M) %in% prune] dim(M3) M3[1:6, 1:17] saveRDS(M3, "M") # and the hapmap hapmap2 <- hapmap[!rownames(hapmap) %in% prune,] dim(hapmap2) head(hapmap2) saveRDS(hapmap2, "hapmap") # double check all(colnames(M3) == rownames(hapmap2))