########################## # Lab 1 - Quality control ########################## ############# # Phenotypes ############# # loading the phenotyping file pheno <- read.table("pheno.txt", header = T, na.strings = "NA") head(pheno) tail(pheno) str(pheno) # adjusting the factors pheno$gid <- as.factor(pheno$gid) pheno$female <- as.factor(pheno$female) pheno$male <- as.factor(pheno$male) pheno$rep <- as.factor(pheno$rep) str(pheno) #saving the newest file saveRDS(pheno, "pheno") # loading again... pheno <- readRDS("pheno") head(pheno) # identifying outliers boxplot(pheno$MPS, col = "red") install.packages("lme4") library(lme4) fit <- lm(MPS ~ 1, data = pheno) install.packages("car") library(car) outlierTest(fit) # Bonferroni p‐value for most extreme observation for MPS outlier.MPS <- boxplot.stats(pheno$MPS)$out # outlier values for MPS outlier.MPS # removing the outliers pheno2 <- pheno[!pheno$MPS %in% outlier.MPS,] head(pheno2) dim(pheno) dim(pheno2) # testing for normality shapiro.test(pheno2$MPS) # lets check using patterns shapiro.test(rnorm(length(pheno2$MPS))) shapiro.test(runif(length(pheno2$MPS))) # transforming the data for normality by BoxCox MPSadj <- powerTransform(pheno2$MPS, family = "bcPower") MPSadj$lambda boxCox(pheno2$MPS~1) pheno2$MPSadj <- as.numeric(bcPower(pheno2$MPS, MPSadj$lambda)) # Let's see the results head(pheno2) shapiro.test(pheno2$MPSadj) par(mfrow = c(1,2)) hist(pheno2$MPS) hist(pheno2$MPSadj) dev.off() # Another way to transform phenotypes install.packages("GenABEL") library(GenABEL) pheno2$MPSadj2 <- as.numeric(rntransform(pheno2$MPS)) shapiro.test(pheno2$MPSadj2) par(mfrow = c(1,3)) hist(pheno2$MPS) hist(pheno2$MPSadj) hist(pheno2$MPSadj2) dev.off() # Quartile‐Quartile(Q‐Q) normality plot for residuals qqnorm(resid(fit)) qqline(resid(fit), col = "red") saveRDS(pheno2, "pheno2") ############## # Markers ############## # loading the files geno <- read.table("geno.txt", header = TRUE, row.names = 1) geno[1:10, 1:20] tail(geno[, 1:20]) dim(geno) map <- read.table("map.txt", header = TRUE) rownames(map) <- map$marker head(map) tail(map) dim(map) str(map) map$chr <- as.factor(map$chr) str(map) all(colnames(geno) == rownames(map)) # Quality control source("https://bioconductor.org/biocLite.R") biocLite("impute") devtools::install_github(repo = 'italo-granato/snpReady', ref = 'dev') library(snpReady) args(raw.data) QC <- raw.data(data = as.matrix(geno), frame = "wide", hapmap = map, maf = 0.05, call.rate = 0.90, base = FALSE, imput = TRUE, imput.type = "knni", plot = TRUE) # The report of the quality control approaches QC$report # get the newest dataset of markers M <- QC$M.clean dim(M) M[1:10, 1:10] saveRDS(M, "M") QC$Hapmap # and the hapmap hapmap <- QC$Hapmap dim(hapmap) head(hapmap) saveRDS(hapmap, "hapmap") #############################