if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("tximport")
BiocManager::install("DESeq2")
install.packages("VennDiagram")
install.packages("mclust")
library(tximport)
library(VennDiagram)
library(DESeq2)
library(reshape2)
library(pheatmap)
library(ggplot2)
library(mclust)

rm(list=ls())
setwd("~/RNASeqPracticalTranscriptLevels/")
targets<-read.csv("RNASeqTranscriptLevelsCleanDATA/target.csv",header=TRUE)
targets
rownames(targets)<-targets$SampleName
targets

files <- paste("~/RNASeqPracticalTranscriptLevels/Salmon/", targets$SampleName, "_salmon/quant.sf",sep="")
files
tx2gene<-read.delim("ArabidopsiscDNAINDEX/tx2gene.txt",header=FALSE)
tx2gene
names(files) <- targets$SampleName
files
all(file.exists(files))
txi.salmon <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut=FALSE)

head(txi.salmon$abundance)

Arabtpm<-txi.salmon$abundance

dds <- DESeqDataSetFromTximport(txi.salmon, colData = targets, design = ~Condition)
dds$Condition <-relevel(dds$Condition, ref='MOCK')
head(dds)
dds <- DESeq(dds)

vsd <- vst(dds, blind=FALSE)

#Check distribution of samples in a PCA, showing the top 500 varying genes
plotPCA(vsd, intgroup=c("Genotype"),ntop=500)
plotPCA(vsd, intgroup=c("Condition"),ntop=500)
plotPCA(vsd, intgroup=c("Condition","Genotype"),ntop=500)

#Show clustering of samples, should be in agreement with PCA
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Condition, vsd$Genotype, sep="-")
colnames(sampleDistMatrix) <- paste(vsd$Condition, vsd$Genotype, sep="-")
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists)
#Getting differentially expressed genes, comparisons against control (MOCK)
res_SALT_vs_MOCK <- results(dds, lfcThreshold=1, altHypothesis="greaterAbs", contrast = c('Condition','SALT','MOCK'), alpha=0.05)
res_ABA_vs_MOCK <- results(dds, lfcThreshold=1, altHypothesis="greaterAbs", contrast = c('Condition','ABA','MOCK'), alpha=0.05)
res_DEHYD_vs_MOCK <- results(dds, lfcThreshold=1, altHypothesis="greaterAbs", contrast = c('Condition','DEHYD','MOCK'), alpha=0.05)

summary(res_SALT_vs_MOCK)
summary(res_ABA_vs_MOCK)
summary(res_DEHYD_vs_MOCK)

sig_SALT_vs_MOCK<-res_SALT_vs_MOCK[which(res_SALT_vs_MOCK$padj<0.05),]
dim(sig_SALT_vs_MOCK)
head(sig_SALT_vs_MOCK)
sig_ABA_vs_MOCK<-res_ABA_vs_MOCK[which(res_ABA_vs_MOCK$padj<0.05),]
dim(sig_ABA_vs_MOCK)
head(sig_ABA_vs_MOCK)
sig_DEHYD_vs_MOCK<-res_DEHYD_vs_MOCK[which(res_DEHYD_vs_MOCK$padj<0.05),]
dim(sig_DEHYD_vs_MOCK)
head(sig_DEHYD_vs_MOCK)

#Exploring DGE results
dev.off()
drawLines <- function() abline(h=c(-1,1),col="dodgerblue",lwd=2)
plotMA(res_ABA_vs_MOCK); drawLines()
plotMA(res_SALT_vs_MOCK); drawLines()
plotMA(res_DEHYD_vs_MOCK); drawLines()

#Let's plot the vulcano plots seen in class

res_ABA_vs_MOCK_df<-as.data.frame(res_ABA_vs_MOCK)
res_ABA_vs_MOCK_df$Sig<-'NotSig'
res_ABA_vs_MOCK_df[which(res_ABA_vs_MOCK_df$padj < 0.05),'Sig']<-'Yes'
ggplot(res_ABA_vs_MOCK_df,aes(x=log2FoldChange, y=log(padj,base=10)*-1, colour=Sig))+
  theme_bw()+
  geom_point()#+
#  ylim(0,5)+
#  xlim(-2,2)

#Getting the TPM for the DE genes
TPM_SALT_vs_MOCK<-Arabtpm[which(rownames(Arabtpm) %in% 
                                  rownames(sig_SALT_vs_MOCK)),
                          as.character(targets[which(targets$Condition %in%
                                                       c('SALT','MOCK')),
                                               'SampleName'])]
head(Arabtpm)
dim(Arabtpm)
dim(TPM_SALT_vs_MOCK)
head(TPM_SALT_vs_MOCK)
TPM_ABA_vs_MOCK<-Arabtpm[which(rownames(Arabtpm) %in% rownames(sig_ABA_vs_MOCK)),
                         as.character(targets[which(targets$Condition %in% c('ABA','MOCK')),'SampleName'])]

TPM_DEHYD_vs_MOCK<-Arabtpm[which(rownames(Arabtpm) %in% rownames(sig_DEHYD_vs_MOCK)),
                           as.character(targets[which(targets$Condition %in% c('DEHYD','MOCK')),'SampleName'])]

annot_col= data.frame(Genotype=factor(targets$Genotype),
                      Condition=factor(targets$Condition))
rownames(annot_col)<-targets$SampleName
pheatmap(TPM_SALT_vs_MOCK, scale='row', annotation_col = annot_col)
pheatmap(TPM_ABA_vs_MOCK, scale='row', annotation_col = annot_col)
pheatmap(TPM_DEHYD_vs_MOCK, scale='row', annotation_col = annot_col)

#CreATE A vENN DIAGRAM WITH degS
#https://www.r-graph-gallery.com/14-venn-diagramm.html


# Prepare a palette of 3 colors with R colorbrewer:
library(RColorBrewer)
myCol <- brewer.pal(3, "Pastel2")
venn.diagram(x=list(rownames(TPM_SALT_vs_MOCK),rownames(TPM_ABA_vs_MOCK),rownames(TPM_DEHYD_vs_MOCK)),
             category.names = c("SALT_vs_MOCK" , "ABA_vs_MOCK " , "DEHYD_vs_MOCK"),
             filename='VennDiagramDEGsimple.png',
             imagetype = 'png',
             output=TRUE)


venn.diagram(x=list(rownames(TPM_SALT_vs_MOCK),rownames(TPM_ABA_vs_MOCK),rownames(TPM_DEHYD_vs_MOCK)),
             category.names = c("SALT_vs_MOCK" , "ABA_vs_MOCK " , "DEHYD_vs_MOCK"),
             filename='VennDiagramDEG.png',
             imagetype = 'png',
             output=TRUE,
             
             # Circles
             lwd = 2,
             lty = 'blank',
             fill = myCol,
             
             # Set names
             cat.cex = 0.6,
             cat.fontface = "bold",
             cat.default.pos = "outer",
             cat.pos = c(-27, 27, 135),
             cat.dist = c(0.055, 0.055, 0.085),
             cat.fontfamily = "sans",
             rotation = 1)

#Clustering expression profiles
#let's get all the TPM for all the DEG

a<-union(rownames(TPM_SALT_vs_MOCK),rownames(TPM_ABA_vs_MOCK))
b<-union(rownames(TPM_DEHYD_vs_MOCK),a)
length(b)

TPM_DEGs<-as.data.frame(Arabtpm[which(rownames(Arabtpm) %in% b),])
dim(TPM_DEGs)
head(TPM_DEGs)
targets
TPM_DEG_means<-as.data.frame(matrix(ncol=4,nrow=nrow(TPM_DEGs),data = NA))
colnames(TPM_DEG_means)<-c('DEHYD','SALT','ABA','MOCK')

TPM_DEG_means$DEHYD<-rowMeans(TPM_DEGs[,as.character(targets[which(targets$Condition=='DEHYD'),'SampleName'])])
TPM_DEG_means$SALT<-rowMeans(TPM_DEGs[,as.character(targets[which(targets$Condition=='SALT'),'SampleName'])])
TPM_DEG_means$ABA<-rowMeans(TPM_DEGs[,as.character(targets[which(targets$Condition=='ABA'),'SampleName'])])
TPM_DEG_means$MOCK<-rowMeans(TPM_DEGs[,as.character(targets[which(targets$Condition=='MOCK'),'SampleName'])])
rownames(TPM_DEG_means)<-rownames(TPM_DEGs)

head(TPM_DEG_means)
head(TPM_DEGs)
scale(TPM_DEG_means)

Zval<-function(x){
  xmean<-mean(x)
  xsd<-sd(x)
  z<-(x-xmean)/xsd
  return(z)
}

ArabZval<-as.data.frame(t(apply(TPM_DEG_means, 1, Zval)))
head(ArabZval)
head(ArabZval)
head(TPM_DEG_means)

ArabClustering<-Mclust(ArabZval,G=35) #In real life we need to apply different technique to choose the proper number of clusters

#Number of genes per cluster
ArabClusters<-as.data.frame(ArabClustering$classification)
head(ArabClusters)
ArabClusters$Gene<-rownames(ArabClusters)
colnames(ArabClusters)<-c('Cluster','Gene')
table(ArabClusters$Cluster)

#Plotting the expression profile of genes in a given cluster
Cluster11<-ArabZval[which(rownames(ArabZval) %in% ArabClusters[which(ArabClusters$Cluster == 11),"Gene"]),]
head(Cluster11)
Cluster11<-as.data.frame(Cluster11)
Cluster11$Gene<-rownames(Cluster11)
head(Cluster11)
Cluster11_melt<-melt(Cluster11,id.vars = 'Gene')
head(Cluster11_melt)
ggplot(Cluster11_melt, aes(y=value,x=variable,group=Gene))+
  geom_line(colour='black')+
  theme_bw()+
  ylab('Relative expression value')+
  xlab('Condition')

ggplot(Cluster11_melt, aes(y=value,x=variable,fill=variable))+
  geom_boxplot()+
  theme_bw()+
  ylab('Relative expression value')+
  xlab('Condition')