# subset dat by samples of interest cl <- as.character(ann[,2]) dat <- dat[,cl] # two classes of DLBCL determined from HCA in paper gc <- cl[1:19] act <- cl[20:39] # calculate means of each class for each gene gc.m <- apply(dat[,gc],1,mean,na.rm=T) act.m <- apply(dat[,act],1,mean,na.rm=T) # get fold changes (log scale) fold <- gc.m-act.m # function to calculate Student’s two-sample t-test on all genes at once # function returns the p-value for the test # NAs are removed for each test t.test.all.genes <- function(x,s1,s2) { x1 <- x[s1] x2 <- x[s2] x1 <- as.numeric(x1) x2 <- as.numeric(x2) t.out <- t.test(x1,x2, alternative=“two.sided”,var.equal=T) out <- as.numeric(t.out$p.value) return(out) } # s1 and s2 are dimensions of the two samples # run function on each gene in the data frame t.test.run <- apply(dat,1,t.test.all.genes,s1=gc,s2=act) # look at distribution of p-values hist(t.test.run,col=‘blue’) # transpose p-values p.trans <- -1 * log(t.test.run) # volcano plot #1 plot(range(p.trans),range(fold),type=‘n’,xlab=‘-1*log(p-value)’,ylab=‘fold change’,main=‘Volcano Plot’) points(p.trans,fold,col=‘black’) points(p.trans[(p.trans>3&fold>0.7)],fold[(p.trans>3&fold>0.7)],col=‘red’,pch=16) points(p.trans[(p.trans>3&fold< -0.7)],fold[(p.trans>3&fold< -0.7)],col=‘green’,pch=16) abline(v=3) abline(h=-0.7) abline(h=0.7) # one factor ANVOVA model with yarn breaks (during weaving) data set data(warpbreaks) summary(fm1 <- aov(breaks ~ tension, data = warpbreaks)) # plot of Tukey’s honestly significant difference intervals TukeyHSD(fm1, "tension", ordered = TRUE) plot(TukeyHSD(fm1, "tension")) # second volcano plot library(Biobase); library(annotate); library(golubEsets); data(geneData); dat <- geneData # floor data to 10 dat[dat<10] <- 10 fold <- apply(log(dat[,c(1:5)]),1,mean) - apply(log(dat[,c(14:18)]),1,mean) t.test.run <- apply(dat,1,t.test.all.genes,s1=c(1:5),s2=c(14:18)) t.test.run[is.na(t.test.run)]<-1 # transpose p-values p.trans <- -1 * log(t.test.run) # volcano plot #2 plot(range(p.trans),range(fold),type='n',xlab='-1*log(p-value)',ylab='fold change',main='Volcano Plot') points(p.trans,fold,col='black') points(p.trans[(p.trans>3&fold>0.7)],fold[(p.trans>3&fold>0.7)],col='red',pch=16) points(p.trans[(p.trans>3&fold< -0.7)],fold[(p.trans>3&fold< -0.7)],col='green',pch=16) text(p.trans[p.trans>3],fold[p.trans>3],labels=dimnames(dat)[[1]][p.trans[p.trans>3]],cex=0.65) abline(v=3) abline(h=-0.7) abline(h=0.7) # PLS gene selection with Spellman et al. yeast data library(yeastCC) library(Biobase) library(annotate) library(gpls) data(yeastCC) dat <- exprs(yeastCC) dat<-dat[,23:46] # only isolate the cdc15 experiment dat.m <- apply(dat,1,mean) dat <- dat[!is.na(dat.m),] # only retain genes with values y<-c(rep(1,12),rep(0,12)) # specify gene pattern # run pls to get gene weights for response pls.out <- glpls1a(t(dat),y,br=FALSE, denom.eps=0.0001)$coefficients pls.out <- pls.out[-1] # remove intercept # look at distribution of gene weights hist(pls.out,col='pink') # look at most positive and most negative gene profiles par(mfrow=c(2,1)) plot(c(1:24),dat[pls.out==max(pls.out),],type='b',col='red',xlab='samples',ylab='expression',main='Highe st gene weight') plot(c(1:24),dat[pls.out==min(pls.out),],type='b',col='red',xlab='samples',ylab='expression',main='Lowes t gene weight')