# please run following on metacentrum #qsub -l walltime=2:0:0 -q default -l select=1:ncpus=4:mem=12gb:scratch_local=10gb -I #module avail R #module load R-4.0.0-gcc #################################################################################################### #################################################################################################### # # Script to calculate differential gene expression using DESeq2 package # Designed for mRNA differential gene expression analysis based on Ensembl results using featureCounts, htseq-count, STAR counts # #################################################################################################### ############################## ALWAYS CHECK DESIGNS AND CONTRASTS!!!! ############################## #################################################################################################### # # INPUT_COUNTS - table with raw genes counts -> first column = gene id, first row = patient id # # ALWAYS CHECK DESIGN FOR DESEQ2 AND SELECT CORRECT COEF/CONTRAST BASED ON YOUR BIOL. QUESTION #################################################################################################### # General variables install.packages("gplots") # may ask to use user directory, after that from where it should download, type 28 install.packages("BiocManager") BiocManager::install("DESeq2") #################################################################################################### # PLEASE FILL YOUR OWN PATH INPUT_COUNTS<-"input_full/counts.txt" # Directory with genes.results OUTPUT_DIR<-"DE_old" SAMPLES_DESC<-"input_full/design.txt" # Samples description #################################################################################################### #show available unique(read.table(SAMPLES_DESC, header=T, sep="\t", stringsAsFactors = F)[,"condition"]) # Set comparison to be made condsToCompare<-c("untreat", "treat") #################################################################################################### # Custom variables for visualization P_THRESHOLD<-0.05 FOLD_CHANGE<-2 LFC_THRESHOLD<-log(FOLD_CHANGE, 2) TOP<-20 # How many top DE genes should be plotted ##################################### ### library for colors library("RColorBrewer") hmcol<-colorRampPalette(brewer.pal(9, "GnBu"))(100) #################################################################################################### #################################################################################################### ### Read description file samples_desc<-read.table(SAMPLES_DESC, header=T, stringsAsFactors = F, sep="\t") coldata <- read.csv(SAMPLES_DESC, sep = '\t', row.names = 1) ### Start assignment of conditions OUTPUT_DIR<-paste0(OUTPUT_DIR, "/", condsToCompare[2], "vs", condsToCompare[1]) dir.create(OUTPUT_DIR, recursive = T) ### Prepare for read count input mrcounts <- read.csv(INPUT_COUNTS, sep = '\t', row.names = 1) #################################################################################################### # Remove not expressed genes in any sample mrcounts<-mrcounts[rowSums(mrcounts)!=0,] #################################################################################################### setwd(OUTPUT_DIR) #################################################################################################### #visualization of number of input counts for particular sample library("RColorBrewer") #for each condition assign one colour cond_colours<-brewer.pal(length(unique(coldata$condition)), "Paired")[as.factor(coldata$condition)] names(cond_colours)<-coldata$condition #create pdf file which contains barplot of number of input counts pdf(file="counts_barplot.pdf", width=10, height=8) #adjust margin par(mar = c(7,5,4,2) + 0.1) #create barplot bp<-barplot(apply(mrcounts, 2, sum), las=2, col=cond_colours, ylim=c(0,(max(apply(mrcounts,2,sum)))*1.1)) #add number of reads to the plot text(bp, apply(mrcounts,2,sum), labels=apply(mrcounts, 2, sum), cex=1, pos=3) # https://stackoverflow.com/questions/27466035/adding-values-to-barplot-of-table-in-r dev.off() #################################################################################################### #################################################################################################### # DESeq2 part # Make the count object, normalise, dispersion and testing library("DESeq2") # ~0+condition means we are not using any intercept in calculation, when we have dependent before and after treatment patients intercept is on place; when # we compare independent groups "~0+" should be on place and condition means we are looking for difference between conditions dds<-DESeqDataSetFromMatrix(countData = mrcounts, colData = coldata, design = ~0+condition) #estimate sizefactors and dispersion dds<-DESeq(dds) cds<-dds #################################################################################################### #plot dispersion estimation to the pdf file pdf(file="DESeq2_disperison_plot.pdf") plotDispEsts(cds, main="Dispersion Plot") dev.off() # initiate variables containing raw counts, normalized counts (divide the counts by the size factors or normalization factors done by DESeq function) and log normalized counts rawcounts<-counts(cds, normalized=FALSE) # Save raw counts normcounts<-counts(cds, normalized=TRUE) # Save normalized counts log2counts<-log2(normcounts+1) # Save log2 of normalized counts # Normalization check - print pdf with raw counts and normalized counts pdf(file="pre_post_norm_counts.pdf") par(mfrow=c(3,1)) barplot(colSums(rawcounts), col=cond_colours, las=2,cex.names=1.3,main="Pre Normalised Counts", ylim=c(0,(max(apply(rawcounts,2,sum)))*1.1)) plot(1, type="n", axes=F, xlab="", ylab="") barplot(colSums(normcounts), col=cond_colours, las=2, cex.names=1.3, main="Post Normalised Counts", ylim=c(0,(max(apply(normcounts,2,sum)))*1.1)) dev.off() # Heatmaps -- shows us the counts correlations between samples library("gplots") pdf(file="heatmaps_samples.pdf") heatmap.2(cor(log2counts), trace="none", col=hmcol, main="Sample to Sample Correlation (Log2)", RowSideColors=cond_colours, margins=c(9,7)) heatmap.2(cor(rawcounts), trace="none", col=hmcol, main="Sample to Sample Correlation (Raw Counts)", RowSideColors=cond_colours, margins=c(9,7)) dev.off() #PCA analysis done on normalized counts pca<-princomp(normcounts) pdf(file="sample_to_sample_PCA.pdf") # First two components par(mfrow=c(1,1), xpd=NA) plot(pca$loadings, col=cond_colours, pch=19, cex=2, main="Sample to Sample PCA (normalized)") text(pca$loadings, as.vector(colnames(mrcounts)), pos=3) # Three components par(mfrow=c(1,3), oma=c(2,0,0,0), xpd=NA) # http://stackoverflow.com/questions/12402319/centring-legend-below-two-plots-in-r plot(pca$loadings[,c(1,2)], col=cond_colours, pch=19, cex=2, main="Sample to Sample PCA (normalized)", ylab="PC2", xlab="PC1") text(pca$loadings[,c(1,2)], as.vector(colnames(mrcounts)), pos=3) plot(pca$loadings[,c(1,3)], col=cond_colours, pch=19, cex=2, main="Sample to Sample PCA (normalized)", ylab="PC3", xlab="PC1") text(pca$loadings[,c(1,3)], as.vector(colnames(mrcounts)), pos=3) plot(pca$loadings[,c(2,3)], col=cond_colours, pch=19, cex=2, main="Sample to Sample PCA (normalized)", ylab="PC3", xlab="PC2") text(pca$loadings[,c(2,3)], as.vector(colnames(mrcounts)), pos=3) dev.off() pdf(file="contributions_PCA.pdf") plot(pca, type = "l", main="Principal Component Contributions") dev.off() #################################################################################################### # DESeq2 results #summarize the results from DESeq2 object, based on condition we choosed res<-results(dds, contrast=c("condition", condsToCompare[2], condsToCompare[1])) # Extract contrasts of interest #order the results according adjusted p-value and p-value res<-res[order(res$padj, res$pvalue),] # Extract significant results sig<-rownames(res[(abs(res$log2FoldChange) >= LFC_THRESHOLD) & (res$padj < P_THRESHOLD) & !is.na(res$padj),]) res2<-as.data.frame(res) # Add normalized counts to the data frame res2<-merge(res2, counts(dds, normalized=TRUE), by="row.names") rownames(res2)<-res2$Row.names #remove the artifact after merging with new columns res2$Row.names<-NULL # Add raw counts to the data frame res2<-merge(res2, counts(dds, normalized=FALSE), by="row.names") rownames(res2)<-res2$Row.names #remove the artifact after merging with new columns res2$Row.names<-NULL #order the dataframe according adjusted p-value and p-value, etc. res2<-res2[with(res2, order(padj, pvalue, abs(log2FoldChange), -baseMean)), ] #renaming of columns after merging colnames(res2)<-gsub(pattern = ".x", replacement = "_normCounts", fixed = T, colnames(res2)) colnames(res2)<-gsub(pattern = ".y", replacement = "_rawCounts", fixed = T, colnames(res2)) #write output table write.table(x = res2, file = "DESeq2.tsv", sep = "\t", col.names = NA) #################################################################################################### #plot volcano plot (foldchange vs adjusted p-value) pdf(file=paste("volcanoplot_", condsToCompare[1], "_vs_", condsToCompare[2],".pdf", sep="")) #plot foldchange vs adjusted p-value plot(res$log2FoldChange, -log(res$padj, 10), main=paste("Volcanoplot ",condsToCompare[2], " vs ", condsToCompare[1], " top ", TOP, " genes", sep=""), cex=0.4, pch=19,xlab = "log2FoldChange",ylab = "-log10 adjusted p-value") #add identifiers of top 20 differentialy expressed genes text(res[1:TOP,]$log2FoldChange, -log(res[1:TOP,]$padj,10), labels=rownames(res)[1:TOP], cex=0.3, pos=3) #add legend of conditions legend("bottomleft", condsToCompare[1], cex=0.5) legend("bottomright", condsToCompare[2], cex=0.5) #add visual lines of predefined cut-offs on foldchange and p-value abline(h=-log(P_THRESHOLD, 10), col="red", lty=2) abline(v=c(-LFC_THRESHOLD, LFC_THRESHOLD), col="darkblue", lty=2) dev.off() #plot MAplot (log2foldchange vs mean normlaized expression of gene) pdf(file=paste("MAplot_", condsToCompare[1], "_vs_", condsToCompare[2],".pdf", sep="")) #plot log2foldchange vs mean normlaized expression DESeq2::plotMA(res, alpha=P_THRESHOLD, main=paste("MA plot ",condsToCompare[2], " vs ", condsToCompare[1], " top ", TOP, " genes", sep=""), ylim=c(-max(abs(res$log2FoldChange), na.rm=T), max(abs(res$log2FoldChange), na.rm=T))) #add zero line abline(h=c(-LFC_THRESHOLD, LFC_THRESHOLD), col="darkblue", lty=2) #add identifiers of top 20 differentialy expressed genes text(res[1:TOP,]$baseMean, res[1:TOP,]$log2FoldChange, labels=rownames(res)[1:TOP], cex=0.4, pos=3) #add legend of conditions legend("bottomright", condsToCompare[1], cex=0.5) legend("topright", condsToCompare[2], cex=0.5) dev.off()