Skip to content

Latest commit

 

History

History
228 lines (198 loc) · 10.9 KB

2. DEG Analysis SOX15.md

File metadata and controls

228 lines (198 loc) · 10.9 KB

2. Project in Applied Computational Multi-Omics

MCMSc in Computational Biology and Bioinformatics, June 2024
Beatriz Infante (69348), Inês Pacheco (68826) and Maria Inês Gomes (68828)

Exploratory Analysis and Differential Expression Analysis

  1. Install packages from R and Bioconductor repositories. Set the R working directory to the folder containing Kallisto and load the file containing the EnsemblIDs and gene names
library(openxlsx)
library(gplots)
library(RColorBrewer)
library(tximport)
library(edgeR)
library(PCAtools)
library(rhdf5)
library(limma)
setwd("your/path/to/folder")
load('gencode.v38_geneInfo.RData')
head(geneInfo)
  1. Define filenames to import and import all data using tximport function
folders <- dir(pattern='_rep')
fileNames <- file.path('.', folders, 'abundance.tsv')
names(fileNames) <- folders
readCounts_SOX5 <- tximport(fileNames, type = 'kallisto', txOut = F, tx2gene = geneInfo[,1:2], ignoreTxVersion = T, countsFromAbundance = "lengthScaledTPM")
  1. Explore the object created with tximport
names(readCounts_SOX5)
head(readCounts_SOX5$abundance) # matrix containing TPMs, lines are genes, each column is a sample, and then read counts
head(readCounts_SOX5$counts) # matrix containing read counts
head(readCounts_SOX5$length) # gene length
head(readCounts_SOX5$countsFromAbundance) # indicates the type of normalization applied to get the values in "abundance"
  1. To facilitate interpretation of downstream results, replace EnsemblIDs with Official gene symbol (i.s gene name)
geneNames <- geneInfo[match(rownames(readCounts_SOX5$abundance), geneInfo$Ensembl_GeneID),"GeneSymbol"]
# replace the Ensembl ID by gene names
rownames(readCounts_SOX5$abundance) <- rownames(readCounts_SOX5$counts) <- geneNames
# replacing in both tables
head(readCounts_SOX5$abundance)
head(readCounts_SOX5$counts)

save(readCounts_SOX5, file="readCounts_SOX5.RData") # Save readCounts in an RData file for downstream analyses

Explore transcriptome profiles

# Prepare gene expression matrix for PCA analysis: 
# Step 1: get log TPMs (we add 1 TPM to each gene to avoid infinite values after log)
logTPMs <- log2(readCounts_SOX5$abundance + 1) #kallisto produces the TPMs (abundance)

boxplot(logTPMs)
dim(logTPMs) #60 593, probably different gene names, different Ensembl IDs but same gene names

# Prepare gene expression matrix for PCA analysis: 
# Step 2: remove duplicated genes
uniqueGenes <- unique(rownames(logTPMs)) #what are the unique names of the genes 
head(uniqueGenes)
logTPMs <- logTPMs[uniqueGenes,] 

# Prepare metadata with sample type
sampleTypes <- gsub("_rep[123]", "", colnames(logTPMs)) #sample types are just the first letters
# replacing _rep[123] by nothing, we just want the first letters
metaData <- data.frame(sampleTypes); rownames(metaData) <- colnames(logTPMs)
metaData 

dim(logTPMs) #59 374

Data visualization: Plots Optional: Plotting read counts and log-transformed read counts side by side

par(mfrow=c(1, 2)) # Set up the plotting area to display side-by-side plots
boxplot(readCounts_SOX5$counts, main="TPMs before Log Transformation", xlab="Samples", ylab="TPMs")
boxplot(logTPMs, main="Log-transformed TPMs", xlab="Samples", ylab="Log2(TPM + 1)")
  • PCA
par(mfrow=c(1, 1))
pca.res <- pca(logTPMs, metadata = metaData)
screeplot(pca.res) # Plot variance explained by each component

Now let's plot the selected components/eigenvectors

biplot(pca.res)
biplot(pca.res, colby = "sampleTypes", hline = 0, vline = 0, legendPosition = 'top') # Biplot with colors by sample type
biplot(pca.res, lab="", colby="sampleTypes", hline = 0, vline = 0,legendPosition = 'top') # Biplot without sample names

Plot the component loadings and label genes most responsible for variation.

Note: Loadings are interpreted as the coefficients of the linear combination of the initial variables from which the principal components are constructed.

From a statistical point of view, the loadings are equal to the coordinates of the variables divided by the square root of the eigenvalue associated with the component.

PC_genes <- pca.res$loadings
PC1_genes <- PC_genes[order(PC_genes$PC1, decreasing=T),]
head(PC1_genes)
tail(PC1_genes)
plotloadings(pca.res, components = c("PC1", "PC2"),rangeRetain =0.1) #retaining 1% of the loadings per PC

Produce barplot to confirm expression levels of genes associated with Principal Component 1.

plotCol <- rep(c("lightblue", "lightpink"), each=2)
# Set up a wider plotting area to fit the x-axis labels
par(mar=c(7, 4, 4, 2) + 0.1) # Increase bottom margin to fit labels

barplot(logTPMs["PPIAP22",], col = plotCol, las=2, main = "PPIAP22", ylab = "Expression levels (logTPMs)") #Gene positively correlated with PC1
legend("topleft", fill = unique(plotCol), legend = c("ctrl", "SOX15"), bty = "n")

barplot(logTPMs["RNASE7",], col = plotCol, las=2, main="RNASE7", ylab="Expression levels (logTPMs)") # Gene negatively correlated with PC1
legend("topright", fill=unique(plotCol), legend= c("ctrl", "SOX15"), bty="n")

Save some plot in a pdf file

pdf("PCA_plots.pdf") # open pdf file
biplot(pca.res, colby="sampleTypes", hline = 0, vline = 0,legendPosition = 'top') # Biplot with colors by sample type
plotloadings(pca.res, components = c("PC1", "PC2", "PC3"), rangeRetain =0.1) 
dev.off() # closes pdf file

Differential expression analysis using edge R package

First create DGEList data class (specific for edgeR package). Then, filter out lowly expressed genes and normalize for library sizes, Finally, define the design and contrast matrix based on the experimental design, meaning define which comparison to be made. See more detailed information here: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7873980/

y <- DGEList(counts = readCounts_SOX5$counts, group= sampleTypes)

keep <- filterByExpr(y, group=sampleTypes)
y <- y[keep, ,keep.lib.sizes=FALSE]
y <- calcNormFactors(y)
y$samples

design_matrix <- model.matrix(~0+sampleTypes)
colnames(design_matrix) <- gsub("sampleTypes", "", colnames(design_matrix))
rownames(design_matrix) <- colnames(logTPMs)
design_matrix
contrast_matrix <- makeContrasts(SOX15-ctrl, levels = design_matrix)
contrast_matrix

Estimate the dispersion (Biological coefficient of variation) and Fit model edgeR uses the negative binomial (NB) distribution to model the read counts for each gene in each sample. The dispersion parameter of the NB distribution accounts for variability between biological replicates (McCarthy, Chen, and Smyth 2012). edgeR estimates an empirical Bayes moderated dispersion for each individual gene. It also estimates a common dispersion, which is a global dispersion estimate averaged over all genes, and a trended dispersion where the dispersion of a gene is predicted from its abundance. The vertical axis of the plotBCV plot shows square-root dispersion, also known as biological coefficient of variation (BCV) (McCarthy, Chen, and Smyth 2012). For RNA-seq studies, the NB dispersions tend to be higher for genes with very low counts. The dispersion trend tends to decrease smoothly with abundance and to asymptotic to a constant value for genes with larger counts. From our past experience, the asymptotic value for the BCV tends to be in range from 0.05 to 0.2 for genetically identical mice or cell lines, whereas somewhat larger values (>0.3) are observed for human subjects.

y <- estimateDisp(y, design_matrix)
plotBCV(y)
fit <- glmQLFit(y,design_matrix) # Fit a quasi-likelihood negative binomial generalized log-linear model to count data.

Test for differential expression between the experimental groups using quasi-likelihood F-test

qlf <- glmQLFTest(fit, contrast =  contrast_matrix)

Get differentially expressed genes (DEGs)

summary(decideTests(qlf,p.value = 0.05,adjust.method = "fdr"))
DEGs <- topTags(qlf, nrow(qlf$table), p.value=0.05, adjust.method = "fdr")$table
upReg <- rownames(DEGs)[which(DEGs$logFC > 0)]
downReg <- rownames(DEGs)[which(DEGs$logFC < 0)]

Calculate ranks, remove double quotes from gene names, create a data frame with gene names and ranks and write the data frame to a tab-delimited text file with ".rnk" extension

ranks <- sign((qlf$table)$logFC) * -log10((qlf$table)$PValue)

gene_names <- gsub('"', '', rownames(qlf$table))

gene_rank_df <- data.frame(geneName = rownames(qlf$table), Rank = ranks)

write.table(gene_rank_df, file = "DEGs.rnk", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

Extract gene names and write the gene names to a text file

DEGsgene_names <- rownames(DEGs)

writeLines(DEGsgene_names, "DEGs_gene_names.txt")
writeLines(upReg, "DEGs_upgene_names.txt")
writeLines(downReg, "DEGs_downgene_names.txt")

writeLines(rownames(y$counts), "all_gene_names.txt")
  • Volcano Plot
allGenes <- topTags(qlf, n = nrow(qlf$table), p.value = 1)$table
plotData <- cbind(allGenes$logFC, -log10(allGenes$FDR)); rownames(plotData) <- rownames(allGenes)
plot(plotData, pch=20,col="gray",xlab="Biological Variation (log2 Fold-Change)", ylab="Statistical Significance (-log10 P-Value)")
abline(h=-log10(0.05), v=c(-1,1),lty=2)
points(plotData[upReg,], col="tomato", pch=20)
points(plotData[downReg,], col="steelblue", pch=20)
text(plotData[upReg[1:5],], labels=upReg[1:5],col="tomato", pos=sample(c(1:3), size=10, replace=T), cex=0.8)
text(plotData[downReg[1:5],], labels=downReg[1:5],col="steelblue", pos=sample(c(1:2), size=10, replace=T), cex=0.8)
  • MA plot
plotData <- cbind(allGenes$logCPM, allGenes$logFC); rownames(plotData) <- rownames(allGenes)
plot(plotData, pch=20,col="gray",xlab="Mean Expression Levels (log CPMs)", ylab="Biological Variation (log2 Fold-Change)")
abline(h=c(-1,1),lty=2)
points(plotData[upReg,], col="tomato", pch=20)
points(plotData[downReg,], col="steelblue", pch=20)
text(plotData[upReg[1:5],], labels=upReg[1:5],col="tomato", pos=sample(c(1:3), size=10, replace=T), cex=0.8)
text(plotData[downReg[1:5],], labels=downReg[1:5],col="steelblue", pos=sample(c(1:2), size=10, replace=T), cex=0.8)
  • Heatmap with Top DEGs
# Define color palette for heatmap
plotCol_exp <- brewer.pal(9, "Greens")
# Increase margins to fit sample names
par(mar = c(10, 4, 4, 2) + 0.1)
# Create heatmap with adjustments for better sample name visibility
heatmap.2(logTPMs[c(upReg[1:10], downReg[1:10]),],
          scale = "row", 
          trace = "none", 
          density.info = "none", 
          ColSideColors = plotCol, 
          col = plotCol_exp,
          cexRow = 0.7, # Adjust the size of the row labels
          cexCol = 0.7, # Adjust the size of the column labels
          srtCol = 45,  # Rotate the column labels for better visibility
          margins = c(10, 10)) # Adjust the margins