annotation.Rmd

---
title: "Single cell RNA-seq of PBMCs from MITD - Cell type annotation"
output:
  pdf_document: default
  html_document:
    df_print: paged
    toc: TRUE
---

# step 6: Cell type annotation
```{r}
# Run Azimuth on the object against the "pbmcref" reference; the returned
# object replaces `pbmc`.
# NOTE(review): later chunks read `predicted.celltype.l2`, presumably added
# by this call - confirm.
pbmc <- RunAzimuth(
  pbmc,
  reference = "pbmcref"
)
```

```{r}
# Embedding coloured by `predicted.celltype.l2`, with the group names drawn on
# the plot instead of in a legend
DimPlot(pbmc, group.by = "predicted.celltype.l2", label = TRUE, label.size = 3) +
  NoLegend()

# Embedding coloured by cluster assignment
DimPlot(pbmc, group.by = "seurat_clusters")

# Embedding coloured by donor and by group, without labels or legend.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
DimPlot(pbmc, group.by = "Donorid", label = FALSE, label.size = 3) + NoLegend()
DimPlot(pbmc, group.by = "Group", label = FALSE, label.size = 3) + NoLegend()

```

```{r}
# Normalise the data, then make `predicted.celltype.l2` the active identity so
# that `idents` in the violin plot can pick cell types by name.
pbmc <- NormalizeData(pbmc)
Idents(pbmc) <- "predicted.celltype.l2"

# Expression of single genes on the embedding
p1 <- FeaturePlot(pbmc, features = "CCR7")
p2 <- FeaturePlot(pbmc, features = "FCGR3A")

# AXL expression restricted to the ASDC, pDC, cDC1 and cDC2 identities
dc.idents <- c("ASDC", "pDC", "cDC1", "cDC2")
p3 <- VlnPlot(
  pbmc,
  features = "AXL",
  group.by = "predicted.celltype.l2",
  idents = dc.idents
)

# Per-cell value of the feature "predictionscorecelltypel2_Treg"
p4 <- FeaturePlot(pbmc, features = "predictionscorecelltypel2_Treg")

# Arrange the four panels in a two-column layout
p1 + p2 + p3 + p4 + plot_layout(ncol = 2)

```

```{r}

# QC coloured by clusters
#
# One point per cluster: mean nFeature_RNA per cell against mean nCount_RNA per
# cell, coloured by the mean mitochondrial percentage of the cluster.

# Number of cells in each cluster at resolution 0.9
table(pbmc$RNA_snn_res.0.9)

# Overall nFeature: number of genes with a non-zero total count
sum(rowSums(pbmc[["RNA"]]@counts) != 0)

# Per-cell metadata with the tSNE coordinates of each cell alongside
cell.df <- cbind(
  as.data.frame(pbmc@meta.data),
  as.data.frame(pbmc@reductions$tsne@cell.embeddings)
)

# All per-cluster statistics come from a single grouped summary keyed on
# `RNA_snn_res.0.9`, the clustering tabulated above. Computing some statistics
# per `RNA_snn_res.0.9` and others per `seurat_clusters` and then joining them
# would silently mismatch rows whenever the two columns disagree. The grouping
# column is exposed as `seurat_clusters` so the result keeps the column names
# used for plotting.
qc_df <- cell.df %>%
  dplyr::group_by(seurat_clusters = RNA_snn_res.0.9) %>%
  dplyr::summarise(
    # mean per cell within the cluster
    nFeature_RNA = mean(nFeature_RNA),
    nCount_RNA = mean(nCount_RNA),
    # mean mitochondrial percentage, stored as `mean_percent.mt`
    dplyr::across(percent.mt, ~ mean(.), .names = "mean_{.col}"),
    # median tSNE coordinates of the cluster
    dplyr::across(contains("tSNE"), ~ median(.))
  ) %>%
  as.data.frame()

ggplot(
  qc_df,
  aes(
    x = nFeature_RNA,
    y = nCount_RNA,
    size = nCount_RNA,
    colour = mean_percent.mt
  )
) +
  geom_point(alpha = 0.7) +
  ggrepel::geom_text_repel(aes(label = seurat_clusters), size = 3) +
  theme_bw()
```

```{r}
# QC coloured by predicted cell type
#
# One point per `predicted.celltype.l2` value: mean nFeature_RNA per cell
# against mean nCount_RNA per cell, coloured by the mean mitochondrial
# percentage of the cell type.

# Overall nFeature: number of genes with a non-zero total count
sum(rowSums(pbmc[["RNA"]]@counts) != 0)

# Per-cell metadata with the tSNE coordinates of each cell alongside
cell.df.ct <- cbind(
  as.data.frame(pbmc@meta.data),
  as.data.frame(pbmc@reductions$tsne@cell.embeddings)
)

# All per-cell-type statistics come from a single grouped summary, so there is
# no separate per-type loop and no join whose key has to be guessed.
qc_df.ct <- cell.df.ct %>%
  dplyr::group_by(predicted.celltype.l2) %>%
  dplyr::summarise(
    # mean per cell within the cell type
    nFeature_RNA = mean(nFeature_RNA),
    nCount_RNA = mean(nCount_RNA),
    # mean mitochondrial percentage, stored as `mean_percent.mt`
    dplyr::across(percent.mt, ~ mean(.), .names = "mean_{.col}"),
    # median tSNE coordinates of the cell type
    dplyr::across(contains("tSNE"), ~ median(.))
  ) %>%
  as.data.frame()

ggplot(
  qc_df.ct,
  aes(
    x = nFeature_RNA,
    y = nCount_RNA,
    size = nCount_RNA,
    colour = mean_percent.mt
  )
) +
  geom_point(alpha = 0.7) +
  # The ggrepel argument is `max.overlaps` (plural); the misspelt
  # `max.overlap` is not a recognised parameter and leaves the default in force.
  ggrepel::geom_text_repel(
    aes(label = predicted.celltype.l2),
    size = 2,
    max.overlaps = 100
  ) +
  theme_bw()


```

```{r}
# Write the `pbmc` object to disk as an RDS file
saveRDS(
  object = pbmc,
  file = "pbmc_MITD1_l2_nF_500_6000_mt_20_res_0.9.rds"
)

```