Annotate cell types

Step 1. Load library and your seurat object

library(SingleR)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'sp'
## The following object is masked from 'package:IRanges':
## 
##     %over%
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     Assays
## The following object is masked from 'package:GenomicRanges':
## 
##     intersect
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following object is masked from 'package:IRanges':
## 
##     intersect
## The following object is masked from 'package:S4Vectors':
## 
##     intersect
## The following object is masked from 'package:BiocGenerics':
## 
##     intersect
## The following objects are masked from 'package:base':
## 
##     intersect, t
## 
## Attaching package: 'Seurat'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     Assays
library(celldex)
## 
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
## 
##     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
##     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
##     MouseRNAseqData, NovershternHematopoieticData
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pheatmap)

#your seurat object from Firday
sobj <- readRDS("../bc_scRNAseq/output/bc_tutorial.rds") 

check your seurat object

sobj
## An object of class Seurat 
## 18054 features across 3498 samples within 1 assay 
## Active assay: RNA (18054 features, 1000 variable features)
##  3 layers present: counts, data, scale.data
##  2 dimensional reductions calculated: pca, umap

Step 2. Prepare inputs for running automatic cell type annotation

We need to extract the normalization data using GetAssayData()

sobj_count <- GetAssayData(sobj, layer = "data")
str(sobj_count)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:25868904] 0 1 6 7 8 9 10 13 14 16 ...
##   ..@ p       : int [1:3499] 0 8583 15839 25530 35501 44226 52914 62061 69292 79025 ...
##   ..@ Dim     : int [1:2] 18054 3498
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:18054] "SAMD11" "NOC2L" "KLHL17" "PLEKHN1" ...
##   .. ..$ : chr [1:3498] "AACAATGTGCTCCGAG-1" "AACACCATTCGCATAC-1" "AACACGTTGATACCGC-1" "AACACTCGTGAGCTTC-1" ...
##   ..@ x       : num [1:25868904] 0.727 1.664 0.538 3.008 0.305 ...
##   ..@ factors : list()

Get the Blueprint and Encode reference data from celldex package:

ref <- celldex::BlueprintEncodeData()
ref
## class: SummarizedExperiment 
## dim: 19859 259 
## metadata(0):
## assays(1): logcounts
## rownames(19859): TSPAN6 TNMD ... LINC00550 GIMAP1-GIMAP5
## rowData names(0):
## colnames(259): mature.neutrophil
##   CD14.positive..CD16.negative.classical.monocyte ...
##   epithelial.cell.of.umbilical.artery.1
##   dermis.lymphatic.vessel.endothelial.cell.1
## colData names(3): label.main label.fine label.ont

Step 3. Run SingleR to label cell type

pred <- SingleR(test=sobj_count, ref=ref, labels=ref$label.main)
pred %>% head()
## DataFrame with 6 rows and 4 columns
##                                            scores           labels delta.next
##                                          <matrix>      <character>  <numeric>
## AACAATGTGCTCCGAG-1 0.324793:0.418997:0.323649:... Epithelial cells 0.06219701
## AACACCATTCGCATAC-1 0.345131:0.400975:0.324331:... Epithelial cells 0.14348376
## AACACGTTGATACCGC-1 0.368779:0.458628:0.328615:...  Mesangial cells 0.12183469
## AACACTCGTGAGCTTC-1 0.376044:0.458448:0.340945:... Epithelial cells 0.00207031
## AACAGGAATTCTGTGA-1 0.403179:0.433582:0.298729:...  Mesangial cells 0.09215977
## AACAGGATGCTGGCAT-1 0.365389:0.400806:0.345930:... Epithelial cells 0.09739350
##                       pruned.labels
##                         <character>
## AACAATGTGCTCCGAG-1 Epithelial cells
## AACACCATTCGCATAC-1 Epithelial cells
## AACACGTTGATACCGC-1  Mesangial cells
## AACACTCGTGAGCTTC-1 Epithelial cells
## AACAGGAATTCTGTGA-1  Mesangial cells
## AACAGGATGCTGGCAT-1 Epithelial cells
unique(pred$labels)
## [1] "Epithelial cells" "Mesangial cells"  "Fibroblasts"      "Astrocytes"      
## [5] "Macrophages"      "Adipocytes"       "DC"

Next, we put the prediction results back to our seurat object for visualization

sobj$labels <- pred[match(rownames(sobj@meta.data), rownames(pred)), 'labels']
sobj@meta.data %>% head()
##                           orig.ident nCount_RNA nFeature_RNA percent.mt
## AACAATGTGCTCCGAG-1 Ovarian_Carcinoma      28045         8583   2.824033
## AACACCATTCGCATAC-1 Ovarian_Carcinoma      21851         7256   2.663494
## AACACGTTGATACCGC-1 Ovarian_Carcinoma      39421         9691   3.000939
## AACACTCGTGAGCTTC-1 Ovarian_Carcinoma      44116         9971   2.550095
## AACAGGAATTCTGTGA-1 Ovarian_Carcinoma      31401         8725   2.184644
## AACAGGATGCTGGCAT-1 Ovarian_Carcinoma      32380         8688   1.639901
##                    RNA_snn_res.0.5 seurat_clusters           labels
## AACAATGTGCTCCGAG-1               0               0 Epithelial cells
## AACACCATTCGCATAC-1               0               0 Epithelial cells
## AACACGTTGATACCGC-1               0               0  Mesangial cells
## AACACTCGTGAGCTTC-1               0               0 Epithelial cells
## AACAGGAATTCTGTGA-1               1               1  Mesangial cells
## AACAGGATGCTGGCAT-1               3               3 Epithelial cells

Step 4. Visualize the data

umap_pred <- DimPlot(
    sobj, reduction = 'umap', group.by = 'labels', label = TRUE)

umap_pred

Compare with your previous data

umap_sobj <- DimPlot(
    sobj, reduction = 'umap', group.by = 'seurat_clusters', label = TRUE)

umap_sobj | umap_pred

Plot the SingleR assignment score across all cell-label combination

plotScoreHeatmap(pred)

Step 5. Marker-based method: CellMarker v2

Load the human markers from CellMarker 2.0, file includes cell markers of different cell types from different tissue in human

human_marker <- read.csv("../bc_scRNAseq/data/CellMakerv2_Human.csv")
str(human_marker)
## 'data.frame':    60877 obs. of  20 variables:
##  $ species          : chr  "Human" "Human" "Human" "Human" ...
##  $ tissue_class     : chr  "Abdomen" "Abdomen" "Abdomen" "Abdomen" ...
##  $ tissue_type      : chr  "Abdomen" "Abdomen" "Abdomen" "Abdomen" ...
##  $ uberonongology_id: chr  "UBERON_0000916" "UBERON_0000916" "UBERON_0000916" "UBERON_0000916" ...
##  $ cancer_type      : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ cell_type        : chr  "Normal cell" "Normal cell" "Normal cell" "Normal cell" ...
##  $ cell_name        : chr  "Macrophage" "Macrophage" "Macrophage" "Macrophage" ...
##  $ cellontology_id  : chr  "CL_0000235" "CL_0000235" "CL_0000235" "CL_0000235" ...
##  $ marker           : chr  "MERTK" "CD16" "CD206" "CRIg" ...
##  $ Symbol           : chr  "MERTK" "FCGR3A" "MRC1" "VSIG4" ...
##  $ GeneID           : int  10461 2215 4360 11326 9332 2167 NA 7350 930 2210 ...
##  $ Genetype         : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
##  $ Genename         : chr  "MER proto-oncogene, tyrosine kinase" "Fc fragment of IgG receptor IIIb" "mannose receptor C-type 1" "V-set and immunoglobulin domain containing 4" ...
##  $ UNIPROTID        : chr  "Q12866" "O75015" "P22897" "Q9Y279" ...
##  $ technology_seq   : chr  "None" "None" "None" "None" ...
##  $ marker_source    : chr  "Experiment" "Experiment" "Experiment" "Experiment" ...
##  $ PMID             : int  31982413 31982413 31982413 31982413 31982413 32355218 32355218 32355218 NA NA ...
##  $ Title            : chr  "Peritoneal Level of CD206 Associates With Mortality and an Inflammatory Macrophage Phenotype in Patients With D"| __truncated__ "Peritoneal Level of CD206 Associates With Mortality and an Inflammatory Macrophage Phenotype in Patients With D"| __truncated__ "Peritoneal Level of CD206 Associates With Mortality and an Inflammatory Macrophage Phenotype in Patients With D"| __truncated__ "Peritoneal Level of CD206 Associates With Mortality and an Inflammatory Macrophage Phenotype in Patients With D"| __truncated__ ...
##  $ journal          : chr  "Gastroenterology" "Gastroenterology" "Gastroenterology" "Gastroenterology" ...
##  $ year             : int  2020 2020 2020 2020 2020 2020 2020 2020 NA NA ...

Select marker for ovary from the tissue class column

ovary_marker <- human_marker %>% 
    filter(species=="Human" & tissue_class=="Ovary") %>%
    select(c(3,5,6,7,8,9,16))

ovary_marker %>% head()
##     tissue_type cancer_type   cell_type        cell_name cellontology_id
## 1         Ovary      Normal Normal cell      Thecal cell            <NA>
## 2         Ovary      Normal Normal cell  Epithelial cell      CL_0000066
## 3 Corpus luteum      Normal Normal cell Endothelial cell      CL_0000115
## 4         Ovary      Normal Normal cell      Cancer cell      CL_0001064
## 5         Ovary      Normal Normal cell        Germ cell      CL_0000586
## 6         Ovary      Normal Normal cell        Germ cell      CL_0000586
##             marker marker_source
## 1 Aminopeptidase-N    Experiment
## 2            FOLR1    Experiment
## 3             CD34    Experiment
## 4              FAS    Experiment
## 5            MATER    Experiment
## 6            BMP15    Experiment

Get markers table from your seurat object

sobj.markers <- FindAllMarkers(sobj, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
sobj.markers %>% head()
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj cluster    gene
## BHLHE41 6.203160e-239  0.9382200 0.999 0.817 1.119919e-234       0 BHLHE41
## SLC35E1 1.098709e-190  0.8376728 0.999 0.817 1.983610e-186       0 SLC35E1
## FNTA    1.824515e-187  0.9101818 0.991 0.838 3.293980e-183       0    FNTA
## PAX2    3.317360e-186  1.5397125 0.914 0.394 5.989162e-182       0    PAX2
## PERP    5.841617e-183  0.9194699 0.997 0.796 1.054646e-178       0    PERP
## TUBB2B  7.196608e-179  1.2094813 0.968 0.624 1.299276e-174       0  TUBB2B

Combine CellMarker with our markers sobj.markers

merged_cellmarker <- sobj.markers %>% 
    filter(p_val_adj<0.05 & avg_log2FC>1 & pct.1 > 0.8 & pct.2 < 0.5) %>%
    left_join(ovary_marker[,c(3,4,6)], join_by(gene==marker), relationship = "many-to-many")

merged_cellmarker %>% head()
##           p_val avg_log2FC pct.1 pct.2     p_val_adj cluster   gene cell_type
## 1 3.317360e-186   1.539712 0.914 0.394 5.989162e-182       0   PAX2      <NA>
## 2 1.009558e-133   1.212609 0.841 0.379 1.822657e-129       0  TNNT1      <NA>
## 3 1.716705e-129   1.217553 0.869 0.423 3.099339e-125       0  MUC15      <NA>
## 4 6.979399e-129   1.098619 0.915 0.497 1.260061e-124       0  DPEP3      <NA>
## 5 3.019701e-125   1.327811 0.844 0.420 5.451768e-121       0    AK4      <NA>
## 6 3.639086e-118   1.319720 0.815 0.401 6.570005e-114       0 LRRTM1      <NA>
##   cell_name
## 1      <NA>
## 2      <NA>
## 3      <NA>
## 4      <NA>
## 5      <NA>
## 6      <NA>

Let’s try to see if there are fibroblast and astrocytes

merged_cellmarker %>%
    filter(cluster==2 & cell_type=="Cancer cell")
##           p_val avg_log2FC pct.1 pct.2     p_val_adj cluster   gene   cell_type
## 1 2.453948e-299   4.336628 0.976 0.399 4.430358e-295       2 PDGFRA Cancer cell
## 2 5.237721e-244   3.946544 0.936 0.483 9.456182e-240       2   CAV1 Cancer cell
## 3 6.911268e-239   3.360288 0.924 0.366 1.247760e-234       2  ACTA2 Cancer cell
## 4 6.911268e-239   3.360288 0.924 0.366 1.247760e-234       2  ACTA2 Cancer cell
##          cell_name
## 1       Fibroblast
## 2       Fibroblast
## 3 Mesenchymal cell
## 4       Fibroblast
    #select(c(7,8,9))

From the results, we found PDGFRA, CAV1, and ACTA2 pointing to Fibroblast in cancer cells

Check again using FeaturePlot()

genes <- c("PDGFRA", "CAV1", "ACTA2")
FeaturePlot(sobj, features = genes)

Step 6. Annotate the cells

rename cluster 2 to Fibroblasts using RenameIdents()

sobj_new <- RenameIdents(sobj, '2' = 'Fibroblasts')
DimPlot(sobj_new, reduction = 'umap', label = TRUE)

Step 7. Differentially expressed genes (Bonus)

Can we distinguish cluster 7?

merged_cellmarker %>% 
    filter(cluster==7 & cell_type=="Cancer cell") %>%
    dplyr::select(c(7,8,9)) %>%
    str()
## 'data.frame':    2 obs. of  3 variables:
##  $ gene     : chr  "ACTA2" "ACTA2"
##  $ cell_type: chr  "Cancer cell" "Cancer cell"
##  $ cell_name: chr  "Mesenchymal cell" "Fibroblast"
BiocManager::install("clusterProfiler")
## Bioconductor version 3.19 (BiocManager 1.30.23), R 4.4.0 (2024-04-24)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'clusterProfiler'
## Old packages: 'DBI', 'dqrng', 'highr', 'knitr', 'quantreg', 'RcppArmadillo',
##   'SparseM'
BiocManager::install("org.Hs.eg.db")
## Bioconductor version 3.19 (BiocManager 1.30.23), R 4.4.0 (2024-04-24)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'org.Hs.eg.db'
## Old packages: 'DBI', 'dqrng', 'highr', 'knitr', 'quantreg', 'RcppArmadillo',
##   'SparseM'
BiocManager::install("DOSE")
## Bioconductor version 3.19 (BiocManager 1.30.23), R 4.4.0 (2024-04-24)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'DOSE'
## Old packages: 'DBI', 'dqrng', 'highr', 'knitr', 'quantreg', 'RcppArmadillo',
##   'SparseM'
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
library(clusterProfiler)
## 
## clusterProfiler v4.12.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library(DOSE)
## DOSE v3.30.1  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
marker_7_2 <- FindMarkers(sobj, ident.1 = 7, ident.2 = 2)
marker_7_2_sig <- marker_7_2 %>% 
    filter(abs(avg_log2FC)>=1 & p_val_adj<0.05 & pct.2<0.4)
str(marker_7_2_sig)
## 'data.frame':    728 obs. of  5 variables:
##  $ p_val     : num  2.30e-91 2.52e-64 4.13e-56 5.60e-47 1.10e-45 ...
##  $ avg_log2FC: num  5.44 3.73 3.64 2.95 2.91 ...
##  $ pct.1     : num  0.918 0.762 0.714 0.748 0.755 0.673 0.844 0.537 0.789 0.592 ...
##  $ pct.2     : num  0.114 0.102 0.112 0.163 0.217 0.122 0.378 0.07 0.283 0.1 ...
##  $ p_val_adj : num  4.15e-87 4.54e-60 7.46e-52 1.01e-42 1.99e-41 ...
log2 <- marker_7_2_sig$avg_log2FC
names(log2) <- rownames(marker_7_2_sig)

log2_sort <- sort(log2, decreasing = TRUE)
log2_sort[1:5]
##      BLK     COMP    IGLC1    IGHA1    AGMAT 
## 5.505561 5.440098 4.727742 4.710259 4.306363

Run gene set enrichment analysis:

gse <- gseGO(geneList = log2_sort,
             ont="ALL",
             keyType="SYMBOL",
             nPerm=10000,
             OrgDb = "org.Hs.eg.db")
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## leading edge analysis...
## done...

Visualize the results

dotplot(gse)

Put the identity back

sobj_new <- RenameIdents(sobj, 
                         '7' = 'Mesenchymal Cells',
                         '2' = 'Fibroblasts')
DimPlot(sobj_new, reduction = 'umap', label = TRUE)