Single Cell Analysis

Step 1 load data and library

Data is human ovarian carcinoma tissue (10x droplet scRNAseq)

library(Seurat)
library(dplyr)
# need install hdf5r, bioconductor
data <- Read10X_h5("data/human_ovarian_carcinoma_filter_matrix.h5")
sobj <- CreateSeuratObject(counts = data, 
                           project = "Ovarian_Carcinoma",
                           min.cells = 3,
                           min.features = 200)
sobj
## An object of class Seurat 
## 18054 features across 4670 samples within 1 assay 
## Active assay: RNA (18054 features, 0 variable features)
##  1 layer present: counts

Let us see the structure of the seurat object

sobj@assays
## $RNA
## Assay (v5) data with 18054 features for 4670 cells
## First 10 features:
##  SAMD11, NOC2L, KLHL17, PLEKHN1, PERM1, HES4, ISG15, AGRN, RNF223,
## C1orf159 
## Layers:
##  counts
# raw counts of cells
sobj@assays$RNA@layers$counts[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##                                    
##  [1,]  3  3  5 13 12  4 14  5  2  9
##  [2,] 12  5  6  6  9  5  8 11  7 15
##  [3,]  .  1  3  2  .  2  1  2  1  1
##  [4,]  .  2  .  2  2  .  1  .  .  2
##  [5,]  .  .  .  1  .  .  .  .  .  .
##  [6,]  .  1  2  2  4  4  4  3  1  .
##  [7,]  2 20  .  7  4 17 61  7  1 81
##  [8,] 54 27 40 45 33 34 32 24 15 39
##  [9,]  1  .  .  .  .  1  .  .  .  1
## [10,]  1  .  .  1  4  1  .  1  1  1
# see the barcode of the data
sobj@assays$RNA@cells@.Data %>% head()
##                    counts
## AACAATGTGCTCCGAG-1   TRUE
## AACACCATTCGCATAC-1   TRUE
## AACACGTTGATACCGC-1   TRUE
## AACACTCGTGAGCTTC-1   TRUE
## AACAGCCTCCTGACTA-1   TRUE
## AACAGGAATTCTGTGA-1   TRUE
# see the feature (gene name)
sobj@assays$RNA@features@.Data %>% head()
##         counts
## SAMD11    TRUE
## NOC2L     TRUE
## KLHL17    TRUE
## PLEKHN1   TRUE
## PERM1     TRUE
## HES4      TRUE
sobj@meta.data %>% head()
##                           orig.ident nCount_RNA nFeature_RNA
## AACAATGTGCTCCGAG-1 Ovarian_Carcinoma      28045         8583
## AACACCATTCGCATAC-1 Ovarian_Carcinoma      21851         7256
## AACACGTTGATACCGC-1 Ovarian_Carcinoma      39421         9691
## AACACTCGTGAGCTTC-1 Ovarian_Carcinoma      44116         9971
## AACAGCCTCCTGACTA-1 Ovarian_Carcinoma      46131        10109
## AACAGGAATTCTGTGA-1 Ovarian_Carcinoma      31401         8725

Step 2 Preprocess and quality control

Mitocindria DNA (mtDNA)

sobj[["percent.mt"]] <- PercentageFeatureSet(sobj, pattern = "^MT-")
sobj@meta.data %>% head()
##                           orig.ident nCount_RNA nFeature_RNA percent.mt
## AACAATGTGCTCCGAG-1 Ovarian_Carcinoma      28045         8583   2.824033
## AACACCATTCGCATAC-1 Ovarian_Carcinoma      21851         7256   2.663494
## AACACGTTGATACCGC-1 Ovarian_Carcinoma      39421         9691   3.000939
## AACACTCGTGAGCTTC-1 Ovarian_Carcinoma      44116         9971   2.550095
## AACAGCCTCCTGACTA-1 Ovarian_Carcinoma      46131        10109   2.299972
## AACAGGAATTCTGTGA-1 Ovarian_Carcinoma      31401         8725   2.184644
# Visualize the results
VlnPlot(sobj, 
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

# filter data
sobj <- subset(
    sobj, 
    subset = nFeature_RNA > 2500 & nFeature_RNA < 10000 & percent.mt < 4)

Step 3. Normalize the data

We use log normalization to normalize our data

sobj <- NormalizeData(sobj, normalization.method = "LogNormalize")
## Normalizing layer: counts
sobj@assays$RNA
## Assay (v5) data with 18054 features for 3498 cells
## First 10 features:
##  SAMD11, NOC2L, KLHL17, PLEKHN1, PERM1, HES4, ISG15, AGRN, RNF223,
## C1orf159 
## Layers:
##  counts, data
sobj@assays$RNA@layers$data[1:5, 1:3]
## 5 x 3 sparse Matrix of class "dgCMatrix"
##                                   
## [1,] 0.7274082 0.8641275 0.8190569
## [2,] 1.6637059 1.1903478 0.9250647
## [3,] .         0.3768221 0.5658907
## [4,] .         0.6498690 .        
## [5,] .         .         .
sobj[["RNA"]]$data[1:5, 1:3]
## 5 x 3 sparse Matrix of class "dgCMatrix"
##         AACAATGTGCTCCGAG-1 AACACCATTCGCATAC-1 AACACGTTGATACCGC-1
## SAMD11           0.7274082          0.8641275          0.8190569
## NOC2L            1.6637059          1.1903478          0.9250647
## KLHL17           .                  0.3768221          0.5658907
## PLEKHN1          .                  0.6498690          .        
## PERM1            .                  .                  .

Step 4. Feature extraction

sobj <- FindVariableFeatures(sobj, nfeatures = 1000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- VariableFeatures(sobj) %>% head(10)

# plot variable features with and without labels
plot <- VariableFeaturePlot(sobj)
plot <- LabelPoints(plot = plot, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot

Step 5. Dimensional Reduction

Here we first scale the data and then perform principal component analysis (PCA).

sobj <- ScaleData(sobj, vars.to.regress = "percent.mt")
## Regressing out percent.mt
## Centering and scaling data matrix
sobj
## An object of class Seurat 
## 18054 features across 3498 samples within 1 assay 
## Active assay: RNA (18054 features, 1000 variable features)
##  3 layers present: counts, data, scale.data
# visualize pca results in elbow plot
sobj <- RunPCA(sobj, features=VariableFeatures(object=sobj)) # can check sobj
## PC_ 1 
## Positive:  WFDC2, SLPI, MUC1, SLC34A2, EPCAM, BHLHE41, THEMIS2, SLC35E1, MUC16, S100A1 
##     MAL2, PERP, C19orf12, IFI27, CLDN4, AFDN, CCNE1, IFI6, CLDN3, EPPK1 
##     WNT7A, HIST1H1B, CRIP1, FAM32A, HIST1H1D, CXCL16, TPX2, POP4, DHCR24, KCTD1 
## Negative:  THBS1, IGFBP4, DCN, COL6A3, SPARC, CDH11, SERPINE2, COL6A2, LTBP2, HTRA1 
##     DPYSL3, COL1A1, TNXB, TAGLN, COL1A2, COL14A1, IGFBP7, CCN2, COL3A1, PDGFRA 
##     PRELP, COL6A1, FOS, COL16A1, CAVIN1, BGN, TIMP1, SMOC2, PTGIS, COL5A1 
## PC_ 2 
## Positive:  TUBB2B, PAX2, SIX3, MYRF, BCAT1, TNNT1, APOA1, IMPG2, DPEP3, GPR27 
##     ADAM28, CTCFL, SPRY2, KLK6, CCR8, STXBP6, PLAT, SOX4, EDNRB, FOXA2 
##     LRIG1, TENM4, LRRC4, NRP2, FGF3, MST1, ADGRL2, OTUD1, CITED2, REN 
## Negative:  CD74, OAS2, STAT1, IGHG1, IFI44L, IGKC, CXCL10, C1QC, TRIM22, CMPK2 
##     IFIT3, SAMD9L, PSMB9, RSAD2, C1QB, IFIT1, EPSTI1, IFI44, XAF1, MX1 
##     APOL6, CCL5, GBP1, IFIT2, DDX58, CTSS, C1QA, TXNIP, OAS1, IFI16 
## PC_ 3 
## Positive:  CADPS, APCDD1, IGKC, IGHG1, A2M, NKD2, VCAN, LGR5, HOPX, AQP1 
##     MZB1, JCHAIN, BCAT1, LRIG1, MME, IMPG2, NKD1, LXN, CD79A, CLEC14A 
##     GPR27, HIST1H1B, SAT1, VWF, RGS5, MGP, EGFL7, LUM, EPAS1, PECAM1 
## Negative:  VEGFA, S100A10, PFKP, HK2, RND3, DDIT4, GDF15, CIART, PFKFB4, P4HA1 
##     SLC2A1, NDRG1, SCD, ADM, SST, CRYAB, NDUFA4L2, TRIB3, AK4, LY6E 
##     PPFIA4, DTNA, NXPH4, ANKRD37, ALDOC, DDIT3, KDM3A, PXDC1, BHLHE40, PPP1R15A 
## PC_ 4 
## Positive:  KRT7, MMP7, RNF144B, EBF3, ELF3, POU3F3, ZSWIM4, RCOR2, HIST1H1E, EHF 
##     ATF5, FRK, WNT11, TACSTD2, ITGB8, LIX1, WFDC2, COL9A1, FAM107A, FCGBP 
##     LTB, IL32, KRT5, SOX9, HMGA2, CDKN2B, TMCC3, SDC4, ASCL2, RAB25 
## Negative:  IFIT1, IFIT3, SAMHD1, IFIT2, RSAD2, IFI44L, CMPK2, OAS2, BCAT1, CXCL10 
##     MT-ND3, LYPD1, RNASE1, SP110, ADAM28, DDX60L, MX1, XAF1, IFI44, CXCL11 
##     OAS1, IFI16, LRRC4, DDX58, PLSCR1, PAX2, SAMD9, REN, NPAS2, HERC5 
## PC_ 5 
## Positive:  LY6E, GMPR, SELENBP1, MX1, ISG15, IFI44L, ID4, IFI44, USP18, IRF7 
##     IFIT1, DTX3L, HELZ2, IFIT3, DDX58, HERC6, ESR1, DDX60, OAS1, PLSCR1 
##     PLAT, OAS3, IFI16, IFI35, APOA1, IFI27, HERC5, HSH2D, ANKRD65, MSLN 
## Negative:  GPNMB, ADM, LYZ, BHLHE40, RGS1, NDUFA4L2, DDIT4, MMP9, COMP, LAPTM5 
##     LSP1, VCAN, PFKFB3, LCP1, ITGAX, GDF15, CD68, CXCL8, FN1, PDK1 
##     HK2, STC1, STC2, HSPA6, CADPS, SRGN, TRIB3, CD74, SLC16A3, PLAUR
ElbowPlot(sobj)

Step 6. Clustering the cells

Here we used UMAP to clustering the cells

sobj <- FindNeighbors(sobj, dims=1:14) # SNN: Shared Nearest Neighbor
## Computing nearest neighbor graph
## Computing SNN
sobj <- FindClusters(sobj, resolution=0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3498
## Number of edges: 114582
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8470
## Number of communities: 8
## Elapsed time: 0 seconds
# UMAP
sobj <- RunUMAP(sobj, dims=1:14) # check sobj
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:21:56 UMAP embedding parameters a = 0.9922 b = 1.112
## 14:21:56 Read 3498 rows and found 14 numeric columns
## 14:21:56 Using Annoy for neighbor search, n_neighbors = 30
## 14:21:56 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:21:57 Writing NN index file to temp file /var/folders/px/nbrccl_90mq_2wgr50w9qt6m0000gn/T//Rtmpi0XPle/file3e4c24f3b6c6
## 14:21:57 Searching Annoy index using 1 thread, search_k = 3000
## 14:21:57 Annoy recall = 100%
## 14:21:58 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:21:58 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:21:58 Commencing optimization for 500 epochs, with 138510 positive edges
## 14:22:02 Optimization finished
DimPlot(sobj, reduction="umap")

Step 7. save your seurat object as a rds file

A review of your seurat object. Now you have finished the first part of standard workflow of scRNAseq then you can save your seurat object for future usage.

saveRDS(sobj, file="output/bc_tutorial.rds")

# read Seurat rds object
sobj <- readRDS("output/bc_tutorial.rds")

Step 8. Selecting cell markers

# install presto for efficiency
# devtools::install_github("immunogenomics/presto")

# find all markers of cluster 2
cluster2.markers <- FindMarkers(sobj, ident.1 = 2)
head(cluster2.markers, n=5)
##       p_val avg_log2FC pct.1 pct.2 p_val_adj
## CDH23     0   5.598518 0.900 0.152         0
## OGN       0   5.785283 0.797 0.075         0
## PTGIS     0   4.813088 0.952 0.237         0
## SMOC2     0   4.453608 0.952 0.247         0
## GATA4     0   5.302072 0.829 0.136         0
VlnPlot(sobj, features = "SST")

Find all markers

sobj.markers <- FindAllMarkers(sobj, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
sobj.markers %>% head()
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj cluster    gene
## BHLHE41 6.203160e-239  0.9382200 0.999 0.817 1.119919e-234       0 BHLHE41
## SLC35E1 1.098709e-190  0.8376728 0.999 0.817 1.983610e-186       0 SLC35E1
## FNTA    1.824515e-187  0.9101818 0.991 0.838 3.293980e-183       0    FNTA
## PAX2    3.317360e-186  1.5397125 0.914 0.394 5.989162e-182       0    PAX2
## PERP    5.841617e-183  0.9194699 0.997 0.796 1.054646e-178       0    PERP
## TUBB2B  7.196608e-179  1.2094813 0.968 0.624 1.299276e-174       0  TUBB2B
str(sobj.markers)
## 'data.frame':    36706 obs. of  7 variables:
##  $ p_val     : num  6.20e-239 1.10e-190 1.82e-187 3.32e-186 5.84e-183 ...
##  $ avg_log2FC: num  0.938 0.838 0.91 1.54 0.919 ...
##  $ pct.1     : num  0.999 0.999 0.991 0.914 0.997 0.968 0.994 0.997 0.926 0.996 ...
##  $ pct.2     : num  0.817 0.817 0.838 0.394 0.796 0.624 0.71 0.876 0.518 0.755 ...
##  $ p_val_adj : num  1.12e-234 1.98e-186 3.29e-183 5.99e-182 1.05e-178 ...
##  $ cluster   : Factor w/ 8 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ gene      : chr  "BHLHE41" "SLC35E1" "FNTA" "PAX2" ...
unique(sobj.markers$cluster)
## [1] 0 1 2 3 4 5 6 7
## Levels: 0 1 2 3 4 5 6 7
sobj.markers %>%
    group_by(cluster) %>%
    dplyr::filter(avg_log2FC > 1) %>%
    slice_head(n=3) -> top3

DoHeatmap(sobj, features = top3$gene) + NoLegend()
## Warning in DoHeatmap(sobj, features = top3$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: JUND,
## RBM38, GSTP1

FeaturePlot(sobj, features = c("PAX2", "CADPS", "CDH23", "KRT7",
                               "POP4", "MMP7", "COMP"))

cluster2.markers <- FindMarkers(sobj, ident.1 = 2,
                                ident.2 = 4, 
                                logfc.threshold=0.25,
                                only.pos=TRUE)
head(cluster2.markers, n=5)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## THBS1  1.839421e-122   6.402605     1 0.410 3.320891e-118
## COL6A3 2.370621e-121   4.600623     1 0.750 4.279919e-117
## DCN    2.373786e-121   4.857788     1 0.736 4.285633e-117
## LTBP2  2.721994e-121   4.806109     1 0.736 4.914288e-117
## COL6A2 2.903853e-121   4.222200     1 0.927 5.242616e-117

Color with cell markers

FeaturePlot(sobj, features = c("DHRS2", "TNXB", "OGN", "CDH23"))

cluster4.markers <- FindMarkers(sobj, ident.1 = 4,
                                logfc.threshold=1, 
                                only.pos=TRUE)
head(cluster4.markers, n=5)
##                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## C19orf12 4.287216e-134   1.136787 1.000 0.869 7.740140e-130
## CCNE1    4.254087e-111   1.253870 1.000 0.828 7.680328e-107
## POP4      3.023810e-88   1.016761 1.000 0.788  5.459187e-84
## UQCRFS1   6.906204e-86   1.233646 0.993 0.711  1.246846e-81
## CRABP2    2.234680e-71   1.011644 1.000 0.834  4.034491e-67
FeaturePlot(sobj, features = c("C19orf12", "CCNE1",
                               "POP4", "CRABP2"))

VlnPlot(sobj, features = c("HIST1H4J", "ST6GALNAC5",
                               "HIST1H2AE", "IFITM1"))

Try to play with the parameters and find reliable markers. How to assign cell types?