Data is human ovarian carcinoma tissue (10x droplet scRNAseq)
library(Seurat)
library(dplyr)
# need install hdf5r, bioconductor
data <- Read10X_h5("data/human_ovarian_carcinoma_filter_matrix.h5")
sobj <- CreateSeuratObject(counts = data,
project = "Ovarian_Carcinoma",
min.cells = 3,
min.features = 200)
sobj
## An object of class Seurat
## 18054 features across 4670 samples within 1 assay
## Active assay: RNA (18054 features, 0 variable features)
## 1 layer present: counts
Let us see the structure of the seurat object
sobj@assays
## $RNA
## Assay (v5) data with 18054 features for 4670 cells
## First 10 features:
## SAMD11, NOC2L, KLHL17, PLEKHN1, PERM1, HES4, ISG15, AGRN, RNF223,
## C1orf159
## Layers:
## counts
# raw counts of cells
sobj@assays$RNA@layers$counts[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##
## [1,] 3 3 5 13 12 4 14 5 2 9
## [2,] 12 5 6 6 9 5 8 11 7 15
## [3,] . 1 3 2 . 2 1 2 1 1
## [4,] . 2 . 2 2 . 1 . . 2
## [5,] . . . 1 . . . . . .
## [6,] . 1 2 2 4 4 4 3 1 .
## [7,] 2 20 . 7 4 17 61 7 1 81
## [8,] 54 27 40 45 33 34 32 24 15 39
## [9,] 1 . . . . 1 . . . 1
## [10,] 1 . . 1 4 1 . 1 1 1
# see the barcode of the data
sobj@assays$RNA@cells@.Data %>% head()
## counts
## AACAATGTGCTCCGAG-1 TRUE
## AACACCATTCGCATAC-1 TRUE
## AACACGTTGATACCGC-1 TRUE
## AACACTCGTGAGCTTC-1 TRUE
## AACAGCCTCCTGACTA-1 TRUE
## AACAGGAATTCTGTGA-1 TRUE
# see the feature (gene name)
sobj@assays$RNA@features@.Data %>% head()
## counts
## SAMD11 TRUE
## NOC2L TRUE
## KLHL17 TRUE
## PLEKHN1 TRUE
## PERM1 TRUE
## HES4 TRUE
sobj@meta.data %>% head()
## orig.ident nCount_RNA nFeature_RNA
## AACAATGTGCTCCGAG-1 Ovarian_Carcinoma 28045 8583
## AACACCATTCGCATAC-1 Ovarian_Carcinoma 21851 7256
## AACACGTTGATACCGC-1 Ovarian_Carcinoma 39421 9691
## AACACTCGTGAGCTTC-1 Ovarian_Carcinoma 44116 9971
## AACAGCCTCCTGACTA-1 Ovarian_Carcinoma 46131 10109
## AACAGGAATTCTGTGA-1 Ovarian_Carcinoma 31401 8725
Mitocindria DNA (mtDNA)
sobj[["percent.mt"]] <- PercentageFeatureSet(sobj, pattern = "^MT-")
sobj@meta.data %>% head()
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AACAATGTGCTCCGAG-1 Ovarian_Carcinoma 28045 8583 2.824033
## AACACCATTCGCATAC-1 Ovarian_Carcinoma 21851 7256 2.663494
## AACACGTTGATACCGC-1 Ovarian_Carcinoma 39421 9691 3.000939
## AACACTCGTGAGCTTC-1 Ovarian_Carcinoma 44116 9971 2.550095
## AACAGCCTCCTGACTA-1 Ovarian_Carcinoma 46131 10109 2.299972
## AACAGGAATTCTGTGA-1 Ovarian_Carcinoma 31401 8725 2.184644
# Visualize the results
VlnPlot(sobj,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
# filter data
sobj <- subset(
sobj,
subset = nFeature_RNA > 2500 & nFeature_RNA < 10000 & percent.mt < 4)
We use log normalization to normalize our data
sobj <- NormalizeData(sobj, normalization.method = "LogNormalize")
## Normalizing layer: counts
sobj@assays$RNA
## Assay (v5) data with 18054 features for 3498 cells
## First 10 features:
## SAMD11, NOC2L, KLHL17, PLEKHN1, PERM1, HES4, ISG15, AGRN, RNF223,
## C1orf159
## Layers:
## counts, data
sobj@assays$RNA@layers$data[1:5, 1:3]
## 5 x 3 sparse Matrix of class "dgCMatrix"
##
## [1,] 0.7274082 0.8641275 0.8190569
## [2,] 1.6637059 1.1903478 0.9250647
## [3,] . 0.3768221 0.5658907
## [4,] . 0.6498690 .
## [5,] . . .
sobj[["RNA"]]$data[1:5, 1:3]
## 5 x 3 sparse Matrix of class "dgCMatrix"
## AACAATGTGCTCCGAG-1 AACACCATTCGCATAC-1 AACACGTTGATACCGC-1
## SAMD11 0.7274082 0.8641275 0.8190569
## NOC2L 1.6637059 1.1903478 0.9250647
## KLHL17 . 0.3768221 0.5658907
## PLEKHN1 . 0.6498690 .
## PERM1 . . .
sobj <- FindVariableFeatures(sobj, nfeatures = 1000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- VariableFeatures(sobj) %>% head(10)
# plot variable features with and without labels
plot <- VariableFeaturePlot(sobj)
plot <- LabelPoints(plot = plot, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot
Here we first scale the data and then perform principal component analysis (PCA).
sobj <- ScaleData(sobj, vars.to.regress = "percent.mt")
## Regressing out percent.mt
## Centering and scaling data matrix
sobj
## An object of class Seurat
## 18054 features across 3498 samples within 1 assay
## Active assay: RNA (18054 features, 1000 variable features)
## 3 layers present: counts, data, scale.data
# visualize pca results in elbow plot
sobj <- RunPCA(sobj, features=VariableFeatures(object=sobj)) # can check sobj
## PC_ 1
## Positive: WFDC2, SLPI, MUC1, SLC34A2, EPCAM, BHLHE41, THEMIS2, SLC35E1, MUC16, S100A1
## MAL2, PERP, C19orf12, IFI27, CLDN4, AFDN, CCNE1, IFI6, CLDN3, EPPK1
## WNT7A, HIST1H1B, CRIP1, FAM32A, HIST1H1D, CXCL16, TPX2, POP4, DHCR24, KCTD1
## Negative: THBS1, IGFBP4, DCN, COL6A3, SPARC, CDH11, SERPINE2, COL6A2, LTBP2, HTRA1
## DPYSL3, COL1A1, TNXB, TAGLN, COL1A2, COL14A1, IGFBP7, CCN2, COL3A1, PDGFRA
## PRELP, COL6A1, FOS, COL16A1, CAVIN1, BGN, TIMP1, SMOC2, PTGIS, COL5A1
## PC_ 2
## Positive: TUBB2B, PAX2, SIX3, MYRF, BCAT1, TNNT1, APOA1, IMPG2, DPEP3, GPR27
## ADAM28, CTCFL, SPRY2, KLK6, CCR8, STXBP6, PLAT, SOX4, EDNRB, FOXA2
## LRIG1, TENM4, LRRC4, NRP2, FGF3, MST1, ADGRL2, OTUD1, CITED2, REN
## Negative: CD74, OAS2, STAT1, IGHG1, IFI44L, IGKC, CXCL10, C1QC, TRIM22, CMPK2
## IFIT3, SAMD9L, PSMB9, RSAD2, C1QB, IFIT1, EPSTI1, IFI44, XAF1, MX1
## APOL6, CCL5, GBP1, IFIT2, DDX58, CTSS, C1QA, TXNIP, OAS1, IFI16
## PC_ 3
## Positive: CADPS, APCDD1, IGKC, IGHG1, A2M, NKD2, VCAN, LGR5, HOPX, AQP1
## MZB1, JCHAIN, BCAT1, LRIG1, MME, IMPG2, NKD1, LXN, CD79A, CLEC14A
## GPR27, HIST1H1B, SAT1, VWF, RGS5, MGP, EGFL7, LUM, EPAS1, PECAM1
## Negative: VEGFA, S100A10, PFKP, HK2, RND3, DDIT4, GDF15, CIART, PFKFB4, P4HA1
## SLC2A1, NDRG1, SCD, ADM, SST, CRYAB, NDUFA4L2, TRIB3, AK4, LY6E
## PPFIA4, DTNA, NXPH4, ANKRD37, ALDOC, DDIT3, KDM3A, PXDC1, BHLHE40, PPP1R15A
## PC_ 4
## Positive: KRT7, MMP7, RNF144B, EBF3, ELF3, POU3F3, ZSWIM4, RCOR2, HIST1H1E, EHF
## ATF5, FRK, WNT11, TACSTD2, ITGB8, LIX1, WFDC2, COL9A1, FAM107A, FCGBP
## LTB, IL32, KRT5, SOX9, HMGA2, CDKN2B, TMCC3, SDC4, ASCL2, RAB25
## Negative: IFIT1, IFIT3, SAMHD1, IFIT2, RSAD2, IFI44L, CMPK2, OAS2, BCAT1, CXCL10
## MT-ND3, LYPD1, RNASE1, SP110, ADAM28, DDX60L, MX1, XAF1, IFI44, CXCL11
## OAS1, IFI16, LRRC4, DDX58, PLSCR1, PAX2, SAMD9, REN, NPAS2, HERC5
## PC_ 5
## Positive: LY6E, GMPR, SELENBP1, MX1, ISG15, IFI44L, ID4, IFI44, USP18, IRF7
## IFIT1, DTX3L, HELZ2, IFIT3, DDX58, HERC6, ESR1, DDX60, OAS1, PLSCR1
## PLAT, OAS3, IFI16, IFI35, APOA1, IFI27, HERC5, HSH2D, ANKRD65, MSLN
## Negative: GPNMB, ADM, LYZ, BHLHE40, RGS1, NDUFA4L2, DDIT4, MMP9, COMP, LAPTM5
## LSP1, VCAN, PFKFB3, LCP1, ITGAX, GDF15, CD68, CXCL8, FN1, PDK1
## HK2, STC1, STC2, HSPA6, CADPS, SRGN, TRIB3, CD74, SLC16A3, PLAUR
ElbowPlot(sobj)
Here we used UMAP to clustering the cells
sobj <- FindNeighbors(sobj, dims=1:14) # SNN: Shared Nearest Neighbor
## Computing nearest neighbor graph
## Computing SNN
sobj <- FindClusters(sobj, resolution=0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3498
## Number of edges: 114582
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8470
## Number of communities: 8
## Elapsed time: 0 seconds
# UMAP
sobj <- RunUMAP(sobj, dims=1:14) # check sobj
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:21:56 UMAP embedding parameters a = 0.9922 b = 1.112
## 14:21:56 Read 3498 rows and found 14 numeric columns
## 14:21:56 Using Annoy for neighbor search, n_neighbors = 30
## 14:21:56 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:21:57 Writing NN index file to temp file /var/folders/px/nbrccl_90mq_2wgr50w9qt6m0000gn/T//Rtmpi0XPle/file3e4c24f3b6c6
## 14:21:57 Searching Annoy index using 1 thread, search_k = 3000
## 14:21:57 Annoy recall = 100%
## 14:21:58 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 14:21:58 Initializing from normalized Laplacian + noise (using RSpectra)
## 14:21:58 Commencing optimization for 500 epochs, with 138510 positive edges
## 14:22:02 Optimization finished
DimPlot(sobj, reduction="umap")
A review of your seurat object. Now you have finished the first part of standard workflow of scRNAseq then you can save your seurat object for future usage.
saveRDS(sobj, file="output/bc_tutorial.rds")
# read Seurat rds object
sobj <- readRDS("output/bc_tutorial.rds")
# install presto for efficiency
# devtools::install_github("immunogenomics/presto")
# find all markers of cluster 2
cluster2.markers <- FindMarkers(sobj, ident.1 = 2)
head(cluster2.markers, n=5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## CDH23 0 5.598518 0.900 0.152 0
## OGN 0 5.785283 0.797 0.075 0
## PTGIS 0 4.813088 0.952 0.237 0
## SMOC2 0 4.453608 0.952 0.247 0
## GATA4 0 5.302072 0.829 0.136 0
VlnPlot(sobj, features = "SST")
Find all markers
sobj.markers <- FindAllMarkers(sobj, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
sobj.markers %>% head()
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## BHLHE41 6.203160e-239 0.9382200 0.999 0.817 1.119919e-234 0 BHLHE41
## SLC35E1 1.098709e-190 0.8376728 0.999 0.817 1.983610e-186 0 SLC35E1
## FNTA 1.824515e-187 0.9101818 0.991 0.838 3.293980e-183 0 FNTA
## PAX2 3.317360e-186 1.5397125 0.914 0.394 5.989162e-182 0 PAX2
## PERP 5.841617e-183 0.9194699 0.997 0.796 1.054646e-178 0 PERP
## TUBB2B 7.196608e-179 1.2094813 0.968 0.624 1.299276e-174 0 TUBB2B
str(sobj.markers)
## 'data.frame': 36706 obs. of 7 variables:
## $ p_val : num 6.20e-239 1.10e-190 1.82e-187 3.32e-186 5.84e-183 ...
## $ avg_log2FC: num 0.938 0.838 0.91 1.54 0.919 ...
## $ pct.1 : num 0.999 0.999 0.991 0.914 0.997 0.968 0.994 0.997 0.926 0.996 ...
## $ pct.2 : num 0.817 0.817 0.838 0.394 0.796 0.624 0.71 0.876 0.518 0.755 ...
## $ p_val_adj : num 1.12e-234 1.98e-186 3.29e-183 5.99e-182 1.05e-178 ...
## $ cluster : Factor w/ 8 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gene : chr "BHLHE41" "SLC35E1" "FNTA" "PAX2" ...
unique(sobj.markers$cluster)
## [1] 0 1 2 3 4 5 6 7
## Levels: 0 1 2 3 4 5 6 7
sobj.markers %>%
group_by(cluster) %>%
dplyr::filter(avg_log2FC > 1) %>%
slice_head(n=3) -> top3
DoHeatmap(sobj, features = top3$gene) + NoLegend()
## Warning in DoHeatmap(sobj, features = top3$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: JUND,
## RBM38, GSTP1
FeaturePlot(sobj, features = c("PAX2", "CADPS", "CDH23", "KRT7",
"POP4", "MMP7", "COMP"))
cluster2.markers <- FindMarkers(sobj, ident.1 = 2,
ident.2 = 4,
logfc.threshold=0.25,
only.pos=TRUE)
head(cluster2.markers, n=5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## THBS1 1.839421e-122 6.402605 1 0.410 3.320891e-118
## COL6A3 2.370621e-121 4.600623 1 0.750 4.279919e-117
## DCN 2.373786e-121 4.857788 1 0.736 4.285633e-117
## LTBP2 2.721994e-121 4.806109 1 0.736 4.914288e-117
## COL6A2 2.903853e-121 4.222200 1 0.927 5.242616e-117
Color with cell markers
FeaturePlot(sobj, features = c("DHRS2", "TNXB", "OGN", "CDH23"))
cluster4.markers <- FindMarkers(sobj, ident.1 = 4,
logfc.threshold=1,
only.pos=TRUE)
head(cluster4.markers, n=5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## C19orf12 4.287216e-134 1.136787 1.000 0.869 7.740140e-130
## CCNE1 4.254087e-111 1.253870 1.000 0.828 7.680328e-107
## POP4 3.023810e-88 1.016761 1.000 0.788 5.459187e-84
## UQCRFS1 6.906204e-86 1.233646 0.993 0.711 1.246846e-81
## CRABP2 2.234680e-71 1.011644 1.000 0.834 4.034491e-67
FeaturePlot(sobj, features = c("C19orf12", "CCNE1",
"POP4", "CRABP2"))
VlnPlot(sobj, features = c("HIST1H4J", "ST6GALNAC5",
"HIST1H2AE", "IFITM1"))
Try to play with the parameters and find reliable markers. How to assign cell types?