Single sample analysis
Pratibha Panwar, Boyi Guo, Haowen Zhou, Stephanie Hicks, Shila Ghazanfar
2024-09-09
Source:vignettes/v1_seqFISH_mouseEmbryo.Rmd
v1_seqFISH_mouseEmbryo.Rmd
SeqFISH mouse embryo data analysis
# load required packages
library(clustSIGNAL)
library(distances)
library(cluster)
library(aricode)
library(dplyr)
library(ggplot2)
library(patchwork)
library(scattermore)
data(mEmbryo2)
spe <- SpatialExperiment(assays = list(logcounts = me_expr),
colData = me_data, spatialCoordsNames = c("X", "Y"))
spe
## class: SpatialExperiment
## dim: 351 5000
## metadata(0):
## assays(1): logcounts
## rownames(351): Abcc4 Acp5 ... Zfp57 Zic3
## rowData names(0):
## colnames(5000): embryo2_Pos29_cell100_z2 embryo2_Pos29_cell101_z5 ...
## embryo2_Pos50_cell97_z5 embryo2_Pos50_cell99_z5
## colData names(4): uniqueID pos celltype_mapped_refined sample_id
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## spatialCoords names(2) : X Y
## imgData names(0):
names(colData(spe))
## [1] "uniqueID" "pos"
## [3] "celltype_mapped_refined" "sample_id"
To run clustSIGNAL, we need the column names of sample and cell labels in the colData dataframe of the spatial experiment object. Here, the cell labels are in the column ‘uniqueID’ and sample labels are in ‘embryo’ column.
Running clustSIGNAL
set.seed(100)
samples <- "sample_id"
cells <- "uniqueID"
res_emb <- clustSIGNAL(spe, samples, cells, outputs = "a")
## [1] "Calculating PCA to use as reduced dimension input."
## [1] "clustSIGNAL run started. 2024-09-09 11:44:48.502439"
## [1] "Initial nonspatial clustering performed. Clusters = 11 2024-09-09 11:44:49.236507"
## [1] "Nonspatial subclustering performed. Subclusters = 52 2024-09-09 11:44:51.382995"
## [1] "Regions defined. 2024-09-09 11:44:53.476433"
## [1] "Region domainness calculated. 2024-09-09 11:44:54.358273"
## [1] "Smoothing performed. NN = 30 Kernel = G Spread = 0.05 2024-09-09 11:46:08.460835"
## [1] "Nonspatial clustering performed on smoothed data. Clusters = 14 2024-09-09 11:46:09.662027"
## [1] "clustSIGNAL run completed. 2024-09-09 11:46:09.66379"
The output variable is a list that can contain dataframe of cluster names, matrix of NN neighbours of each cell, final spe object, or a combination of these, depending on the choice of ‘outputs’ selected.
names(res_emb)
## [1] "clusters" "neighbours" "spe_final"
head(res_emb$clusters, n = 3)
## Cells Clusters
## 1 embryo2_Pos29_cell100_z2 10
## 2 embryo2_Pos29_cell101_z5 10
## 3 embryo2_Pos29_cell104_z2 10
spe <- res_emb$spe_final
spe
## class: SpatialExperiment
## dim: 351 5000
## metadata(0):
## assays(2): logcounts smoothed
## rownames(351): Abcc4 Acp5 ... Zfp57 Zic3
## rowData names(0):
## colnames(5000): embryo2_Pos29_cell100_z2 embryo2_Pos29_cell101_z5 ...
## embryo2_Pos50_cell97_z5 embryo2_Pos50_cell99_z5
## colData names(8): uniqueID pos ... entropy reCluster
## reducedDimNames(2): PCA PCA.smooth
## mainExpName: NULL
## altExpNames(0):
## spatialCoords names(2) : X Y
## imgData names(1): sample_id
Calculating clustering metrics
# calculating silhouette width
clusts <- as.numeric(as.character(spe$reCluster))
cXg_mat <- t(as.matrix(logcounts(spe)))
distMat <- distances(cXg_mat)
silCluster <- as.matrix(silhouette(clusts, distMat))
spe$rcSil <- silCluster[, 3]
# for datasets with annotated cell type information, we can also calculate
# metrics like adjusted rand index (ARI) and normalised mutual information (NMI)
as.data.frame(colData(spe)) %>%
summarise(ARI = aricode::ARI(celltype_mapped_refined, reCluster),
NMI = aricode::NMI(celltype_mapped_refined, reCluster),
ASW = mean(rcSil),
min_Entropy = min(entropy),
max_Entropy = max(entropy),
mean_Entropy = mean(entropy))
## ARI NMI ASW min_Entropy max_Entropy mean_Entropy
## 1 0.3668453 0.6469266 0.05075243 0 3.109686 1.420307
Visualising clustSIGNAL outputs
colors <- c("#635547", "#8EC792", "#9e6762", "#FACB12", "#3F84AA", "#0F4A9C",
"#ff891c", "#EF5A9D", "#C594BF", "#DFCDE4", "#139992", "#65A83E",
"#8DB5CE", "#005579", "#C9EBFB", "#B51D8D", "#532C8A", "#8870ad",
"#cc7818", "#FBBE92", "#EF4E22", "#f9decf", "#c9a997", "#C72228",
"#f79083", "#F397C0", "#DABE99", "#c19f70", "#354E23", "#C3C388",
"#647a4f", "#CDE088", "#f7f79e", "#F6BFCB", "#7F6874", "#989898",
"#1A1A1A", "#FFFFFF", "#e6e6e6", "#77441B", "#F90026", "#A10037",
"#DA5921", "#E1C239", "#9DD84A")
Entropy spread and distribution
# Histogram of entropy spread
hst_ent <- as.data.frame(colData(spe)) %>%
ggplot(aes(entropy)) +
geom_histogram(binwidth = 0.05) +
ggtitle("A") +
labs(x = "Entropy", y = "Number of regions") +
theme_grey() +
theme(text = element_text(size = 12))
# Spatial plot showing sample entropy distribution
spt_ent <- as.data.frame(colData(spe)) %>%
ggplot(aes(x = spatialCoords(spe)[, 1],
y = -spatialCoords(spe)[, 2])) +
geom_scattermore(pointsize = 3,
aes(colour = entropy)) +
scale_colour_gradient2("Entropy", low = "grey", high = "blue") +
scale_size_continuous(range = c(0, max(spe$entropy))) +
ggtitle("B") +
labs(x = "x-coordinate", y = "y-coordinate") +
theme_classic() +
theme(text = element_text(size = 12))
hst_ent + spt_ent
The spread (A) and spatial distribution (B) of region entropy measures can be very useful in assessing the tissue composition of samples - low entropy regions are more homogeneous with domain-like structure, whereas high entropy regions are heterogeneous with more uniform distribution of cells.
clustSIGNAL clusters visualisation
df_ent <- as.data.frame(colData(spe))
# spatial plot
spt_clust <- df_ent %>%
ggplot(aes(x = spatialCoords(spe)[, 1],
y = -spatialCoords(spe)[, 2])) +
geom_scattermore(pointsize = 3, aes(colour = reCluster)) +
scale_color_manual(values = colors) +
ggtitle("A") +
labs(x = "x-coordinate", y = "y-coordinate") +
guides(color = guide_legend(title = "Clusters",
override.aes = list(size = 3))) +
theme_classic() +
theme(text = element_text(size = 12))
# calculating median entropy of each cluster
celltype_ent <- df_ent %>%
group_by(as.character(reCluster)) %>%
summarise(meanEntropy = median(entropy))
# reordering clusters by their median entropy
# low to high median entropy
cellOrder <- celltype_ent$meanEntropy
names(cellOrder) <- celltype_ent$`as.character(reCluster)`
cellOrder <- sort(cellOrder)
df_ent$reCluster <- factor(df_ent$reCluster, levels = names(cellOrder))
# box plot of cluster entropy
colors_ent <- colors[as.numeric(names(cellOrder))]
box_clust <- df_ent %>%
ggplot(aes(x = reCluster, y = entropy, fill = reCluster)) +
geom_boxplot() +
scale_fill_manual(values = colors_ent) +
ggtitle("B") +
labs(x = "clustSIGNAL clusters", y = "Entropy") +
theme_classic() +
theme(legend.position = "none",
text = element_text(size = 12),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
spt_clust + box_clust + patchwork::plot_layout(guides = "collect", widths = c(2, 3))
The spatial location (A) and entropy distribution (B) of the clusters provide spatial context of the cluster cells and their neighbourhoods, as well as the compositions of the neighbouhoods.
Session Information
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] scattermore_1.2 patchwork_1.2.0
## [3] ggplot2_3.5.1 dplyr_1.1.4
## [5] aricode_1.0.3 cluster_2.1.6
## [7] distances_0.1.11 clustSIGNAL_0.1.0
## [9] SpatialExperiment_1.14.0 SingleCellExperiment_1.26.0
## [11] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [13] GenomicRanges_1.56.1 GenomeInfoDb_1.40.1
## [15] IRanges_2.38.1 S4Vectors_0.42.1
## [17] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
## [19] matrixStats_1.4.1 BiocStyle_2.32.1
##
## loaded via a namespace (and not attached):
## [1] gridExtra_2.3 rlang_1.1.4
## [3] magrittr_2.0.3 scater_1.32.1
## [5] compiler_4.4.1 DelayedMatrixStats_1.26.0
## [7] systemfonts_1.1.0 vctrs_0.6.5
## [9] pkgconfig_2.0.3 crayon_1.5.3
## [11] fastmap_1.2.0 magick_2.8.4
## [13] XVector_0.44.0 scuttle_1.14.0
## [15] labeling_0.4.3 utf8_1.2.4
## [17] rmarkdown_2.28 UCSC.utils_1.0.0
## [19] ggbeeswarm_0.7.2 ragg_1.3.2
## [21] xfun_0.47 bluster_1.14.0
## [23] zlibbioc_1.50.0 cachem_1.1.0
## [25] beachmat_2.20.0 jsonlite_1.8.8
## [27] highr_0.11 DelayedArray_0.30.1
## [29] BiocParallel_1.38.0 irlba_2.3.5.1
## [31] parallel_4.4.1 R6_2.5.1
## [33] bslib_0.8.0 jquerylib_0.1.4
## [35] Rcpp_1.0.13 bookdown_0.40
## [37] knitr_1.48 Matrix_1.7-0
## [39] igraph_2.0.3 tidyselect_1.2.1
## [41] abind_1.4-5 yaml_2.3.10
## [43] viridis_0.6.5 codetools_0.2-20
## [45] lattice_0.22-6 tibble_3.2.1
## [47] withr_3.0.1 evaluate_0.24.0
## [49] desc_1.4.3 pillar_1.9.0
## [51] BiocManager_1.30.25 generics_0.1.3
## [53] sparseMatrixStats_1.16.0 munsell_0.5.1
## [55] scales_1.3.0 glue_1.7.0
## [57] tools_4.4.1 BiocNeighbors_1.22.0
## [59] ScaledMatrix_1.12.0 fs_1.6.4
## [61] grid_4.4.1 colorspace_2.1-1
## [63] GenomeInfoDbData_1.2.12 beeswarm_0.4.0
## [65] BiocSingular_1.20.0 vipor_0.4.7
## [67] cli_3.6.3 rsvd_1.0.5
## [69] textshaping_0.4.0 fansi_1.0.6
## [71] S4Arrays_1.4.1 viridisLite_0.4.2
## [73] gtable_0.3.5 sass_0.4.9
## [75] digest_0.6.37 SparseArray_1.4.8
## [77] ggrepel_0.9.6 rjson_0.2.22
## [79] farver_2.1.2 htmltools_0.5.8.1
## [81] pkgdown_2.1.0 lifecycle_1.0.4
## [83] httr_1.4.7