Single sample analysis
Pratibha Panwar, Boyi Guo, Haowen Zhou, Stephanie Hicks, Shila Ghazanfar
2024-08-28
Source:vignettes/seqFISH_mouseEmbryo.Rmd
seqFISH_mouseEmbryo.Rmd
SeqFISH mouse embryo data analysis
# load required packages
library(clustSIGNAL)
library(distances)
library(cluster)
library(aricode)
library(dplyr)
library(ggplot2)
library(patchwork)
# Load the example dataset; presumably this is what provides the `me_expr`
# and `me_data` objects used below
data(mEmbryo2)

# Build the SpatialExperiment object: expression values go into the
# "logcounts" assay, and the X and Y columns of the cell metadata are used as
# the spatial coordinates
spe <- SpatialExperiment(
  assays = list(logcounts = me_expr),
  colData = me_data,
  spatialCoordsNames = c("X", "Y")
)
spe
#> class: SpatialExperiment
#> dim: 351 14185
#> metadata(0):
#> assays(1): logcounts
#> rownames(351): Abcc4 Acp5 ... Zfp57 Zic3
#> rowData names(0):
#> colnames(14185): embryo2_Pos29_cell10_z2 embryo2_Pos29_cell10_z5 ...
#> embryo2_Pos50_cell99_z2 embryo2_Pos50_cell99_z5
#> colData names(7): uniqueID pos ... nsCluster nsSubcluster
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):
#> spatialCoords names(2) : X Y
#> imgData names(0):
# List the cell metadata columns available in colData
names(colData(spe))
#> [1] "uniqueID" "pos"
#> [3] "celltype_mapped_refined" "sample_id"
#> [5] "entropy" "nsCluster"
#> [7] "nsSubcluster"
To run clustSIGNAL, we need the names of the colData columns that hold the sample labels and the cell labels in the SpatialExperiment object. Here, the cell labels are in the ‘uniqueID’ column and the sample labels are in the ‘sample_id’ column.
Running clustSIGNAL
# Fix the random seed so that repeated runs give the same result
set.seed(100)

# Names of the colData columns holding the sample labels and the cell labels
samples <- "sample_id"
cells <- "uniqueID"

# Run clustSIGNAL on the SpatialExperiment object; the `outputs` argument
# selects which results are returned in the output list
res_emb <- clustSIGNAL(spe, samples, cells, outputs = "a")
#> [1] "Calculating PCA to use as reduced dimension input."
#> [1] "clustSIGNAL run started. 2024-08-28 03:46:56.505918"
#> [1] "Initial nonspatial clustering performed. Clusters = 16 2024-08-28 03:47:02.245294"
#> [1] "Nonspatial subclustering performed. Subclusters = 80 2024-08-28 03:47:07.286679"
#> [1] "Regions defined. 2024-08-28 03:47:13.02295"
#> [1] "Region domainness calculated. 2024-08-28 03:47:23.788145"
#> [1] "Smoothing performed. NN = 30 Kernel = G Spread = 0.05 2024-08-28 03:49:39.578434"
#> [1] "Nonspatial clustering performed on smoothed data. Clusters = 24 2024-08-28 03:49:45.080824"
#> [1] "clustSIGNAL run completed. 2024-08-28 03:49:45.082649"
The output variable is a list that can contain a data frame of cluster labels, a matrix of the NN nearest neighbours of each cell, the final SpatialExperiment object, or a combination of these, depending on the value chosen for the ‘outputs’ argument.
# Names of the elements in the clustSIGNAL output list
names(res_emb)
#> [1] "clusters" "neighbours" "spe_final"
# First three rows of the cluster assignment table
head(res_emb$clusters, n = 3)
#> Cells Clusters
#> 1 embryo2_Pos29_cell10_z2 19
#> 2 embryo2_Pos29_cell10_z5 19
#> 3 embryo2_Pos29_cell100_z2 19
# Continue the analysis with the SpatialExperiment object returned by
# clustSIGNAL, and print its summary
spe <- res_emb$spe_final
spe
#> class: SpatialExperiment
#> dim: 351 14185
#> metadata(0):
#> assays(2): logcounts smoothed
#> rownames(351): Abcc4 Acp5 ... Zfp57 Zic3
#> rowData names(0):
#> colnames(14185): embryo2_Pos29_cell10_z2 embryo2_Pos29_cell10_z5 ...
#> embryo2_Pos50_cell99_z2 embryo2_Pos50_cell99_z5
#> colData names(8): uniqueID pos ... nsSubcluster reCluster
#> reducedDimNames(2): PCA PCA.smooth
#> mainExpName: NULL
#> altExpNames(0):
#> spatialCoords names(2) : X Y
#> imgData names(1): sample_id
Calculating clustering metrics
# Silhouette width of every cell with respect to its clustSIGNAL cluster

# cluster labels converted to numeric codes
cluster_ids <- as.numeric(as.character(spe$reCluster))
# cells-by-genes matrix (transpose of the logcounts assay)
cell_by_gene <- t(as.matrix(logcounts(spe)))
# distances between cells in gene expression space
cell_dist <- distances(cell_by_gene)
sil_mat <- as.matrix(silhouette(cluster_ids, cell_dist))
# keep the silhouette width column (third column) as cell metadata
spe$rcSil <- sil_mat[, "sil_width"]
# When annotated cell types are available, agreement between the annotation
# and the clusters can also be measured with the adjusted Rand index (ARI) and
# the normalised mutual information (NMI). These are reported together with
# the average silhouette width (ASW) and the range and mean of entropy.
cell_meta <- as.data.frame(colData(spe))
summarise(
  cell_meta,
  ARI = aricode::ARI(celltype_mapped_refined, reCluster),
  NMI = aricode::NMI(celltype_mapped_refined, reCluster),
  ASW = mean(rcSil),
  min_Entropy = min(entropy),
  max_Entropy = max(entropy),
  mean_Entropy = mean(entropy)
)
#> ARI NMI ASW min_Entropy max_Entropy mean_Entropy
#> 1 0.3302441 0.58197 0.03808628 0 2.906357 1.246809
Visualising clustSIGNAL outputs
# Palette of 45 hex colours used for the cluster plots
colors <- c(
  "#635547", "#8EC792", "#9e6762", "#FACB12", "#3F84AA",
  "#0F4A9C", "#ff891c", "#EF5A9D", "#C594BF", "#DFCDE4",
  "#139992", "#65A83E", "#8DB5CE", "#005579", "#C9EBFB",
  "#B51D8D", "#532C8A", "#8870ad", "#cc7818", "#FBBE92",
  "#EF4E22", "#f9decf", "#c9a997", "#C72228", "#f79083",
  "#F397C0", "#DABE99", "#c19f70", "#354E23", "#C3C388",
  "#647a4f", "#CDE088", "#f7f79e", "#F6BFCB", "#7F6874",
  "#989898", "#1A1A1A", "#FFFFFF", "#e6e6e6", "#77441B",
  "#F90026", "#A10037", "#DA5921", "#E1C239", "#9DD84A"
)
Entropy spread and distribution
# Panel A: histogram showing the spread of the entropy values
ent_df <- as.data.frame(colData(spe))
hst_ent <- ggplot(ent_df, aes(entropy)) +
  geom_histogram(binwidth = 0.05) +
  labs(title = "A", x = "Entropy", y = "Number of regions") +
  theme_grey() +
  theme(text = element_text(size = 12))
# Panel B: spatial plot of the sample with every cell coloured by its entropy
# value. The second spatial coordinate is negated, which flips the plot
# vertically.
spt_ent <- as.data.frame(colData(spe)) %>%
  ggplot(aes(x = spatialCoords(spe)[, 1],
             y = -spatialCoords(spe)[, 2])) +
  geom_point(aes(colour = entropy), size = 0.5) +
  # Two-colour gradient from grey (low entropy) to blue (high entropy).
  # A diverging scale (scale_colour_gradient2) is not used here: its default
  # midpoint of 0 with a white mid colour would leave `low` unused for
  # non-negative entropy and fade low-entropy cells into the white background.
  # No size scale is added because point size is a constant, not a mapped
  # aesthetic.
  scale_colour_gradient(name = "Entropy", low = "grey", high = "blue") +
  ggtitle("B") +
  labs(x = "x-coordinate", y = "y-coordinate") +
  theme_classic() +
  theme(text = element_text(size = 12))

# Show the histogram (A) and the spatial plot (B) side by side
hst_ent + spt_ent
The spread (A) and spatial distribution (B) of region entropy measures can be very useful in assessing the tissue composition of samples - low entropy regions are more homogeneous with domain-like structure, whereas high entropy regions are heterogeneous with more uniform distribution of cells.
clustSIGNAL clusters visualisation
# Per-cell metadata used by both panels. reCluster is made a factor so that
# its levels give one fixed set of cluster labels.
df_ent <- as.data.frame(colData(spe))
df_ent$reCluster <- factor(df_ent$reCluster)

# One named palette shared by both panels. ggplot2 matches named manual scale
# values to the cluster labels, so a cluster keeps the same colour in the
# spatial plot and in the box plot even after the factor levels are reordered.
cluster_ids <- levels(df_ent$reCluster)
cluster_cols <- setNames(colors[seq_along(cluster_ids)], cluster_ids)

# Panel A: spatial plot of the cells coloured by clustSIGNAL cluster
spt_clust <- df_ent %>%
  ggplot(aes(x = spatialCoords(spe)[, 1],
             y = -spatialCoords(spe)[, 2])) +
  geom_point(size = 0.5, aes(colour = reCluster)) +
  scale_color_manual(values = cluster_cols) +
  ggtitle("A") +
  labs(x = "x-coordinate", y = "y-coordinate") +
  guides(color = guide_legend(title = "Clusters",
                              override.aes = list(size = 3))) +
  theme_classic() +
  theme(text = element_text(size = 12))

# Median entropy of each cluster
cluster_ent <- df_ent %>%
  group_by(cluster = as.character(reCluster)) %>%
  summarise(medianEntropy = median(entropy))

# Reorder the clusters from low to high median entropy
cluster_order <- sort(setNames(cluster_ent$medianEntropy, cluster_ent$cluster))
df_ent$reCluster <- factor(df_ent$reCluster, levels = names(cluster_order))

# Panel B: box plot of the entropy values in each cluster
box_clust <- df_ent %>%
  ggplot(aes(x = reCluster, y = entropy, fill = reCluster)) +
  geom_boxplot() +
  scale_fill_manual(values = cluster_cols) +
  ggtitle("B") +
  labs(x = "clustSIGNAL clusters", y = "Entropy") +
  theme_classic() +
  theme(legend.position = "none",
        text = element_text(size = 12),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Show both panels side by side with a single collected legend
spt_clust + box_clust +
  patchwork::plot_layout(guides = "collect", widths = c(1, 2))
The spatial location (A) and entropy distribution (B) of the clusters provide spatial context of the cluster cells and their neighbourhoods, as well as the compositions of the neighbourhoods.
Session Information
# Record the R version and the package versions used for this analysis
sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
#>
#> locale:
#> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
#> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
#> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
#> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] patchwork_1.2.0 ggplot2_3.5.1
#> [3] dplyr_1.1.4 aricode_1.0.3
#> [5] cluster_2.1.6 distances_0.1.11
#> [7] clustSIGNAL_0.1.0 doParallel_1.0.17
#> [9] iterators_1.0.14 foreach_1.5.2
#> [11] SpatialExperiment_1.14.0 SingleCellExperiment_1.26.0
#> [13] SummarizedExperiment_1.34.0 Biobase_2.64.0
#> [15] GenomicRanges_1.56.1 GenomeInfoDb_1.40.1
#> [17] IRanges_2.38.1 S4Vectors_0.42.1
#> [19] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
#> [21] matrixStats_1.3.0 BiocStyle_2.32.1
#>
#> loaded via a namespace (and not attached):
#> [1] gridExtra_2.3 rlang_1.1.4
#> [3] magrittr_2.0.3 scater_1.32.1
#> [5] compiler_4.4.1 DelayedMatrixStats_1.26.0
#> [7] systemfonts_1.1.0 vctrs_0.6.5
#> [9] pkgconfig_2.0.3 crayon_1.5.3
#> [11] fastmap_1.2.0 magick_2.8.4
#> [13] XVector_0.44.0 labeling_0.4.3
#> [15] scuttle_1.14.0 utf8_1.2.4
#> [17] rmarkdown_2.28 UCSC.utils_1.0.0
#> [19] ggbeeswarm_0.7.2 ragg_1.3.2
#> [21] xfun_0.47 bluster_1.14.0
#> [23] zlibbioc_1.50.0 cachem_1.1.0
#> [25] beachmat_2.20.0 jsonlite_1.8.8
#> [27] highr_0.11 DelayedArray_0.30.1
#> [29] BiocParallel_1.38.0 irlba_2.3.5.1
#> [31] R6_2.5.1 bslib_0.8.0
#> [33] jquerylib_0.1.4 Rcpp_1.0.13
#> [35] bookdown_0.40 knitr_1.48
#> [37] igraph_2.0.3 Matrix_1.7-0
#> [39] tidyselect_1.2.1 abind_1.4-5
#> [41] yaml_2.3.10 viridis_0.6.5
#> [43] codetools_0.2-20 lattice_0.22-6
#> [45] tibble_3.2.1 withr_3.0.1
#> [47] evaluate_0.24.0 desc_1.4.3
#> [49] pillar_1.9.0 BiocManager_1.30.24
#> [51] generics_0.1.3 sparseMatrixStats_1.16.0
#> [53] munsell_0.5.1 scales_1.3.0
#> [55] glue_1.7.0 tools_4.4.1
#> [57] BiocNeighbors_1.22.0 ScaledMatrix_1.12.0
#> [59] fs_1.6.4 grid_4.4.1
#> [61] colorspace_2.1-1 GenomeInfoDbData_1.2.12
#> [63] beeswarm_0.4.0 BiocSingular_1.20.0
#> [65] vipor_0.4.7 cli_3.6.3
#> [67] rsvd_1.0.5 textshaping_0.4.0
#> [69] fansi_1.0.6 S4Arrays_1.4.1
#> [71] viridisLite_0.4.2 gtable_0.3.5
#> [73] sass_0.4.9 digest_0.6.37
#> [75] SparseArray_1.4.8 ggrepel_0.9.5
#> [77] farver_2.1.2 rjson_0.2.22
#> [79] htmltools_0.5.8.1 pkgdown_2.1.0
#> [81] lifecycle_1.0.4 httr_1.4.7