library(methodical)
library(DESeq2)
library(TumourMethData)
#> Warning: replacing previous import 'HDF5Array::h5ls' by 'rhdf5::h5ls' when
#> loading 'TumourMethData'
methodical
facilitates the rapid analysis of the association between DNA
methylation and expression of associated transcripts. It can calculate
correlation values between individual methylation sites (e.g. CpG sites) or the
methylation of wider genomic regions (e.g. expected promoter regions, gene bodies)
and the expression of associated transcripts. We will demonstrate both cases in
this vignette.
# Installing Methodical
if (!require("BiocManager"))
install.packages("BiocManager")
BiocManager::install("methodical")
We now first download a RangedSummarizedExperiment with WGBS data for prostate tumour metastases and associated transcript counts from TumourMethData.
# Download mcrpc_wgbs_hg38 from TumourMethDatasets.
mcrpc_wgbs_hg38_chr11 <- TumourMethData::download_meth_dataset(dataset = "mcrpc_wgbs_hg38_chr11")
#> see ?TumourMethData and browseVignettes('TumourMethData') for documentation
#> loading from cache
#> require("rhdf5")
# Download transcript counts
mcrpc_transcript_counts_file <- TumourMethData::download_rnaseq_dataset(dataset = "mcrpc_wgbs_hg38_chr11")
#> see ?TumourMethData and browseVignettes('TumourMethData') for documentation
#> loading from cache
mcrpc_transcript_counts = data.frame(data.table::fread(mcrpc_transcript_counts_file), row.names = 1)
We now load annotation for our transcripts of interest, in this case all protein-coding transcripts annotated by Gencode.
# Download Gencode annotation
library(AnnotationHub)
#> Loading required package: BiocFileCache
#> Loading required package: dbplyr
#>
#> Attaching package: 'AnnotationHub'
#> The following object is masked from 'package:Biobase':
#>
#> cache
transcript_annotation <- AnnotationHub()[["AH75119"]]
#> loading from cache
#> require("rtracklayer")
# Import the Gencode annotation and subset for transcript annotation for non-mitochondrial protein-coding genes
transcript_annotation <- transcript_annotation[transcript_annotation$type == "transcript"]
transcript_annotation <- transcript_annotation[transcript_annotation$transcript_type == "protein_coding"]
# Remove transcript version from ID
transcript_annotation$ID <- gsub("\\.[0-9]*", "", transcript_annotation$ID)
# Filter for transcripts on chromosome 11
transcript_annotation_chr11 <- transcript_annotation[seqnames(transcript_annotation) == "chr11"]
# Filter for transcripts in the counts table
transcript_annotation_chr11 <- transcript_annotation_chr11[transcript_annotation_chr11$ID %in% row.names(mcrpc_transcript_counts)]
# Set transcript ID as names of ranges
names(transcript_annotation_chr11) <- transcript_annotation_chr11$ID
# Get the TSS for each transcript set ID as names
transcript_tss_chr11 <- resize(transcript_annotation_chr11, 1, fix = "start")
rm(transcript_annotation)
# Get the region surrounding each TSS
tss_proximal_regions_chr11 = methodical::expand_granges(transcript_tss_chr11, upstream = 5000, downstream = 5000)
Next we will normalize counts data for these transcripts from prostate cancer metastases samples using DESeq2.
# Subset mcrpc_transcript_counts for protein-coding transcripts
mcrpc_transcript_counts <- mcrpc_transcript_counts[transcript_tss_chr11$ID, ]
# Create a DESeqDataSet from Kallisto counts.
mcrpc_transcript_counts_dds <- DESeqDataSetFromMatrix(countData = mcrpc_transcript_counts,
colData = data.frame(sample = names(mcrpc_transcript_counts)), design = ~ 1)
mcrpc_transcript_counts_dds <- estimateSizeFactors(mcrpc_transcript_counts_dds)
mcrpc_transcript_counts_normalized <- data.frame(counts(mcrpc_transcript_counts_dds, normalized = TRUE))
We’ll first demonstrate calculating the correlation values between all transcripts
on chromosome 11 and the methylation of all CpG sites within 5 KB of their TSS
and also the significance of these correlation values. We do this with the
calculateMethSiteTranscriptCors
function, which takes methylation
RangedSummarizedExperiment and a table with transcript counts as input.
It also takes a GRanges object with the TSS of interest. The arguments
expand_upstream
and expand_downstream
define the regions around the TSS
where we want to examine CpG sites. We set these to 5KB upstream and downstream.
We can also provide a subset of samples that we want to use to calculate the
correlation values with the samples_subset
parameter.
The default behaviour is to use try to use all samples, but we use
common_mcprc_samples
since not all samples in mcrpc_wgbs_hg38_chr11
are
also present in mcrpc_transcript_counts_normalized
.
Finally, we can control the memory usage and parallelization with the
max_sites_per_chunk
and BPPARAM
parameters.
calculateMethSiteTranscriptCors
aims to keep the memory footprint low by
splitting the provided tss_gr
into chunks of closely-situated TSS and
processing these chunks one at a time. The chunks are limited to that they
contain an approximate maximum number of CpGs within expand_upstream
and
expand_downstream
of the constituent TSS.
max_sites_per_chunk
controls this upper limit of CpG that are read into memory
at once and has a default value of floor(62500000/ncol(meth_rse))
using
approximately 500 MB of RAM.
BPPARAM
controls the number of TSS within a chunk processed in parallel. As
the TSS belong to the same chunk, several cores can be used simultaneously
without drastically increasing RAM consumption. The following example
should just take a few minutes.
# Find samples with both WGBS and RNA-seq count data
common_mcprc_samples <- intersect(names(mcrpc_transcript_counts), colnames(mcrpc_wgbs_hg38_chr11))
# Create a BPPARAM class
BPPARAM = BiocParallel::bpparam()
# Calculate CpG methylation-transcription correlations for all TSS on chromosome 11
system.time({transcript_cpg_meth_cors_mcrpc_samples_5kb <- calculateMethSiteTranscriptCors(
meth_rse = mcrpc_wgbs_hg38_chr11,
transcript_expression_table = mcrpc_transcript_counts_normalized,
samples_subset = common_mcprc_samples,
tss_gr = transcript_tss_chr11, tss_associated_gr = tss_proximal_regions_chr11,
cor_method = "pearson", max_sites_per_chunk = NULL, BPPARAM = BPPARAM)})
#> There are 5553 genes/transcripts in common between tss_gr and transcript_expression_table
#> Using pearson correlation method
#> Calculating correlations for chunk 1 of 1
#> user system elapsed
#> 944.478 1394.991 575.648
As an example, we’ll examine the CpG methylation transcription correlations for a TSS of GSTP1, a gene known to be important to prostate cancer. We’ll use the TSS for the ENST00000398606 transcript, which is the ENSEMBL canonical and MANE select transcript for GSTP1.
We see that we have five columns in the results table: the name of the CpG site, the name of the associated transcript, the correlation value, the p-value of the correlation and the distance of the CpG site to the TSS of the transcript.
The location of the TSS associated with the results is stored as a GRanges as an
attribute called tss_range
, making it easy to retrieve this information.
# Extract correlation results for ENST00000398606 transcript
gstp1_cpg_meth_transcript_cors <- transcript_cpg_meth_cors_mcrpc_samples_5kb[["ENST00000398606"]]
# Examine first few rows of the results
head(gstp1_cpg_meth_transcript_cors)
#> meth_site transcript_name cor p_val distance_to_tss
#> 1 chr11:67578616 ENST00000398606 -0.079476178 0.4342305 -4979
#> 2 chr11:67578632 ENST00000398606 -0.006079833 0.9523740 -4963
#> 3 chr11:67578688 ENST00000398606 -0.088167827 0.3854989 -4907
#> 4 chr11:67578804 ENST00000398606 -0.115182741 0.2562572 -4791
#> 5 chr11:67578810 ENST00000398606 -0.060645790 0.5509724 -4785
#> 6 chr11:67578857 ENST00000398606 -0.013073295 0.8978078 -4738
# Extract TSS for the transcript
attributes(gstp1_cpg_meth_transcript_cors)$tss_range
#> GRanges object with 1 range and 23 metadata columns:
#> seqnames ranges strand | source type score
#> <Rle> <IRanges> <Rle> | <factor> <factor> <numeric>
#> ENST00000398606 chr11 67583595 + | HAVANA transcript NA
#> phase ID gene_id gene_type
#> <integer> <character> <character> <character>
#> ENST00000398606 <NA> ENST00000398606 ENSG00000084207.17 protein_coding
#> gene_name level hgnc_id havana_gene
#> <character> <character> <character> <character>
#> ENST00000398606 GSTP1 2 HGNC:4638 OTTHUMG00000137430.4
#> Parent transcript_id transcript_type
#> <CharacterList> <character> <character>
#> ENST00000398606 ENSG00000084207.17 ENST00000398606.9 protein_coding
#> transcript_name transcript_support_level
#> <character> <character>
#> ENST00000398606 GSTP1-202 1
#> tag havana_transcript
#> <CharacterList> <character>
#> ENST00000398606 basic,appris_principal_1,CCDS OTTHUMT00000268504.3
#> exon_number exon_id ont protein_id
#> <character> <character> <CharacterList> <character>
#> ENST00000398606 <NA> <NA> ENSP00000381607.3
#> ccdsid
#> <character>
#> ENST00000398606 CCDS41679.1
#> -------
#> seqinfo: 25 sequences from an unspecified genome; no seqlengths
Plotting the methylation-transcription correlation values around a TSS enable
us to visualize how the relationship between DNA methylation and transcription
changes around the TSS. We can use the plotMethSiteCorCoefs
function to
plot values for methylation sites, such as their correlation values or
methylation levels. It takes a data.frame as input which should have a column
called “meth_site” which gives the location of CpG sites as a character string
which can be coerced to a GRanges (e.g. “seqname:start”). We also provide the
name a numeric column which contains the values we want to plot, which is “cor”
here.
Values are displayed on the y-axis and the genomic location
on the x-axis. This location can be either the actual genomic location on the
relevant sequence or else the distance relative to the TSS, depending on
whether reference_tss
is set to TRUE or FALSE.
# Plot methylation-transcription correlation values for GSTP1 showing
# chromosome 11 position on the x-axis
meth_cor_plot_absolute_distance <- plotMethSiteCorCoefs(
meth_site_cor_values = gstp1_cpg_meth_transcript_cors,
reference_tss = FALSE,
ylabel = "Correlation Value",
title = "Correlation Between GSTP1 Transcription and CpG Methylation")
print(meth_cor_plot_absolute_distance)
# Plot methylation-transcription correlation values for GSTP1 showing
# distance to the GSTP1 on the x-axis
meth_cor_plot_relative_distance <- plotMethSiteCorCoefs(
meth_site_cor_values = gstp1_cpg_meth_transcript_cors,
reference_tss = TRUE,
ylabel = "Correlation Value",
title = "Correlation Between GSTP1 Transcription and CpG Methylation")
print(meth_cor_plot_relative_distance)
We see that there is a group of negatively correlated CpGs immediately surrounding the TSS and positively correlated CpGs further away, especially between 2,500 and 1,250 base pairs upstream.
The returned plots are ggplot objects and can be manipulated in any way that a regular ggplot object can.
# Add a dashed line to meth_cor_plot_relative_distance where the correlation is 0
meth_cor_plot_relative_distance + ggplot2::geom_hline(yintercept = 0, linetype = "dashed")
We’ll now demonstrate calculating the correlation values between transcript
expression and methylation of wider regions associated them instead of just
single CpG sites, e.g. promoters, gene bodies or enhancers. We use the
calculateRegionMethylationTranscriptCors
function for this, which takes a
RangedSummarizedExperiment with methylation values and a table with transcript
expression values, like calculateMethSiteTranscriptCors
. However,
calculateRegionMethylationTranscriptCors
takes a GRanges object with wider
regions which are associated with the transcripts. More than one region may be
associated with a given transcript, for example we could analyse the correlation
between transcript expression and methylation of its exons separately for each
exon.
Here we’ll investigate the relationship between gene body methylation and transcription for all transcripts on chromosome 11.
# Calculate gene body methylation-transcription correlations for all TSS on chromosome 11
system.time({transcript_body_meth_cors_mcrpc_samples <- calculateRegionMethylationTranscriptCors(
meth_rse = mcrpc_wgbs_hg38_chr11,
transcript_expression_table = mcrpc_transcript_counts_normalized,
samples_subset = common_mcprc_samples,
genomic_regions = transcript_annotation_chr11,
genomic_region_transcripts = transcript_annotation_chr11$ID,
region_methylation_summary_function = colMeans,
cor_method = "pearson", BPPARAM = BPPARAM)})
#> There are 5553 genes/transcripts in common between genomic_region_transcripts and transcript_expression_table
#> Using pearson correlation method
#> Calculating transcript correlations
#> user system elapsed
#> 98.956 12.768 37.497
# Show the top of the results table
head(transcript_body_meth_cors_mcrpc_samples)
#> genomic_region_name transcript_name cor p_val q_val
#> 1 ENST00000410108 ENST00000410108 0.04803169 0.6368465956 0.869748917
#> 2 ENST00000325147 ENST00000325147 0.07571628 0.4563504922 0.775170832
#> 3 ENST00000382762 ENST00000382762 -0.37077136 0.0001581844 0.004817179
#> 4 ENST00000529614 ENST00000529614 NA NA NA
#> 5 ENST00000332865 ENST00000332865 -0.01604665 0.8747368662 0.965361918
#> 6 ENST00000486280 ENST00000486280 -0.05726459 0.5734338611 0.842414046
# We'll filter for significant correlation values and plot their distribution
transcript_body_meth_cors_significant <-
dplyr::filter(transcript_body_meth_cors_mcrpc_samples, q_val < 0.05)
ggplot(transcript_body_meth_cors_significant, aes(x = cor)) +
geom_histogram() + theme_bw() +
scale_y_continuous(expand = expansion(mult = c(0, 0.2), add = 0)) +
labs(x = "Correlation Values", y = "Number of Significant Correlation Values")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Within minutes, we calculated all the correlation values for the methylation of CpG sites within 5KB of TSS on chromosome 11 and expression of the associated transcripts as well as the p-values for the correlations. We then analysed the relationship between mean gene body methylation and transcript expression, first calculating the mean methylation of CpGs overlapping gene bodies on chromosome 11 and then computing the correlation values, again in short time using just a few cores.
Thus, in under an hour we could easily analyse the correlations for all TSS in the genome on a desktop computer or even a laptop using just a few cores.
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.20-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] rtracklayer_1.66.0 AnnotationHub_3.14.0
#> [3] BiocFileCache_2.14.0 dbplyr_2.5.0
#> [5] rhdf5_2.50.1 TumourMethData_1.4.0
#> [7] DESeq2_1.46.0 methodical_1.2.0
#> [9] SummarizedExperiment_1.36.0 Biobase_2.66.0
#> [11] MatrixGenerics_1.18.0 matrixStats_1.4.1
#> [13] ggplot2_3.5.1 GenomicRanges_1.58.0
#> [15] GenomeInfoDb_1.42.1 IRanges_2.40.1
#> [17] S4Vectors_0.44.0 BiocGenerics_0.52.0
#> [19] BiocStyle_2.34.0
#>
#> loaded via a namespace (and not attached):
#> [1] DBI_1.2.3 bitops_1.0-9 rlang_1.1.4
#> [4] magrittr_2.0.3 compiler_4.4.2 RSQLite_2.3.9
#> [7] png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3
#> [10] crayon_1.5.3 fastmap_1.2.0 magick_2.8.5
#> [13] XVector_0.46.0 labeling_0.4.3 Rsamtools_2.22.0
#> [16] rmarkdown_2.29 UCSC.utils_1.2.0 tinytex_0.54
#> [19] purrr_1.0.2 bit_4.5.0.1 xfun_0.49
#> [22] zlibbioc_1.52.0 cachem_1.1.0 jsonlite_1.8.9
#> [25] blob_1.2.4 rhdf5filters_1.18.0 DelayedArray_0.32.0
#> [28] Rhdf5lib_1.28.0 BiocParallel_1.40.0 parallel_4.4.2
#> [31] R6_2.5.1 bslib_0.8.0 jquerylib_0.1.4
#> [34] iterators_1.0.14 Rcpp_1.0.13-1 bookdown_0.41
#> [37] knitr_1.49 R.utils_2.12.3 Matrix_1.7-1
#> [40] tidyselect_1.2.1 abind_1.4-8 yaml_2.3.10
#> [43] codetools_0.2-20 curl_6.0.1 lattice_0.22-6
#> [46] tibble_3.2.1 withr_3.0.2 KEGGREST_1.46.0
#> [49] evaluate_1.0.1 ExperimentHub_2.14.0 Biostrings_2.74.1
#> [52] pillar_1.10.0 BiocManager_1.30.25 filelock_1.0.3
#> [55] foreach_1.5.2 generics_0.1.3 RCurl_1.98-1.16
#> [58] BiocVersion_3.20.0 munsell_0.5.1 scales_1.3.0
#> [61] glue_1.8.0 tools_4.4.2 BiocIO_1.16.0
#> [64] data.table_1.16.4 GenomicAlignments_1.42.0 locfit_1.5-9.10
#> [67] XML_3.99-0.17 grid_4.4.2 AnnotationDbi_1.68.0
#> [70] colorspace_2.1-1 GenomeInfoDbData_1.2.13 HDF5Array_1.34.0
#> [73] restfulr_0.0.15 cli_3.6.3 rappdirs_0.3.3
#> [76] S4Arrays_1.6.0 dplyr_1.1.4 gtable_0.3.6
#> [79] R.methodsS3_1.8.2 sass_0.4.9 digest_0.6.37
#> [82] SparseArray_1.6.0 farver_2.1.2 rjson_0.2.23
#> [85] memoise_2.0.1 htmltools_0.5.8.1 R.oo_1.27.0
#> [88] lifecycle_1.0.4 httr_1.4.7 mime_0.12
#> [91] bit64_4.5.2