EndPoints CTL04: Metabolism Gene Signatures

Checking gene signatures on CTL04

1. Environment Set Up

Values of RMarkdown parameters

for (i in 1:length(params))
  print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset  - Value: CTL04 - Class: character"
## [1] "Parameter: SE  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL04/Output/FemaleLineSE_Bio.rds - Class: character"
## [1] "Parameter: OutputFolder  - Value: ~/DataDir/bulkRNASeq/4.GeneSignatures/CTL04/Output/MetabolismSig/ - Class: character"
## [1] "Parameter: CpmFilt  - Value: 2 - Class: integer"
## [1] "Parameter: SampleFilt  - Value: 4 - Class: integer"

library(RNASeqBulkExploratory)  #Our Package
library(DT)
library(gridExtra)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(SEtools)
## Loading required package: sechm
library(sechm)
library(randomcoloR)
library(ggplot2)
library(tidyHeatmap)
## ========================================
## tidyHeatmap version 1.8.1
## If you use tidyHeatmap in published research, please cite:
## 1) Mangiola et al. tidyHeatmap: an R package for modular heatmap production 
##   based on tidy principles. JOSS 2020.
## 2) Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(tidyHeatmap))
## ========================================
## 
## Attaching package: 'tidyHeatmap'
## The following object is masked from 'package:stats':
## 
##     heatmap

Dataset <- params$Dataset
OutputFolder <- params$OutputFolder

if (dir.exists(OutputFolder) == FALSE) {
  dir.create(OutputFolder, recursive=FALSE)
}

2. Data Upload

2.1 SE object

SE object created from exploratory analysis. Steps already performed: gene selection according to biotype.

SE_Bio <- readRDS(params$SE)

SE object containing information about 19900 genes in 66 samples.

if(! identical(names(assays(SE_Bio)$counts), rownames(colData(SE_Bio)))){
  stop('Inconsistencies in sample metadata for SE object')
  }

if(! identical(names(assays(SE_Bio)$counts), colData(SE_Bio)$HRID)){
  stop('Inconsistencies in sample metadata for SE object')
  }

2.2 Gene Lenght

GeneLength <- read.table(file = '~/DataDir/bulkRNASeq/1.GeneLength/Output/GenecodeV35GeneLength.txt', 
                         sep='\t', header=TRUE)

SelLenght <- dplyr::left_join(as.data.frame(rowData(SE_Bio))[,1:3], GeneLength, by=c('Gene'='Gene'))

if(identical(SelLenght$Gene, rowData(SE_Bio)$Gene)){
  rowData(SE_Bio)$GeneLength <- SelLenght$Length
  print('Gene Lengh information added to SE')
} else {
    print('Information on gene lenght is not compatible with SE object. Gene Lenght information not added to SE')
  }
## [1] "Gene Lengh information added to SE"

2.3 Gene Signatures

GS <- read.table(file = '~/DataDir/bulkRNASeq/4.GeneSignatures/GeneSets/MetabolismGenes.txt', 
                 sep='\t', header=TRUE, row.names = NULL)
names(GS)[3] <- 'Population'

GS %>%  
  data.frame() %>%
  DT::datatable()

3. Selecting and filtering

3.1 Sample Selection

Not necessary.

3.2 Gene Filtering

3.2.1 Setting gene filtering thresholds

CpmFilt <- params$CpmFilt
if(is.numeric(CpmFilt)==FALSE){
  stop('ERROR! Cpm threshold for gene filtering has a non-numeric value')
}

SampleFilt <- ifelse(params$SampleFilt=='Default', dim(SE_Bio)[2]/2, params$SampleFilt)
if(is.numeric(SampleFilt)==FALSE){
  stop('ERROR! Sample threshold for gene filtering has a non-numeric value')
}

Genes having an expression lower than 2 in at least 4 samples are filtered out.

3.2.2 Filter out low expressed genes

Compute temporary cpm (without TMM normalization, not stored in SE)
Filter out lowly-expressed genes

SE_Bio$lib.size
##        MTR_044_S1_CTL        MTR_045_S2_CTL        MTR_046_S3_CTL 
##              35896092              31708728              30084298 
##        MTR_047_S4_CTL      MTR_048_S65_DMSO       MTR_049_S5_DMSO 
##              32298211              43316736              33355586 
##       MTR_050_S6_DMSO      MTR_051_S66_DMSO   MTR_052_S7_AhHyd_Ag 
##              30026932              36026174              34905456 
##   MTR_053_S8_AhHyd_Ag   MTR_054_S9_AhHyd_Ag  MTR_055_S10_AhHyd_Ag 
##              39115824              35254183              36508256 
## MTR_056_S11_AhHyd_Inh MTR_057_S12_AhHyd_Inh MTR_058_S13_AhHyd_Inh 
##              32446046              39181238              34663693 
## MTR_059_S14_AhHyd_Inh    MTR_060_S15_Ret_Ag    MTR_061_S16_Ret_Ag 
##              38124739              45105841              37421078 
##    MTR_062_S17_Ret_Ag    MTR_063_S18_Ret_Ag   MTR_064_S19_Ret_Inh 
##              32185720              35203919              35563706 
##   MTR_065_S20_Ret_Inh   MTR_066_S21_Ret_Inh   MTR_067_S22_Ret_Inh 
##              42993502              41486813              41593609 
##   MTR_068_S23_Andr_Ag   MTR_069_S24_Andr_Ag   MTR_070_S25_Andr_Ag 
##              32815429              40906632              31208496 
##   MTR_071_S26_Andr_Ag  MTR_072_S27_Andr_Inh  MTR_073_S28_Andr_Inh 
##              40030676              33206709              36578304 
##  MTR_074_S29_Andr_Inh  MTR_075_S30_Andr_Inh   MTR_076_S31_LivX_Ag 
##              33775034              30169516              40306389 
##   MTR_077_S32_LivX_Ag   MTR_078_S33_LivX_Ag   MTR_079_S34_LivX_Ag 
##              55537385              38873337              44661327 
##  MTR_080_S35_LivX_Inh  MTR_081_S36_LivX_Inh  MTR_082_S37_LivX_Inh 
##              36255300              43640070              33774429 
##  MTR_083_S38_LivX_Inh     MTR_084_S39_GC_Ag     MTR_085_S40_GC_Ag 
##              31448744              49615558              39058430 
##     MTR_086_S41_GC_Ag     MTR_087_S42_GC_Ag    MTR_088_S43_GC_Inh 
##              40949271              45343061              37632428 
##    MTR_089_S44_GC_Inh    MTR_090_S45_GC_Inh    MTR_091_S46_GC_Inh 
##              41495325              46713448              45482531 
##   MTR_092_S47_Estr_Ag   MTR_093_S48_Estr_Ag   MTR_094_S49_Estr_Ag 
##              33976888              36201983              39420602 
##   MTR_095_S50_Estr_Ag  MTR_096_S51_Estr_Inh  MTR_097_S52_Estr_Inh 
##              34215705              32466391              34977751 
##  MTR_098_S53_Estr_Inh  MTR_099_S54_Estr_Inh   MTR_100_S55_Thyr_Ag 
##              32309780              42816826              44343983 
##   MTR_101_S56_Thyr_Ag   MTR_102_S57_Thyr_Ag   MTR_103_S58_Thyr_Ag 
##              44983604              47883046              32958622 
##  MTR_104_S59_Thyr_Inh  MTR_105_S60_Thyr_Inh  MTR_106_S61_Thyr_Inh 
##              33828268              36810075              35816266 
##  MTR_107_S62_Thyr_Inh      MTR_108_S63_DMSO      MTR_109_S64_DMSO 
##              39757302              45479386              40315145
SE_Filt <- filterSE(SE_Bio, cpmTh=CpmFilt, sampleTh=SampleFilt)
## UPDATED library.size slot

3.2.3 Normalize

Re-calculate library size and calculate TMM
Calculate cpm and logcpm (with normalization) and store in SE_Filt
Calculate Fpkm and logfpkm

SE_Filt <- normTmmSE(SE_Filt, useNormFactors=TRUE, priorCount=0.25)  

assays(SE_Filt)$fpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength)
assays(SE_Filt)$logfpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength, 
                                  log=TRUE, prior.count=0.25)

After filtering out lowly-expressed genes, the dataset is structured in 13532 genes measured in 66 samples.

4. Heatmaps: pre-processing

4.1 Gene name as row names

rownames(SE_Filt) <- rowData(SE_Filt)$GeneName
SE_Filt <- SE_Filt[!duplicated(rownames(SE_Filt)), ]

4.2 Calculate the fold-change

Calculate the fold-change compared to the controls (DMSO). This is done on logcpm.

DMSO <- as.vector(which(SE_Filt$Condition=='DMSO'))
SE_Filt <- SEtools::log2FC(SE_Filt, fromAssay='logcpm', controls=DMSO, isLog=TRUE, toAssay='log2FC')

4.3 Define the color annotation

colData(SE_Filt)$Condition <- factor(colData(SE_Filt)$Condition, levels=c("CTL", "DMSO", "AhHyd_Ag", "AhHyd_Inh", "Andr_Ag", "Andr_Inh", "Estr_Ag", "Estr_Inh", "GC_Ag", "GC_Inh", "LivX_Ag", "LivX_Inh", "Ret_Ag", "Ret_Inh", "Thyr_Ag", "Thyr_Inh" ))

metadata(SE_Filt)$anno_colors <- list(Condition = c('DMSO' = 'grey30', 'CTL' = 'azure3', 

                 'AhHyd_Ag'='#F8766D', 'AhHyd_Inh'='#F8766D50',

                 'Andr_Ag'='#fccb17', 'Andr_Inh'='#C49A0050',  

                 "Estr_Ag"= '#53B400', "Estr_Inh"= '#53B40050', 

                 'GC_Ag' = '#00C094', 'GC_Inh' = '#00C09450',

                 'LivX_Ag' = '#00B6EB', 'LivX_Inh' = '#00B6EB50', 

                 'Ret_Ag' = '#A58AFF', 'Ret_Inh' = '#A58AFF50', 

                 'Thyr_Ag' = '#FB61D7', 'Thyr_Inh' = '#FB61D750'

                 )
 )

5. Heatmaps

5.1 Gene signature

Genes <- GS %>% dplyr::pull(GeneName)

Genes[! Genes %in% row.names(SE_Filt)]
##  [1] "CYP11A1" "CYP17A1" "CYP11B1" "CYP11B2" "CYP21A1" "HSD3B2"  "AKR1C4" 
##  [8] "HSD11B1" "HSD11B2" "HSD17B3" "HSD3B1"  "AKR1C4"  "SRD5A2"  "CYP19A1"
## [15] "HMGCS2"  "IDI2"

rowData(SE_Filt)$Signatures <- NA
for (i in which(rowData(SE_Filt)$GeneName %in% Genes)){
  gene <- row.names(SE_Filt)[i]
  rowData(SE_Filt)[i, 'Signatures'] <-GS[GS$GeneName==gene, "Population"]
}

#color for signatures
metadata(SE_Filt)$anno_colors$Signatures <- c("Steroid biogenesis" = "#e6194b", "Cholesterol synthesis" = "#4363d8")

#color for heatmaps
ScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')

The following genes are not selected as expressed: CYP11A1, CYP17A1, CYP11B1, CYP11B2, CYP21A1, HSD3B2, AKR1C4, HSD11B1, HSD11B2, HSD17B3, HSD3B1, AKR1C4, SRD5A2, CYP19A1, HMGCS2, IDI2.

5.2 Expression levels: logfpkm without scaling

break_coord <- assays(SE_Filt[row.names(SE_Filt) %in% Genes, ])$logfpkm %>% quantile(seq(0,1, by=0.1))
ExpCols <- viridisLite::cividis(length(seq(head(break_coord,1), tail(break_coord,1), by=0.2)))

p1 <- sechm::sechm(SE_Filt, features=Genes, assayName="logfpkm", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition"), show_rownames = TRUE, left_annotation = c("Signatures"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ExpCols, breaks=seq(head(break_coord,1), tail(break_coord,1), by=0.2), column_title = "Logfpkm")
# breaks FALSE to avoid a symmetrical scale
p1

Saving to PDF file

5.3 Expression levels: scaled logfpkm

p2 <- sechm::sechm(SE_Filt, features=Genes, assayName="logfpkm", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition"), show_rownames = TRUE, left_annotation = c("Signatures", "UpregulatedBy", "DownregulatedBy"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ScaledCols, do.scale=TRUE, breaks=0.8, column_title = "Scaled Logfpkm", )
p2

Saving to PDF file

5.4 Expression levels: Log2FC versus DMSO

p3 <- sechm::sechm(SE_Filt, features=Genes, assayName="log2FC", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition"), show_rownames = TRUE, left_annotation = c("Signatures", "UpregulatedBy", "DownregulatedBy"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ScaledCols, 
      do.scale=FALSE, breaks = 0.85, column_title = "LogFC versus DMSO")
p3

Saving to PDF file

6. Savings

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(OutputFolder, '/', Dataset, 'SigMetab_Workspace.RData'))

Date

## [1] "Fri Jul 18 13:19:03 2025"

SessionInfo

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyHeatmap_1.8.1           ggplot2_3.4.1              
##  [3] randomcoloR_1.1.0.1         SEtools_1.12.0             
##  [5] sechm_1.6.0                 SummarizedExperiment_1.28.0
##  [7] Biobase_2.58.0              GenomicRanges_1.50.2       
##  [9] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [11] S4Vectors_0.36.1            BiocGenerics_0.44.0        
## [13] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [15] gridExtra_2.3               DT_0.27                    
## [17] RNASeqBulkExploratory_0.2.1
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.16             colorspace_2.1-0       rjson_0.2.21          
##   [4] ellipsis_0.3.2         circlize_0.4.15        XVector_0.38.0        
##   [7] GlobalOptions_0.1.2    clue_0.3-64            rstudioapi_0.14       
##  [10] bit64_4.0.5            AnnotationDbi_1.60.0   fansi_1.0.4           
##  [13] splines_4.2.1          codetools_0.2-19       doParallel_1.0.17     
##  [16] cachem_1.0.7           geneplotter_1.76.0     knitr_1.42            
##  [19] jsonlite_1.8.4         annotate_1.76.0        cluster_2.1.4         
##  [22] png_0.1-8              pheatmap_1.0.12        compiler_4.2.1        
##  [25] httr_1.4.5             Matrix_1.5-3           fastmap_1.1.1         
##  [28] limma_3.54.1           cli_3.6.1              htmltools_0.5.4       
##  [31] tools_4.2.1            gtable_0.3.1           glue_1.6.2            
##  [34] GenomeInfoDbData_1.2.9 dplyr_1.1.0            V8_4.2.2              
##  [37] Rcpp_1.0.10            jquerylib_0.1.4        vctrs_0.6.2           
##  [40] Biostrings_2.66.0      nlme_3.1-162           crosstalk_1.2.0       
##  [43] iterators_1.0.14       xfun_0.37              stringr_1.5.0         
##  [46] openxlsx_4.2.5.2       lifecycle_1.0.3        dendextend_1.16.0     
##  [49] XML_3.99-0.13          ca_0.71.1              edgeR_3.40.2          
##  [52] zlibbioc_1.44.0        scales_1.2.1           TSP_1.2-2             
##  [55] parallel_4.2.1         RColorBrewer_1.1-3     ComplexHeatmap_2.14.0 
##  [58] yaml_2.3.7             curl_5.0.0             memoise_2.0.1         
##  [61] sass_0.4.5             stringi_1.7.12         RSQLite_2.3.0         
##  [64] highr_0.10             genefilter_1.80.3      foreach_1.5.2         
##  [67] seriation_1.4.1        zip_2.2.2              BiocParallel_1.32.5   
##  [70] shape_1.4.6            rlang_1.1.1            pkgconfig_2.0.3       
##  [73] bitops_1.0-7           evaluate_0.20          lattice_0.20-45       
##  [76] purrr_1.0.1            patchwork_1.1.2        htmlwidgets_1.6.1     
##  [79] bit_4.0.5              tidyselect_1.2.0       magrittr_2.0.3        
##  [82] DESeq2_1.38.3          R6_2.5.1               generics_0.1.3        
##  [85] DelayedArray_0.24.0    DBI_1.1.3              withr_2.5.0           
##  [88] mgcv_1.8-41            pillar_1.8.1           survival_3.5-3        
##  [91] KEGGREST_1.38.0        RCurl_1.98-1.10        tibble_3.2.1          
##  [94] crayon_1.5.2           utf8_1.2.3             rmarkdown_2.20        
##  [97] viridis_0.6.2          GetoptLong_1.0.5       locfit_1.5-9.7        
## [100] grid_4.2.1             sva_3.46.0             data.table_1.14.8     
## [103] blob_1.2.3             digest_0.6.31          xtable_1.8-4          
## [106] tidyr_1.3.0            munsell_0.5.0          viridisLite_0.4.1     
## [109] registry_0.5-1         bslib_0.4.2

EndPoints CTL04: Metabolism Gene Signatures

Date: July 18, 2025

Checking gene signatures on CTL04

1. Environment Set Up

2. Data Upload

2.1 SE object

2.2 Gene Lenght

2.3 Gene Signatures

3. Selecting and filtering

3.1 Sample Selection

3.2 Gene Filtering

3.2.1 Setting gene filtering thresholds

3.2.2 Filter out low expressed genes

3.2.3 Normalize

4. Heatmaps: pre-processing

4.1 Gene name as row names

4.2 Calculate the fold-change

4.3 Define the color annotation

5. Heatmaps

5.1 Gene signature

5.2 Expression levels: logfpkm without scaling

5.3 Expression levels: scaled logfpkm

5.4 Expression levels: Log2FC versus DMSO

6. Savings