EndPoints CTL08: Metabolism Gene Signatures

Checking gene signatures on CTL08

1. Environment Set Up

Values of RMarkdown parameters

for (i in 1:length(params))
  print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset  - Value: CTL08 - Class: character"
## [1] "Parameter: SE  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL08/Output/AfterSamplesSelection/MaleLineSE_Bio.rds - Class: character"
## [1] "Parameter: OutputFolder  - Value: ~/DataDir/bulkRNASeq/4.GeneSignatures/CTL08/Output/AfterSamplesSelection/MetabolismSig/ - Class: character"
## [1] "Parameter: CpmFilt  - Value: 2 - Class: integer"
## [1] "Parameter: SampleFilt  - Value: 4 - Class: integer"

library(RNASeqBulkExploratory)  #Our Package
library(DT)
library(gridExtra)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(SEtools)
## Loading required package: sechm
library(sechm)
library(randomcoloR)
library(ggplot2)
library(tidyHeatmap)
## ========================================
## tidyHeatmap version 1.8.1
## If you use tidyHeatmap in published research, please cite:
## 1) Mangiola et al. tidyHeatmap: an R package for modular heatmap production 
##   based on tidy principles. JOSS 2020.
## 2) Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(tidyHeatmap))
## ========================================
## 
## Attaching package: 'tidyHeatmap'
## The following object is masked from 'package:stats':
## 
##     heatmap

Dataset <- params$Dataset
OutputFolder <- params$OutputFolder

if (dir.exists(OutputFolder) == FALSE) {
  dir.create(OutputFolder, recursive=FALSE)
}

2. Data Upload

2.1 SE object

SE object created from exploratory analysis. Steps already performed: gene selection according to biotype.

SE_Bio <- readRDS(params$SE)

SE object containing information about 19900 genes in 51 samples.

if(! identical(names(assays(SE_Bio)$counts), rownames(colData(SE_Bio)))){
  stop('Inconsistencies in sample metadata for SE object')
  }

if(! identical(names(assays(SE_Bio)$counts), colData(SE_Bio)$HRID)){
  stop('Inconsistencies in sample metadata for SE object')
  }

2.2 Gene Lenght

GeneLength <- read.table(file = '~/DataDir/bulkRNASeq/1.GeneLength/Output/GenecodeV35GeneLength.txt', 
                         sep='\t', header=TRUE)

SelLenght <- dplyr::left_join(as.data.frame(rowData(SE_Bio))[,1:3], GeneLength, by=c('Gene'='Gene'))

if(identical(SelLenght$Gene, rowData(SE_Bio)$Gene)){
  rowData(SE_Bio)$GeneLength <- SelLenght$Length
  print('Gene Lengh information added to SE')
} else {
    print('Information on gene lenght is not compatible with SE object. Gene Lenght information not added to SE')
  }
## [1] "Gene Lengh information added to SE"

2.3 Gene Signatures

GS <- read.table(file = '~/DataDir/bulkRNASeq/4.GeneSignatures/GeneSets/MetabolismGenes.txt', 
                 sep='\t', header=TRUE, row.names = NULL)
names(GS)[3] <- 'Population'

GS %>%  
  data.frame() %>%
  DT::datatable()

3. Selecting and filtering

3.1 Sample Selection

Already performed.

3.2 Gene Filtering

3.2.1 Setting gene filtering thresholds

CpmFilt <- params$CpmFilt
if(is.numeric(CpmFilt)==FALSE){
  stop('ERROR! Cpm threshold for gene filtering has a non-numeric value')
}

SampleFilt <- ifelse(params$SampleFilt=='Default', dim(SE_Bio)[2]/2, params$SampleFilt)
if(is.numeric(SampleFilt)==FALSE){
  stop('ERROR! Sample threshold for gene filtering has a non-numeric value')
}

Genes having an expression lower than 2 in at least 4 samples are filtered out.

3.2.2 Filter out low expressed genes

Compute temporary cpm (without TMM normalization, not stored in SE)
Filter out lowly-expressed genes

SE_Bio$lib.size
##        S32829_H12_DMSO      S32830_H14_Ret_Ag      S32831_H15_Ret_Ag 
##               35613673               59279739               70826142 
##    S32833_H27_Thyr_Inh    S32837_H35_LivX_Inh    S32838_H36_LivX_Inh 
##               67049867               56634424               68940854 
##     S32839_H38_LivX_Ag     S32840_H39_LivX_Ag     S32841_H41_Estr_Ag 
##               53964028               36461411               63518569 
##     S32842_H42_Estr_Ag     S32845_H51_Andr_Ag    S32846_H52_Andr_Inh 
##               42604865               45209919               85402393 
##    S32847_H53_Andr_Inh    S32848_H54_Andr_Inh      S32849_H60_GC_Inh 
##               64432453               30728121               76458966 
##   S32853_H68_AhHyd_Inh   S32854_H69_AhHyd_Inh     S32857_H22_Thyr_Ag 
##               79053007               28308197               51578374 
##    S32858_H25_Thyr_Inh         S32860_H9_DMSO     S32861_H17_Ret_Inh 
##               69682372               54069427               68212493 
##     S32862_H18_Ret_Inh      S32864_H13_Ret_Ag     S32865_H23_Thyr_Ag 
##               35022905               27114632               36213800 
##       S35870_MTR3_DMSO   S35872_MTR5_Thyr_Inh   S35875_MTR8_LivX_Inh 
##               33548410               48558405               56970827 
##    S35876_MTR9_LivX_Ag   S35879_MTR12_Andr_Ag    S35881_MTR14_GC_Inh 
##               52549588               40765101               55221358 
## S35882_MTR15_AhHyd_Inh          MTR16_S34_CTL          MTR17_S38_CTL 
##               52843891               29163508               35691834 
##          MTR19_S41_CTL          MTR20_S48_CTL          MTR21_S29_CTL 
##               38249382               23658993               38700856 
##         MTR22_S46_DMSO         MTR23_S28_DMSO         MTR24_S51_DMSO 
##               36339476               31756335               28625488 
##         MTR25_S43_DMSO         MTR26_S33_DMSO      MTR27_S53_Thyr_Ag 
##               32823794               29031507               36025706 
##      MTR28_S50_Thyr_Ag      MTR29_S30_Andr_Ag      MTR30_S42_Andr_Ag 
##               36156859               38661303               37547288 
##        MTR31_S47_GC_Ag        MTR32_S35_GC_Ag        MTR33_S37_GC_Ag 
##               39385341               31560063               23287199 
##     MTR37_S52_AhHyd_Ag     MTR38_S44_AhHyd_Ag     MTR39_S49_AhHyd_Ag 
##               33521688               34704497               32385629
SE_Filt <- filterSE(SE_Bio, cpmTh=CpmFilt, sampleTh=SampleFilt)
## UPDATED library.size slot

3.2.3 Normalize

Re-calculate library size and calculate TMM
Calculate cpm and logcpm (with normalization) and store in SE_Filt
Calculate Fpkm and logfpkm

SE_Filt <- normTmmSE(SE_Filt, useNormFactors=TRUE, priorCount=0.25)  

assays(SE_Filt)$fpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength)
assays(SE_Filt)$logfpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength, 
                                  log=TRUE, prior.count=0.25)

After filtering out lowly-expressed genes, the dataset is structured in 14213 genes measured in 51 samples.

4. Heatmaps: pre-processing

4.1 Gene name as row names

rownames(SE_Filt) <- rowData(SE_Filt)$GeneName
SE_Filt <- SE_Filt[!duplicated(rownames(SE_Filt)), ]

4.2 Calculate the fold-change

Calculate the fold-change compared to the controls (DMSO). This is done on logcpm.

DMSO <- as.vector(which(SE_Filt$Condition=='DMSO'))
SE_Filt <- SEtools::log2FC(SE_Filt, fromAssay='logcpm', controls=DMSO, isLog=TRUE, toAssay='log2FC')

4.3 Define the color annotation

colData(SE_Filt)$Condition <- factor(colData(SE_Filt)$Condition, levels=c("CTL", "DMSO", "AhHyd_Ag", "AhHyd_Inh", "Andr_Ag", "Andr_Inh", "Estr_Ag", "Estr_Inh", "GC_Ag", "GC_Inh", "LivX_Ag", "LivX_Inh", "Ret_Ag", "Ret_Inh", "Thyr_Ag", "Thyr_Inh" ))

metadata(SE_Filt)$anno_colors <- list(Condition = c('DMSO' = 'grey30', 'CTL' = 'azure3', 

                 'AhHyd_Ag'='#F8766D', 'AhHyd_Inh'='#F8766D50',

                 'Andr_Ag'='#fccb17', 'Andr_Inh'='#C49A0050',  

                 "Estr_Ag"= '#53B400', "Estr_Inh"= '#53B40050', 

                 'GC_Ag' = '#00C094', 'GC_Inh' = '#00C09450',

                 'LivX_Ag' = '#00B6EB', 'LivX_Inh' = '#00B6EB50', 

                 'Ret_Ag' = '#A58AFF', 'Ret_Inh' = '#A58AFF50', 

                 'Thyr_Ag' = '#FB61D7', 'Thyr_Inh' = '#FB61D750'

                 )
 , SeqRun = c("20210310" = "orange", "20210724"  = "green4", "20220422" = 
                                          "#66ACB7"))

5. Heatmaps

5.1 Gene signature

Genes <- GS %>% dplyr::pull(GeneName)

Genes[! Genes %in% row.names(SE_Filt)]
##  [1] "CYP11A1" "CYP17A1" "CYP11B1" "CYP11B2" "CYP21A1" "HSD3B2"  "AKR1C4" 
##  [8] "HSD11B1" "HSD17B3" "HSD3B1"  "AKR1C3"  "AKR1C4"  "SRD5A2"  "STAR"   
## [15] "CYP19A1" "IDI2"

rowData(SE_Filt)$Signatures <- NA
for (i in which(rowData(SE_Filt)$GeneName %in% Genes)){
  gene <- row.names(SE_Filt)[i]
  rowData(SE_Filt)[i, 'Signatures'] <-GS[GS$GeneName==gene, "Population"]
}

#color for signatures
metadata(SE_Filt)$anno_colors$Signatures <- c("Steroid biogenesis" = "#e6194b", "Cholesterol synthesis" = "#4363d8")

#color for heatmaps
ScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')

The following genes are not selected as expressed: CYP11A1, CYP17A1, CYP11B1, CYP11B2, CYP21A1, HSD3B2, AKR1C4, HSD11B1, HSD17B3, HSD3B1, AKR1C3, AKR1C4, SRD5A2, STAR, CYP19A1, IDI2.

5.2 Expression levels: logfpkm without scaling

break_coord <- assays(SE_Filt[row.names(SE_Filt) %in% Genes, ])$logfpkm %>% quantile(seq(0,1, by=0.1))
ExpCols <- viridisLite::cividis(length(seq(head(break_coord,1), tail(break_coord,1), by=0.2)))

p1 <- sechm::sechm(SE_Filt, features=Genes, assayName="logfpkm", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition", "SeqRun"), show_rownames = TRUE, left_annotation = c("Signatures"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ExpCols, breaks=seq(head(break_coord,1), tail(break_coord,1), by=0.2), column_title = "Logfpkm")
# breaks FALSE to avoid a symmetrical scale
p1

Saving to PDF file

5.3 Expression levels: scaled logfpkm

p2 <- sechm::sechm(SE_Filt, features=Genes, assayName="logfpkm", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition", "SeqRun"), show_rownames = TRUE, left_annotation = c("Signatures", "UpregulatedBy", "DownregulatedBy"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ScaledCols, do.scale=TRUE, breaks=0.8, column_title = "Scaled Logfpkm", )
p2

Saving to PDF file

5.4 Expression levels: Log2FC versus DMSO

p3 <- sechm::sechm(SE_Filt, features=Genes, assayName="log2FC", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition", "SeqRun"), show_rownames = TRUE, left_annotation = c("Signatures", "UpregulatedBy", "DownregulatedBy"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ScaledCols, 
      do.scale=FALSE, breaks = 0.85, column_title = "LogFC versus DMSO")
p3

Saving to PDF file

6. Savings

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(OutputFolder, '/', Dataset, 'SigMetab_Workspace.RData'))

Date

## [1] "Fri Jul 18 13:27:31 2025"

SessionInfo

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyHeatmap_1.8.1           ggplot2_3.4.1              
##  [3] randomcoloR_1.1.0.1         SEtools_1.12.0             
##  [5] sechm_1.6.0                 SummarizedExperiment_1.28.0
##  [7] Biobase_2.58.0              GenomicRanges_1.50.2       
##  [9] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [11] S4Vectors_0.36.1            BiocGenerics_0.44.0        
## [13] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [15] gridExtra_2.3               DT_0.27                    
## [17] RNASeqBulkExploratory_0.2.1
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.16             colorspace_2.1-0       rjson_0.2.21          
##   [4] ellipsis_0.3.2         circlize_0.4.15        XVector_0.38.0        
##   [7] GlobalOptions_0.1.2    clue_0.3-64            rstudioapi_0.14       
##  [10] bit64_4.0.5            AnnotationDbi_1.60.0   fansi_1.0.4           
##  [13] splines_4.2.1          codetools_0.2-19       doParallel_1.0.17     
##  [16] cachem_1.0.7           geneplotter_1.76.0     knitr_1.42            
##  [19] jsonlite_1.8.4         annotate_1.76.0        cluster_2.1.4         
##  [22] png_0.1-8              pheatmap_1.0.12        compiler_4.2.1        
##  [25] httr_1.4.5             Matrix_1.5-3           fastmap_1.1.1         
##  [28] limma_3.54.1           cli_3.6.1              htmltools_0.5.4       
##  [31] tools_4.2.1            gtable_0.3.1           glue_1.6.2            
##  [34] GenomeInfoDbData_1.2.9 dplyr_1.1.0            V8_4.2.2              
##  [37] Rcpp_1.0.10            jquerylib_0.1.4        vctrs_0.6.2           
##  [40] Biostrings_2.66.0      nlme_3.1-162           crosstalk_1.2.0       
##  [43] iterators_1.0.14       xfun_0.37              stringr_1.5.0         
##  [46] openxlsx_4.2.5.2       lifecycle_1.0.3        dendextend_1.16.0     
##  [49] XML_3.99-0.13          ca_0.71.1              edgeR_3.40.2          
##  [52] zlibbioc_1.44.0        scales_1.2.1           TSP_1.2-2             
##  [55] parallel_4.2.1         RColorBrewer_1.1-3     ComplexHeatmap_2.14.0 
##  [58] yaml_2.3.7             curl_5.0.0             memoise_2.0.1         
##  [61] sass_0.4.5             stringi_1.7.12         RSQLite_2.3.0         
##  [64] highr_0.10             genefilter_1.80.3      foreach_1.5.2         
##  [67] seriation_1.4.1        zip_2.2.2              BiocParallel_1.32.5   
##  [70] shape_1.4.6            rlang_1.1.1            pkgconfig_2.0.3       
##  [73] bitops_1.0-7           evaluate_0.20          lattice_0.20-45       
##  [76] purrr_1.0.1            patchwork_1.1.2        htmlwidgets_1.6.1     
##  [79] bit_4.0.5              tidyselect_1.2.0       magrittr_2.0.3        
##  [82] DESeq2_1.38.3          R6_2.5.1               generics_0.1.3        
##  [85] DelayedArray_0.24.0    DBI_1.1.3              withr_2.5.0           
##  [88] mgcv_1.8-41            pillar_1.8.1           survival_3.5-3        
##  [91] KEGGREST_1.38.0        RCurl_1.98-1.10        tibble_3.2.1          
##  [94] crayon_1.5.2           utf8_1.2.3             rmarkdown_2.20        
##  [97] viridis_0.6.2          GetoptLong_1.0.5       locfit_1.5-9.7        
## [100] grid_4.2.1             sva_3.46.0             data.table_1.14.8     
## [103] blob_1.2.3             digest_0.6.31          xtable_1.8-4          
## [106] tidyr_1.3.0            munsell_0.5.0          viridisLite_0.4.1     
## [109] registry_0.5-1         bslib_0.4.2

EndPoints CTL08: Metabolism Gene Signatures

Date: July 18, 2025

Checking gene signatures on CTL08

1. Environment Set Up

2. Data Upload

2.1 SE object

2.2 Gene Lenght

2.3 Gene Signatures

3. Selecting and filtering

3.1 Sample Selection

3.2 Gene Filtering

3.2.1 Setting gene filtering thresholds

3.2.2 Filter out low expressed genes

3.2.3 Normalize

4. Heatmaps: pre-processing

4.1 Gene name as row names

4.2 Calculate the fold-change

4.3 Define the color annotation

5. Heatmaps

5.1 Gene signature

5.2 Expression levels: logfpkm without scaling

5.3 Expression levels: scaled logfpkm

5.4 Expression levels: Log2FC versus DMSO

6. Savings