Checking gene signatures on CTL08

1. Environment Set Up

Values of RMarkdown parameters

for (i in 1:length(params))
  print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset  - Value: CTL08 - Class: character"
## [1] "Parameter: SE  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL08/Output/AfterSamplesSelection/MaleLineSE_Bio.rds - Class: character"
## [1] "Parameter: OutputFolder  - Value: ~/DataDir/bulkRNASeq/4.GeneSignatures/CTL08/Output/AfterSamplesSelection/ReceptorSig/ - Class: character"
## [1] "Parameter: CpmFilt  - Value: 2 - Class: integer"
## [1] "Parameter: SampleFilt  - Value: 4 - Class: integer"
library(RNASeqBulkExploratory)  #Our Package
library(DT)
library(gridExtra)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(SEtools)
## Loading required package: sechm
library(sechm)
library(randomcoloR)
library(ggplot2)
library(tidyHeatmap)
## ========================================
## tidyHeatmap version 1.8.1
## If you use tidyHeatmap in published research, please cite:
## 1) Mangiola et al. tidyHeatmap: an R package for modular heatmap production 
##   based on tidy principles. JOSS 2020.
## 2) Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(tidyHeatmap))
## ========================================
## 
## Attaching package: 'tidyHeatmap'
## The following object is masked from 'package:stats':
## 
##     heatmap
Dataset <- params$Dataset
OutputFolder <- params$OutputFolder

if (dir.exists(OutputFolder) == FALSE) {
  dir.create(OutputFolder, recursive=FALSE)
}

2. Data Upload

2.1 SE object

SE object created from exploratory analysis. Steps already performed: gene selection according to biotype.

SE_Bio <- readRDS(params$SE)

SE object containing information about 19900 genes in 51 samples.

if(! identical(names(assays(SE_Bio)$counts), rownames(colData(SE_Bio)))){
  stop('Inconsistencies in sample metadata for SE object')
  }

if(! identical(names(assays(SE_Bio)$counts), colData(SE_Bio)$HRID)){
  stop('Inconsistencies in sample metadata for SE object')
  }

2.2 Gene Lenght

GeneLength <- read.table(file = '~/DataDir/bulkRNASeq/1.GeneLength/Output/GenecodeV35GeneLength.txt', 
                         sep='\t', header=TRUE)
SelLenght <- dplyr::left_join(as.data.frame(rowData(SE_Bio))[,1:3], GeneLength, by=c('Gene'='Gene'))
if(identical(SelLenght$Gene, rowData(SE_Bio)$Gene)){
  rowData(SE_Bio)$GeneLength <- SelLenght$Length
  print('Gene Lengh information added to SE')
} else {
    print('Information on gene lenght is not compatible with SE object. Gene Lenght information not added to SE')
  }
## [1] "Gene Lengh information added to SE"

2.3 Gene Signatures

GS <- read.table(file = '~/DataDir/bulkRNASeq/4.GeneSignatures/GeneSets/ReceptorGenes.txt', 
                         sep='\t', header=TRUE)
names(GS)[2] <- 'Population'

table(GS$Population)
## 
##         AhHyd          Andr      Estrogen            GC          LivX 
##             1             1             7             1             2 
## Retinoic Acid       Thyroid 
##             7            11

3. Selecting and filtering

3.1 Sample Selection

Not necessary.

3.2 Gene Filtering

3.2.1 Setting gene filtering thresholds

CpmFilt <- params$CpmFilt
if(is.numeric(CpmFilt)==FALSE){
  stop('ERROR! Cpm threshold for gene filtering has a non-numeric value')
}

SampleFilt <- ifelse(params$SampleFilt=='Default', dim(SE_Bio)[2]/2, params$SampleFilt)
if(is.numeric(SampleFilt)==FALSE){
  stop('ERROR! Sample threshold for gene filtering has a non-numeric value')
}

Genes having an expression lower than 2 in at least 4 samples are filtered out.

3.2.2 Filter out low expressed genes

  • Compute temporary cpm (without TMM normalization, not stored in SE)
  • Filter out lowly-expressed genes
SE_Bio$lib.size
##        S32829_H12_DMSO      S32830_H14_Ret_Ag      S32831_H15_Ret_Ag 
##               35613673               59279739               70826142 
##    S32833_H27_Thyr_Inh    S32837_H35_LivX_Inh    S32838_H36_LivX_Inh 
##               67049867               56634424               68940854 
##     S32839_H38_LivX_Ag     S32840_H39_LivX_Ag     S32841_H41_Estr_Ag 
##               53964028               36461411               63518569 
##     S32842_H42_Estr_Ag     S32845_H51_Andr_Ag    S32846_H52_Andr_Inh 
##               42604865               45209919               85402393 
##    S32847_H53_Andr_Inh    S32848_H54_Andr_Inh      S32849_H60_GC_Inh 
##               64432453               30728121               76458966 
##   S32853_H68_AhHyd_Inh   S32854_H69_AhHyd_Inh     S32857_H22_Thyr_Ag 
##               79053007               28308197               51578374 
##    S32858_H25_Thyr_Inh         S32860_H9_DMSO     S32861_H17_Ret_Inh 
##               69682372               54069427               68212493 
##     S32862_H18_Ret_Inh      S32864_H13_Ret_Ag     S32865_H23_Thyr_Ag 
##               35022905               27114632               36213800 
##       S35870_MTR3_DMSO   S35872_MTR5_Thyr_Inh   S35875_MTR8_LivX_Inh 
##               33548410               48558405               56970827 
##    S35876_MTR9_LivX_Ag   S35879_MTR12_Andr_Ag    S35881_MTR14_GC_Inh 
##               52549588               40765101               55221358 
## S35882_MTR15_AhHyd_Inh          MTR16_S34_CTL          MTR17_S38_CTL 
##               52843891               29163508               35691834 
##          MTR19_S41_CTL          MTR20_S48_CTL          MTR21_S29_CTL 
##               38249382               23658993               38700856 
##         MTR22_S46_DMSO         MTR23_S28_DMSO         MTR24_S51_DMSO 
##               36339476               31756335               28625488 
##         MTR25_S43_DMSO         MTR26_S33_DMSO      MTR27_S53_Thyr_Ag 
##               32823794               29031507               36025706 
##      MTR28_S50_Thyr_Ag      MTR29_S30_Andr_Ag      MTR30_S42_Andr_Ag 
##               36156859               38661303               37547288 
##        MTR31_S47_GC_Ag        MTR32_S35_GC_Ag        MTR33_S37_GC_Ag 
##               39385341               31560063               23287199 
##     MTR37_S52_AhHyd_Ag     MTR38_S44_AhHyd_Ag     MTR39_S49_AhHyd_Ag 
##               33521688               34704497               32385629
SE_Filt <- filterSE(SE_Bio, cpmTh=CpmFilt, sampleTh=SampleFilt)
## UPDATED library.size slot
#SE_Filt$lib.size

3.2.3 Normalize

  • Re-calculate library size and calculate TMM
  • Calculate cpm and logcpm (with normalization) and store in SE_Filt
  • Calculate Fpkm and logfpkm
SE_Filt <- normTmmSE(SE_Filt, useNormFactors=TRUE, priorCount=0.25)  

assays(SE_Filt)$fpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength)
assays(SE_Filt)$logfpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength, 
                                  log=TRUE, prior.count=0.25)

After filtering out lowly-expressed genes, the dataset is structured in 14213 genes measured in 51 samples.


4. Heatmaps: pre-processing

4.1 Gene name as row names

rownames(SE_Filt) <- rowData(SE_Filt)$GeneName
SE_Filt <- SE_Filt[!duplicated(rownames(SE_Filt)), ] 

4.2 Calculate the fold-change

Calculate the fold-change compared to the controls (DMSO). This is done on logcpm.

DMSO <- as.vector(which(SE_Filt$Condition=='DMSO'))
SE_Filt <- SEtools::log2FC(SE_Filt, fromAssay='logcpm', controls=DMSO, isLog=TRUE, toAssay='log2FC')

4.4 Define the color annotation

colData(SE_Filt)$Condition <- factor(colData(SE_Filt)$Condition, levels=c("CTL", "DMSO", "AhHyd_Ag", "AhHyd_Inh", "Andr_Ag", "Andr_Inh", "Estr_Ag", "Estr_Inh", "GC_Ag", "GC_Inh", "LivX_Ag", "LivX_Inh", "Ret_Ag", "Ret_Inh", "Thyr_Ag", "Thyr_Inh" ))

metadata(SE_Filt)$anno_colors <- list(Condition = c('DMSO' = 'grey30', 'CTL' = 'azure3', 

                 'AhHyd_Ag'='#F8766D', 'AhHyd_Inh'='#F8766D50',

                 'Andr_Ag'='#fccb17', 'Andr_Inh'='#C49A0050',  

                 "Estr_Ag"= '#53B400', "Estr_Inh"= '#53B40050', 

                 'GC_Ag' = '#00C094', 'GC_Inh' = '#00C09450',

                 'LivX_Ag' = '#00B6EB', 'LivX_Inh' = '#00B6EB50', 

                 'Ret_Ag' = '#A58AFF', 'Ret_Inh' = '#A58AFF50', 

                 'Thyr_Ag' = '#FB61D7', 'Thyr_Inh' = '#FB61D750'

                 )
 , SeqRun = c("20210310" = "orange", "20210724"  = "green4", "20220422" = 
                                          "#66ACB7"))

5. Receptor genes

5.1 Gene signature

Genes <- GS %>% dplyr::pull(GeneName)

Genes[! Genes %in% row.names(SE_Filt)]
## [1] "DIO1"    "THRSP"   "GPER1"   "ESR1"    "ESR2"    "CYP19A1"

rowData(SE_Filt)$Signatures <- NA
for (i in which(rowData(SE_Filt)$GeneName %in% Genes)){
  gene <- row.names(SE_Filt)[i]
  rowData(SE_Filt)[i, 'Signatures'] <-GS[GS$GeneName==gene, "Population"]
}        

#color for signatures
metadata(SE_Filt)$anno_colors$Signatures <- c(c(AhHyd = "#F8766D", Andr = "#fccb17", Estrogen = "#53B400", GC = "#00C094", LivX = "#00B6EB", "Retinoic Acid" = "#A58AFF", Thyroid = "#FB61D7"))

#color for heatmaps
ScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')

The following genes are not selected as expressed: DIO1, THRSP, GPER1, ESR1, ESR2, CYP19A1.

5.2 Expression levels: logfpkm without scaling

break_coord <- assays(SE_Filt[row.names(SE_Filt) %in% Genes, ])$logfpkm %>% quantile(seq(0,1, by=0.1))
ExpCols <- viridisLite::cividis(length(seq(head(break_coord,1), tail(break_coord,1), by=0.2)))

p1 <- sechm::sechm(SE_Filt, features=Genes, assayName="logfpkm", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition", "SeqRun"), show_rownames = TRUE, left_annotation = c("Signatures"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ExpCols, breaks=seq(head(break_coord,1), tail(break_coord,1), by=0.2), column_title = "Logfpkm")
# breaks FALSE to avoid a symmetrical scale
p1

Saving to PDF file

5.3 Expression levels: scaled logfpkm

p2 <- sechm::sechm(SE_Filt, features=Genes, assayName="logfpkm", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition", "SeqRun"), show_rownames = TRUE, left_annotation = c("Signatures", "UpregulatedBy", "DownregulatedBy"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ScaledCols, do.scale=TRUE, breaks=0.8, column_title = "Scaled Logfpkm")
p2

Saving to PDF file

5.4 Expression levels: Log2FC versus DMSO

p3 <- sechm::sechm(SE_Filt, features=Genes, assayName="log2FC", gaps_at = "Condition", gaps_row="Signatures", top_annotation = c("Condition", "SeqRun"), show_rownames = TRUE, left_annotation = c("Signatures", "UpregulatedBy", "DownregulatedBy"), sortRowsOn=NULL, cluster_rows=FALSE, cluster_cols = FALSE,  show_colnames=TRUE, hmcols=ScaledCols, 
      do.scale=FALSE, breaks = 0.85, column_title = "LogFC versus DMSO")
p3

Saving to PDF file

6. Savings

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(OutputFolder, '/', Dataset, 'SigReceptor_Workspace.RData'))
Date
## [1] "Fri Jul 18 13:28:13 2025"
SessionInfo
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyHeatmap_1.8.1           ggplot2_3.4.1              
##  [3] randomcoloR_1.1.0.1         SEtools_1.12.0             
##  [5] sechm_1.6.0                 SummarizedExperiment_1.28.0
##  [7] Biobase_2.58.0              GenomicRanges_1.50.2       
##  [9] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [11] S4Vectors_0.36.1            BiocGenerics_0.44.0        
## [13] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [15] gridExtra_2.3               DT_0.27                    
## [17] RNASeqBulkExploratory_0.2.1
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.16             colorspace_2.1-0       rjson_0.2.21          
##   [4] circlize_0.4.15        XVector_0.38.0         GlobalOptions_0.1.2   
##   [7] clue_0.3-64            rstudioapi_0.14        bit64_4.0.5           
##  [10] AnnotationDbi_1.60.0   fansi_1.0.4            splines_4.2.1         
##  [13] codetools_0.2-19       doParallel_1.0.17      cachem_1.0.7          
##  [16] geneplotter_1.76.0     knitr_1.42             jsonlite_1.8.4        
##  [19] annotate_1.76.0        cluster_2.1.4          png_0.1-8             
##  [22] pheatmap_1.0.12        compiler_4.2.1         httr_1.4.5            
##  [25] Matrix_1.5-3           fastmap_1.1.1          limma_3.54.1          
##  [28] cli_3.6.1              htmltools_0.5.4        tools_4.2.1           
##  [31] gtable_0.3.1           glue_1.6.2             GenomeInfoDbData_1.2.9
##  [34] dplyr_1.1.0            V8_4.2.2               Rcpp_1.0.10           
##  [37] jquerylib_0.1.4        vctrs_0.6.2            Biostrings_2.66.0     
##  [40] nlme_3.1-162           iterators_1.0.14       xfun_0.37             
##  [43] stringr_1.5.0          openxlsx_4.2.5.2       lifecycle_1.0.3       
##  [46] dendextend_1.16.0      XML_3.99-0.13          ca_0.71.1             
##  [49] edgeR_3.40.2           zlibbioc_1.44.0        scales_1.2.1          
##  [52] TSP_1.2-2              parallel_4.2.1         RColorBrewer_1.1-3    
##  [55] ComplexHeatmap_2.14.0  yaml_2.3.7             curl_5.0.0            
##  [58] memoise_2.0.1          sass_0.4.5             stringi_1.7.12        
##  [61] RSQLite_2.3.0          highr_0.10             genefilter_1.80.3     
##  [64] foreach_1.5.2          seriation_1.4.1        zip_2.2.2             
##  [67] BiocParallel_1.32.5    shape_1.4.6            rlang_1.1.1           
##  [70] pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.20         
##  [73] lattice_0.20-45        purrr_1.0.1            patchwork_1.1.2       
##  [76] htmlwidgets_1.6.1      bit_4.0.5              tidyselect_1.2.0      
##  [79] magrittr_2.0.3         DESeq2_1.38.3          R6_2.5.1              
##  [82] generics_0.1.3         DelayedArray_0.24.0    DBI_1.1.3             
##  [85] withr_2.5.0            mgcv_1.8-41            pillar_1.8.1          
##  [88] survival_3.5-3         KEGGREST_1.38.0        RCurl_1.98-1.10       
##  [91] tibble_3.2.1           crayon_1.5.2           utf8_1.2.3            
##  [94] rmarkdown_2.20         viridis_0.6.2          GetoptLong_1.0.5      
##  [97] locfit_1.5-9.7         grid_4.2.1             sva_3.46.0            
## [100] data.table_1.14.8      blob_1.2.3             digest_0.6.31         
## [103] xtable_1.8-4           tidyr_1.3.0            munsell_0.5.0         
## [106] viridisLite_0.4.1      registry_0.5-1         bslib_0.4.2