Exploratory analysis on FemaleLine

1. Environment Set Up

knitr::opts_chunk$set(echo = TRUE, warning=FALSE)

Values of RMarkdown parameters

for (i in 1:length(params))
  print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset  - Value: FemaleLine - Class: character"
## [1] "Parameter: CountFile  - Value: ~/DataDir/bulkRNASeq/2.Alignment/SalmonQuant/SeqRun2022.10.18_27_OK/Pipe/SyncOutput/EndPoints_2022.10.18_27_NEW_Counts.txt - Class: character"
## [1] "Parameter: DesignFile  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL04/Input/SeqRun2022.10.18_27SampleSheet.tsv - Class: character"
## [1] "Parameter: GeneAnnotationFile  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/GeneAnnotation/AnnotationEns101.txt - Class: character"
## [1] "Parameter: OutputFolder  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL04/Output/ - Class: character"
## [1] "Parameter: CpmFilt  - Value: 2 - Class: integer"
## [1] "Parameter: SampleFilt  - Value: 4 - Class: integer"
library(RNASeqBulkExploratory)  #Our Package
library(DT)
library(gridExtra)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(ggplot2)
Dataset <- params$Dataset
OutputFolder <- params$OutputFolder

if (dir.exists(OutputFolder) == FALSE) {
  dir.create(OutputFolder, recursive=TRUE)
}

2. Data Upload

  • Raw data produced by Salmon alignment
  • Experimental design: metadata for the samples belonging to the dataset
  • Gene annotation: metadata for the genes quantified in the dataset. Produced by AnnotationBiomart markdown

2.1 Read Count Matrix

RawCounts <- read.table(params$CountFile, sep='\t', header=TRUE)
colSums(RawCounts[,-1])
##        MTR_044_S1_CTL        MTR_045_S2_CTL        MTR_046_S3_CTL 
##              36971017              32622638              30941822 
##        MTR_047_S4_CTL      MTR_048_S65_DMSO       MTR_049_S5_DMSO 
##              33271330              44654465              34403338 
##       MTR_050_S6_DMSO      MTR_051_S66_DMSO   MTR_052_S7_AhHyd_Ag 
##              30998701              37163045              36062980 
##   MTR_053_S8_AhHyd_Ag   MTR_054_S9_AhHyd_Ag  MTR_055_S10_AhHyd_Ag 
##              40456682              36425685              37721544 
## MTR_056_S11_AhHyd_Inh MTR_057_S12_AhHyd_Inh MTR_058_S13_AhHyd_Inh 
##              33460513              40430519              35792717 
## MTR_059_S14_AhHyd_Inh    MTR_060_S15_Ret_Ag    MTR_061_S16_Ret_Ag 
##              39276645              46653871              38693617 
##    MTR_062_S17_Ret_Ag    MTR_063_S18_Ret_Ag   MTR_064_S19_Ret_Inh 
##              33350550              36431630              36625511 
##   MTR_065_S20_Ret_Inh   MTR_066_S21_Ret_Inh   MTR_067_S22_Ret_Inh 
##              44312184              42803898              42872144 
##   MTR_068_S23_Andr_Ag   MTR_069_S24_Andr_Ag   MTR_070_S25_Andr_Ag 
##              33979288              42358256              32402136 
##   MTR_071_S26_Andr_Ag  MTR_072_S27_Andr_Inh  MTR_073_S28_Andr_Inh 
##              41478275              34319118              37774460 
##  MTR_074_S29_Andr_Inh  MTR_075_S30_Andr_Inh   MTR_076_S31_LivX_Ag 
##              34812776              31094550              41533668 
##   MTR_077_S32_LivX_Ag   MTR_078_S33_LivX_Ag   MTR_079_S34_LivX_Ag 
##              57310375              40088536              46083880 
##  MTR_080_S35_LivX_Inh  MTR_081_S36_LivX_Inh  MTR_082_S37_LivX_Inh 
##              37473645              45188190              34883395 
##  MTR_083_S38_LivX_Inh     MTR_084_S39_GC_Ag     MTR_085_S40_GC_Ag 
##              32501008              51167583              40259649 
##     MTR_086_S41_GC_Ag     MTR_087_S42_GC_Ag    MTR_088_S43_GC_Inh 
##              42254100              46766823              38979002 
##    MTR_089_S44_GC_Inh    MTR_090_S45_GC_Inh    MTR_091_S46_GC_Inh 
##              42945742              48345893              47100946 
##   MTR_092_S47_Estr_Ag   MTR_093_S48_Estr_Ag   MTR_094_S49_Estr_Ag 
##              35115861              37432940              40739139 
##   MTR_095_S50_Estr_Ag  MTR_096_S51_Estr_Inh  MTR_097_S52_Estr_Inh 
##              35329883              33503939              36077732 
##  MTR_098_S53_Estr_Inh  MTR_099_S54_Estr_Inh   MTR_100_S55_Thyr_Ag 
##              33287732              44153893              45777783 
##   MTR_101_S56_Thyr_Ag   MTR_102_S57_Thyr_Ag   MTR_103_S58_Thyr_Ag 
##              46353379              49410918              34016782 
##  MTR_104_S59_Thyr_Inh  MTR_105_S60_Thyr_Inh  MTR_106_S61_Thyr_Inh 
##              34924702              37992484              37040271 
##  MTR_107_S62_Thyr_Inh      MTR_108_S63_DMSO      MTR_109_S64_DMSO 
##              41024958              46972571              41642306

2.2 Design Matrix

Design <- read.table(params$DesignFile, sep='\t', header=TRUE)
str(Design)
## 'data.frame':    66 obs. of  19 variables:
##  $ InternalUniqueID : chr  "MTR_044_S1" "MTR_045_S2" "MTR_046_S3" "MTR_047_S4" ...
##  $ HRID             : chr  "MTR_044_S1_CTL" "MTR_045_S2_CTL" "MTR_046_S3_CTL" "MTR_047_S4_CTL" ...
##  $ Subject          : chr  "CTL04" "CTL04" "CTL04" "CTL04" ...
##  $ Specimen         : chr  "CorticalOrganoid" "CorticalOrganoid" "CorticalOrganoid" "CorticalOrganoid" ...
##  $ Condition        : chr  "CTL" "CTL" "CTL" "CTL" ...
##  $ Treatment        : chr  "CTL" "CTL" "CTL" "CTL" ...
##  $ Pathway          : chr  "None" "None" "None" "None" ...
##  $ Type             : chr  "CTL" "CTL" "CTL" "CTL" ...
##  $ Line             : chr  "CTL04E" "CTL04E" "CTL04E" "CTL04E" ...
##  $ Sex              : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Project          : chr  "ENDpoiNTs" "ENDpoiNTs" "ENDpoiNTs" "ENDpoiNTs" ...
##  $ Seq.run          : chr  "20221018_27" "20221018_27" "20221018_27" "20221018_27" ...
##  $ FASTQ.R1         : logi  NA NA NA NA NA NA ...
##  $ FASTQ.R2         : logi  NA NA NA NA NA NA ...
##  $ RequestedCoverage: chr  "35M" "35M" "35M" "35M" ...
##  $ ExperimentCode   : logi  NA NA NA NA NA NA ...
##  $ SeqApproach      : chr  "PairedEnd" "PairedEnd" "PairedEnd" "PairedEnd" ...
##  $ RNASelection     : chr  "PolyA" "PolyA" "PolyA" "PolyA" ...
##  $ SeqPlatform      : chr  "Illumina" "Illumina" "Illumina" "Illumina" ...
Design$Sex <- rep("F", nrow(Design))
as.data.frame(lapply(Design, factor)) %>% 
  DT::datatable(class='hover', rownames=FALSE, caption='Sample identity and attributes', 
                filter='top', escape=TRUE, extension='Buttons', 
                options=list(pageLength=10, dom='Bfrtip', 
                             columnDefs=list(list(className='dt-center', targets=c(5:(length(Design)-1)),
                                                  visible=FALSE)),  #as if indexing starts from 0
                             buttons=list(I('colvis'), c('csv', 'excel')))
                )

The dataset is structured in 60237 genes measured in 66 samples.

2.3 Gene Annotation

if (is.null(params$GeneAnnotation)==FALSE){
  GeneAnnotation <- read.table(params$GeneAnnotationFile, sep='\t', header=TRUE)
  if(identical((RawCounts$Gene), GeneAnnotation$Gene)==FALSE){
    stop('ERROR! Gene order in count matrix and annotation matrix is not consistent. \n Please specify a data frame with the correct order.')
  }
}
if ('hgnc_symbol' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, HGNCSymbol=hgnc_symbol)
}

if ('external_gene_name' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, GeneName=external_gene_name)
}

if ('gene_biotype' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, GeneBiotype=gene_biotype)
}

if ('chromosome_name' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, Chr=chromosome_name)
}

if ('start_position' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, Start=start_position)
}

if ('end_position' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, End=end_position)
}

3. Raw Counts Distribution

Density plot for Raw Read Counts distribution: log2 values for cpm are calculated for raw read counts (before any pre-processing steps) and visualized by density plot.

DensityRaw <- readCountDensityplot(countTable=RawCounts, GeneCol ='Gene', dgelist=FALSE, 
                                   plotTitle='Raw Counts before filtering', prior.count=0.25, adjustment=0.5)
DensityRaw

4. Generation of Summarized Experiment

if(identical(colnames(RawCounts)[-1], Design$HRID)==FALSE){
    stop('ERROR! Gene order in count matrix and annotation matrix is not consistent. \n 
         Please specify a data frame with the correct order.')
  }
SE <- createSE(countTable=RawCounts, sampleMeta=Design, geneMeta=GeneAnnotation, 
               roundCounts=TRUE, showDuplicates=TRUE)
## No duplicated genes to remove

SE contains information about 60237 genes in 66 samples.


5. Removal of unwanted Biotypes

The purpose of this step is to discard RNA species that are not wanted in the library and can cause biases in the analysis.

  • The filtering is based on gene biotype from Biomart retrieved-annotation
  • Only protein-coding are selected for downstream analyses.
  • A diagnostic plot is produced to evaluate
    • the % of reads associated to the most expressed gene before the biotype selection
    • the % of removed reads after biotype selection

5.1 Biotype-selected SE

SE_Bio <- biotypeSelectSE(SE, lncRNA=FALSE, otherBios=NULL, showRemoved=TRUE) #only protein-coding genes are selected.
## UPDATED library.size slot
## A total of 40337 rows out of 60237 were discarded, belonging to the following biotypes:
## 
##                          IG_C_gene                    IG_C_pseudogene 
##                                 14                                  9 
##                          IG_D_gene                          IG_J_gene 
##                                 30                                 18 
##                    IG_J_pseudogene                      IG_pseudogene 
##                                  3                                  1 
##                          IG_V_gene                    IG_V_pseudogene 
##                                143                                185 
##                             lncRNA                              miRNA 
##                              16852                               1828 
##                           misc_RNA                            Mt_rRNA 
##                               2160                                  2 
##                            Mt_tRNA             polymorphic_pseudogene 
##                                 22                                 49 
##               processed_pseudogene                         pseudogene 
##                              10136                                 18 
##                           ribozyme                               rRNA 
##                                  8                                 26 
##                    rRNA_pseudogene                             scaRNA 
##                                486                                 48 
##                              scRNA                             snoRNA 
##                                  1                                925 
##                              snRNA                               sRNA 
##                               1836                                  5 
##                                TEC                          TR_C_gene 
##                               1048                                  6 
##                          TR_D_gene                          TR_J_gene 
##                                  4                                 78 
##                    TR_J_pseudogene                          TR_V_gene 
##                                  4                                106 
##                    TR_V_pseudogene   transcribed_processed_pseudogene 
##                                 33                                498 
##     transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene 
##                                138                                937 
##    translated_processed_pseudogene  translated_unprocessed_pseudogene 
##                                  2                                  1 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                 97                               2579 
##                          vault_RNA 
##                                  1
## A total of 19900 rows out of 60237 were kept, belonging to the following biotypes:
## 
## protein_coding 
##          19900

RemovedGenes <-  rowData(SE)[!(rownames(rowData(SE)) %in% rownames(rowData(SE_Bio))), ]

After the gene selection based on biotypes (only protein-coding genes are selected), the dataset is structured in 19900 genes measured in 66 samples. 40337 have been discarded.

5.2 Diagnostic Barplots

Reads associated to most expressed gene
plotCompareMax(SE, SE_Bio, PlotTitle=NULL, Interactive=TRUE)
## Genes in the first dataset:  60237
## Genes in the second dataset:  19900
Removed Reads after biotype selection
plotComparePerc(SE, SE_Bio, PlotTitle=NULL, Interactive=TRUE)
## Genes in the first dataset:  60237
## Genes in the second dataset:  19900


6. Gene expression filtering and SE normalization

6.1 Setting gene filtering thresholds

CpmFilt <- params$CpmFilt
if(is.numeric(CpmFilt)==FALSE){
  stop('ERROR! Cpm threshold for gene filtering has a non-numeric value')
}

SampleFilt <- ifelse(params$SampleFilt=='Default', dim(SE_Bio)[2]/2, params$SampleFilt)
if(is.numeric(SampleFilt)==FALSE){
  stop('ERROR! Sample threshold for gene filtering has a non-numeric value')
}

Genes having an expression lower than 2 in at least 4 samples are filtered out.

6.2 Filter out low expressed genes

  • Compute temporary cpm (without TMM normalization, not stored in SE)
  • Filter out lowly-expressed genes
SE_Bio$lib.size
##        MTR_044_S1_CTL        MTR_045_S2_CTL        MTR_046_S3_CTL 
##              35896092              31708728              30084298 
##        MTR_047_S4_CTL      MTR_048_S65_DMSO       MTR_049_S5_DMSO 
##              32298211              43316736              33355586 
##       MTR_050_S6_DMSO      MTR_051_S66_DMSO   MTR_052_S7_AhHyd_Ag 
##              30026932              36026174              34905456 
##   MTR_053_S8_AhHyd_Ag   MTR_054_S9_AhHyd_Ag  MTR_055_S10_AhHyd_Ag 
##              39115824              35254183              36508256 
## MTR_056_S11_AhHyd_Inh MTR_057_S12_AhHyd_Inh MTR_058_S13_AhHyd_Inh 
##              32446046              39181238              34663693 
## MTR_059_S14_AhHyd_Inh    MTR_060_S15_Ret_Ag    MTR_061_S16_Ret_Ag 
##              38124739              45105841              37421078 
##    MTR_062_S17_Ret_Ag    MTR_063_S18_Ret_Ag   MTR_064_S19_Ret_Inh 
##              32185720              35203919              35563706 
##   MTR_065_S20_Ret_Inh   MTR_066_S21_Ret_Inh   MTR_067_S22_Ret_Inh 
##              42993502              41486813              41593609 
##   MTR_068_S23_Andr_Ag   MTR_069_S24_Andr_Ag   MTR_070_S25_Andr_Ag 
##              32815429              40906632              31208496 
##   MTR_071_S26_Andr_Ag  MTR_072_S27_Andr_Inh  MTR_073_S28_Andr_Inh 
##              40030676              33206709              36578304 
##  MTR_074_S29_Andr_Inh  MTR_075_S30_Andr_Inh   MTR_076_S31_LivX_Ag 
##              33775034              30169516              40306389 
##   MTR_077_S32_LivX_Ag   MTR_078_S33_LivX_Ag   MTR_079_S34_LivX_Ag 
##              55537385              38873337              44661327 
##  MTR_080_S35_LivX_Inh  MTR_081_S36_LivX_Inh  MTR_082_S37_LivX_Inh 
##              36255300              43640070              33774429 
##  MTR_083_S38_LivX_Inh     MTR_084_S39_GC_Ag     MTR_085_S40_GC_Ag 
##              31448744              49615558              39058430 
##     MTR_086_S41_GC_Ag     MTR_087_S42_GC_Ag    MTR_088_S43_GC_Inh 
##              40949271              45343061              37632428 
##    MTR_089_S44_GC_Inh    MTR_090_S45_GC_Inh    MTR_091_S46_GC_Inh 
##              41495325              46713448              45482531 
##   MTR_092_S47_Estr_Ag   MTR_093_S48_Estr_Ag   MTR_094_S49_Estr_Ag 
##              33976888              36201983              39420602 
##   MTR_095_S50_Estr_Ag  MTR_096_S51_Estr_Inh  MTR_097_S52_Estr_Inh 
##              34215705              32466391              34977751 
##  MTR_098_S53_Estr_Inh  MTR_099_S54_Estr_Inh   MTR_100_S55_Thyr_Ag 
##              32309780              42816826              44343983 
##   MTR_101_S56_Thyr_Ag   MTR_102_S57_Thyr_Ag   MTR_103_S58_Thyr_Ag 
##              44983604              47883046              32958622 
##  MTR_104_S59_Thyr_Inh  MTR_105_S60_Thyr_Inh  MTR_106_S61_Thyr_Inh 
##              33828268              36810075              35816266 
##  MTR_107_S62_Thyr_Inh      MTR_108_S63_DMSO      MTR_109_S64_DMSO 
##              39757302              45479386              40315145
SE_Filt <- filterSE(SE_Bio, cpmTh=CpmFilt, sampleTh=SampleFilt)
## UPDATED library.size slot
SE_Filt$lib.size
##        MTR_044_S1_CTL        MTR_045_S2_CTL        MTR_046_S3_CTL 
##              35858188              31675401              30051546 
##        MTR_047_S4_CTL      MTR_048_S65_DMSO       MTR_049_S5_DMSO 
##              32262934              43277531              33323969 
##       MTR_050_S6_DMSO      MTR_051_S66_DMSO   MTR_052_S7_AhHyd_Ag 
##              29997233              35993749              34869576 
##   MTR_053_S8_AhHyd_Ag   MTR_054_S9_AhHyd_Ag  MTR_055_S10_AhHyd_Ag 
##              39080570              35219025              36472569 
## MTR_056_S11_AhHyd_Inh MTR_057_S12_AhHyd_Inh MTR_058_S13_AhHyd_Inh 
##              32408754              39137490              34624846 
## MTR_059_S14_AhHyd_Inh    MTR_060_S15_Ret_Ag    MTR_061_S16_Ret_Ag 
##              38084234              45043395              37370112 
##    MTR_062_S17_Ret_Ag    MTR_063_S18_Ret_Ag   MTR_064_S19_Ret_Inh 
##              32140145              35156646              35523100 
##   MTR_065_S20_Ret_Inh   MTR_066_S21_Ret_Inh   MTR_067_S22_Ret_Inh 
##              42942883              41435606              41545798 
##   MTR_068_S23_Andr_Ag   MTR_069_S24_Andr_Ag   MTR_070_S25_Andr_Ag 
##              32782195              40863317              31176721 
##   MTR_071_S26_Andr_Ag  MTR_072_S27_Andr_Inh  MTR_073_S28_Andr_Inh 
##              39988943              33169609              36538748 
##  MTR_074_S29_Andr_Inh  MTR_075_S30_Andr_Inh   MTR_076_S31_LivX_Ag 
##              33740241              30139114              40265748 
##   MTR_077_S32_LivX_Ag   MTR_078_S33_LivX_Ag   MTR_079_S34_LivX_Ag 
##              55477111              38833696              44614286 
##  MTR_080_S35_LivX_Inh  MTR_081_S36_LivX_Inh  MTR_082_S37_LivX_Inh 
##              36217428              43597227              33746634 
##  MTR_083_S38_LivX_Inh     MTR_084_S39_GC_Ag     MTR_085_S40_GC_Ag 
##              31421552              49563508              39016368 
##     MTR_086_S41_GC_Ag     MTR_087_S42_GC_Ag    MTR_088_S43_GC_Inh 
##              40904006              45293963              37598307 
##    MTR_089_S44_GC_Inh    MTR_090_S45_GC_Inh    MTR_091_S46_GC_Inh 
##              41460144              46674457              45442403 
##   MTR_092_S47_Estr_Ag   MTR_093_S48_Estr_Ag   MTR_094_S49_Estr_Ag 
##              33945232              36166990              39384322 
##   MTR_095_S50_Estr_Ag  MTR_096_S51_Estr_Inh  MTR_097_S52_Estr_Inh 
##              34183853              32435048              34943076 
##  MTR_098_S53_Estr_Inh  MTR_099_S54_Estr_Inh   MTR_100_S55_Thyr_Ag 
##              32281616              42777947              44299285 
##   MTR_101_S56_Thyr_Ag   MTR_102_S57_Thyr_Ag   MTR_103_S58_Thyr_Ag 
##              44941529              47832333              32924779 
##  MTR_104_S59_Thyr_Inh  MTR_105_S60_Thyr_Inh  MTR_106_S61_Thyr_Inh 
##              33797840              36776601              35781652 
##  MTR_107_S62_Thyr_Inh      MTR_108_S63_DMSO      MTR_109_S64_DMSO 
##              39722342              45436275              40275796

6.3 Normalize

  • Re-calculate library size and calculate TMM
  • Calculate cpm and logCpm (with normalization) and store in SE_Filt
SE_Filt <- normTmmSE(SE_Filt, useNormFactors=TRUE, priorCount=0.25)  

After filtering out lowly-expressed genes, the dataset is structured in 13532 genes measured in 66 samples.


7. Diagnostic barplots on normalized data

7.1 Barplot for library size

  • Barplot showing for each sample the initial library size; plot is centered on the mean value
  • Barplot with the same visualization after biotype selection and filtering on Cpm threshold
Library size before filtering
librarySizeBarplot(SE, PlotTitle='Before Biotype Selection', Interactive=TRUE)
Library size after filtering
#Extract ylim values from static plot to set them in the post filtering plot
b1 <- librarySizeBarplot(SE, PlotTitle='Before Biotype Selection', Interactive=FALSE)
lim_y <- layer_scales(b1)$y$range$range

librarySizeBarplot(SE_Filt, PlotTitle='After Biotype Selection and Cpm Filtering', Interactive=TRUE, ylim=lim_y)

7.2 Barplot for TMM and expressed genes

  • Barplot showing for each sample the TMM value as calculated by edgeR and used in the normalization
  • Barplot represents in each sample the number of ‘expressed’ genes. The evaluation is done on SE before the gene filtering step and applying the specified Cpm threshold separately to each sample.
TMM Barplot
tMMBarplot(SE_Filt, Interactive=TRUE)
Genes with Cpm higher than threshold
expressedGeneBarplot(SE, PlotTitle='Before Filtering', cpmth=CpmFilt, Interactive=TRUE)
## WARNING: SE object has no "cpm" assay, the function will compute it
ggsave(filename=paste0(OutputFolder, '/ExpressedGenes.pdf'), 
       plot=expressedGeneBarplot(SE, PlotTitle='Before Filtering', cpmth=CpmFilt, Interactive=FALSE),
       device='pdf', width=10, height=5)
## WARNING: SE object has no "cpm" assay, the function will compute it


8. Density plot for Count distribution after filtering

readCountDensityplot(assays(SE_Filt)$counts, plotTitle='Filtered Dataset')

9. Sample-to-sample correlation

Correlation matrix across samples calculated on the basis of the Spearman correlation.

# Set heatmap size on the basis of the number of samples
Width <- 7 + (dim(SE_Filt)[2]- 8) * 0.24
Width <- ifelse(Width<=14, Width, 14)
Height <- 5 + (dim(SE_Filt)[2]- 8) * 0.22

10. Principal Component Analysis

Cols <- c('DMSO' = 'grey30', 'CTL' = 'azure3', 
                 'AhHyd_Ag'='#F8766D', 'AhHyd_Inh'='#F8766D50',
                 'Andr_Ag'='#fccb17', 'Andr_Inh'='#C49A0050',  
                 "Estr_Ag"= '#53B400', "Estr_Inh"= '#53B40050', 
                 'GC_Ag' = '#00C094', 'GC_Inh' = '#00C09450',
                 'LivX_Ag' = '#00B6EB', 'LivX_Inh' = '#00B6EB50', 
                 'Ret_Ag' = '#A58AFF', 'Ret_Inh' = '#A58AFF50', 
                 'Thyr_Ag' = '#FB61D7', 'Thyr_Inh' = '#FB61D750'
                 )
pcaSE(SE_Filt, PlotTitle='First-Second Component', components=c(1,2), Interactive=TRUE, colVec = Cols)
pcaSE(SE_Filt, PlotTitle='First-Second Component', components=c(1,2), Interactive=TRUE,condition = "Pathway")
pcaSE3d(SE_Filt, plotTitle='First-Third Components', colVec=Cols)

11. Savings

Saved objects

  • Intial SE without any filtering
  • SE_Bio after biotype selection, but before expressed gene selection and normalization
saveRDS(SE, paste0(OutputFolder, '/', Dataset, 'SE.rds')) 
saveRDS(SE_Bio, paste0(OutputFolder, '/', Dataset, 'SE_Bio.rds')) 

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(OutputFolder, '/', Dataset, 'ExploratoryAnalysisWorkspace_beforeSelection.RData'))
SessionInfo
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_3.4.1               SummarizedExperiment_1.28.0
##  [3] Biobase_2.58.0              GenomicRanges_1.50.2       
##  [5] GenomeInfoDb_1.34.9         IRanges_2.32.0             
##  [7] S4Vectors_0.36.1            BiocGenerics_0.44.0        
##  [9] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [11] gridExtra_2.3               DT_0.27                    
## [13] RNASeqBulkExploratory_0.2.1
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.5             sass_0.4.5             edgeR_3.40.2          
##  [4] tidyr_1.3.0            jsonlite_1.8.4         viridisLite_0.4.1     
##  [7] bslib_0.4.2            highr_0.10             GenomeInfoDbData_1.2.9
## [10] ggrepel_0.9.3          yaml_2.3.7             pillar_1.8.1          
## [13] lattice_0.20-45        glue_1.6.2             limma_3.54.1          
## [16] digest_0.6.31          RColorBrewer_1.1-3     XVector_0.38.0        
## [19] colorspace_2.1-0       htmltools_0.5.4        Matrix_1.5-3          
## [22] pkgconfig_2.0.3        pheatmap_1.0.12        zlibbioc_1.44.0       
## [25] purrr_1.0.1            scales_1.2.1           tibble_3.2.1          
## [28] generics_0.1.3         farver_2.1.1           ellipsis_0.3.2        
## [31] cachem_1.0.7           withr_2.5.0            lazyeval_0.2.2        
## [34] cli_3.6.1              magrittr_2.0.3         evaluate_0.20         
## [37] fansi_1.0.4            textshaping_0.3.6      tools_4.2.1           
## [40] data.table_1.14.8      lifecycle_1.0.3        stringr_1.5.0         
## [43] plotly_4.10.1          munsell_0.5.0          locfit_1.5-9.7        
## [46] DelayedArray_0.24.0    compiler_4.2.1         jquerylib_0.1.4       
## [49] systemfonts_1.0.4      rlang_1.1.1            grid_4.2.1            
## [52] RCurl_1.98-1.10        rstudioapi_0.14        htmlwidgets_1.6.1     
## [55] crosstalk_1.2.0        bitops_1.0-7           labeling_0.4.2        
## [58] rmarkdown_2.20         gtable_0.3.1           R6_2.5.1              
## [61] knitr_1.42             dplyr_1.1.0            fastmap_1.1.1         
## [64] utf8_1.2.3             ragg_1.2.3             stringi_1.7.12        
## [67] Rcpp_1.0.10            vctrs_0.6.2            tidyselect_1.2.0      
## [70] xfun_0.37
Date
## [1] "Fri Jul 18 13:10:03 2025"