Exploratory analysis on MaleLine

1. Environment Set Up

knitr::opts_chunk$set(echo = TRUE, warning=FALSE)

Values of RMarkdown parameters

for (i in 1:length(params))
  print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset  - Value: MaleLine - Class: character"
## [1] "Parameter: CountFile  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL08/Input/Counts_CTL08.txt - Class: character"
## [1] "Parameter: DesignFile  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL08/Input/Design_CTL08.txt - Class: character"
## [1] "Parameter: GeneAnnotationFile  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/GeneAnnotation/AnnotationEns101.txt - Class: character"
## [1] "Parameter: OutputFolder  - Value: ~/DataDir/bulkRNASeq/3.DataExploration/CTL08/Output/AfterSamplesSelection/ - Class: character"
## [1] "Parameter: CpmFilt  - Value: 2 - Class: integer"
## [1] "Parameter: SampleFilt  - Value: 4 - Class: integer"
library(RNASeqBulkExploratory)  #Our Package
library(DT)
library(gridExtra)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(ggplot2)
Dataset <- params$Dataset
OutputFolder <- params$OutputFolder

if (dir.exists(OutputFolder) == FALSE) {
  dir.create(OutputFolder, recursive=FALSE)
}

2. Data Upload

  • Raw data produced by Salmon alignment
  • Experimental design: metadata for the samples belonging to the dataset
  • Gene annotation: metadata for the genes quantified in the dataset. Produced by AnnotationBiomart markdown

2.1 Read Count Matrix

RawCounts <- read.table(params$CountFile, sep='\t', header=TRUE)
colSums(RawCounts[,-1])
##        S32829_H12_DMSO      S32830_H14_Ret_Ag      S32831_H15_Ret_Ag 
##               37082956               61416401               73119152 
##    S32833_H27_Thyr_Inh    S32837_H35_LivX_Inh    S32838_H36_LivX_Inh 
##               69032800               58373001               71150319 
##     S32839_H38_LivX_Ag     S32840_H39_LivX_Ag     S32841_H41_Estr_Ag 
##               55444838               37454192               65763152 
##     S32842_H42_Estr_Ag    S32843_H45_Estr_Inh     S32845_H51_Andr_Ag 
##               44051390               56822948               46639517 
##    S32846_H52_Andr_Inh    S32847_H53_Andr_Inh    S32848_H54_Andr_Inh 
##               88089786               66870354               31812746 
##      S32849_H60_GC_Inh   S32853_H68_AhHyd_Inh   S32854_H69_AhHyd_Inh 
##               79051230               81199887               28984806 
##     S32857_H22_Thyr_Ag    S32858_H25_Thyr_Inh         S32860_H9_DMSO 
##               53315691               71923960               56348596 
##     S32861_H17_Ret_Inh     S32862_H18_Ret_Inh      S32864_H13_Ret_Ag 
##               70570336               36101929               27869397 
##     S32865_H23_Thyr_Ag    S32866_H44_Estr_Inh       S35868_MTR1_DMSO 
##               37403761               72123124               60177480 
##       S35869_MTR2_DMSO       S35870_MTR3_DMSO    S35871_MTR4_Ret_Inh 
##               61111164               35735992               52204714 
##   S35872_MTR5_Thyr_Inh   S35875_MTR8_LivX_Inh    S35876_MTR9_LivX_Ag 
##               49882862               58509333               54082673 
##  S35877_MTR10_Estr_Inh   S35879_MTR12_Andr_Ag    S35880_MTR13_GC_Inh 
##               58124217               42293168               42344517 
##    S35881_MTR14_GC_Inh S35882_MTR15_AhHyd_Inh          MTR16_S34_CTL 
##               57055454               54372100               30136495 
##          MTR17_S38_CTL          MTR19_S41_CTL          MTR20_S48_CTL 
##               36842038               39498077               24402432 
##          MTR21_S29_CTL         MTR22_S46_DMSO         MTR23_S28_DMSO 
##               40045196               37597559               32806053 
##         MTR24_S51_DMSO         MTR25_S43_DMSO         MTR26_S33_DMSO 
##               29432084               33619738               29950423 
##      MTR27_S53_Thyr_Ag      MTR28_S50_Thyr_Ag      MTR29_S30_Andr_Ag 
##               37265674               37174276               39897118 
##      MTR30_S42_Andr_Ag        MTR31_S47_GC_Ag        MTR32_S35_GC_Ag 
##               38633066               40727082               32512492 
##        MTR33_S37_GC_Ag     MTR37_S52_AhHyd_Ag     MTR38_S44_AhHyd_Ag 
##               24043947               34696132               35912480 
##     MTR39_S49_AhHyd_Ag 
##               33573139

2.2 Design Matrix

Design <- read.table(params$DesignFile, sep='\t', header=TRUE)
str(Design)
## 'data.frame':    58 obs. of  18 variables:
##  $ InternalUniqueID : chr  "Sample_S32829_H12" "Sample_S32830_H14" "Sample_S32831_H15" "Sample_S32833_H27" ...
##  $ HRID             : chr  "S32829_H12_DMSO" "S32830_H14_Ret_Ag" "S32831_H15_Ret_Ag" "S32833_H27_Thyr_Inh" ...
##  $ Subject          : chr  "CTL08" "CTL08" "CTL08" "CTL08" ...
##  $ Specimen         : chr  "CorticalOrganoid" "CorticalOrganoid" "CorticalOrganoid" "CorticalOrganoid" ...
##  $ Condition        : chr  "DMSO" "Ret_Ag" "Ret_Ag" "Thyr_Inh" ...
##  $ Treatment        : chr  "DMSO" "Ret_Ag" "Ret_Ag" "Thyr_Inh" ...
##  $ Pathway          : chr  "No" "Ret" "Ret" "Thyr" ...
##  $ Line             : chr  "CTL08A" "CTL08A" "CTL08A" "CTL08A" ...
##  $ Sex              : chr  "M" "M" "M" "M" ...
##  $ Project          : chr  "ENDpoiNTs" "ENDpoiNTs" "ENDpoiNTs" "ENDpoiNTs" ...
##  $ SeqRun           : int  20210310 20210310 20210310 20210310 20210310 20210310 20210310 20210310 20210310 20210310 ...
##  $ FASTQ.R1         : chr  "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32829_H12" "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32830_H14" "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32831_H15" "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32833_H27" ...
##  $ FASTQ.R2         : chr  "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32829_H12" "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32830_H14" "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32831_H15" "/hpcnfs/techunits/genomics/PublicData/Testa/mrigoli/FASTQ/210310_A00302_0276_AHYCFVDSXY/Sample_S32833_H27" ...
##  $ RequestedCoverage: chr  "35M" "35M" "35M" "35M" ...
##  $ ExperimentCode   : logi  NA NA NA NA NA NA ...
##  $ SeqApproach      : chr  "PairedEnd" "PairedEnd" "PairedEnd" "PairedEnd" ...
##  $ RNASelection     : chr  "PolyA" "PolyA" "PolyA" "PolyA" ...
##  $ SeqPlatform      : chr  "Illumina" "Illumina" "Illumina" "Illumina" ...

2.3 Samples Selection

Samples_to_remove <- c("S32843_H45_Estr_Inh", "S32866_H44_Estr_Inh", "S35868_MTR1_DMSO", "S35869_MTR2_DMSO", "S35871_MTR4_Ret_Inh", "S35877_MTR10_Estr_Inh", "S35880_MTR13_GC_Inh")

#new Raw Counts
RawCounts <- RawCounts[, !(colnames(RawCounts)%in% Samples_to_remove)]

#new Design Matrix
Design <-Design[!(Design$HRID %in% Samples_to_remove), ] 
row.names(Design) <- NULL #to reset row names after removing rows
as.data.frame(lapply(Design, factor)) %>% 
  DT::datatable(class='hover', rownames=FALSE, caption='Sample identity and attributes', 
                filter='top', escape=TRUE, extension='Buttons', 
                options=list(pageLength=10, dom='Bfrtip', 
                             columnDefs=list(list(className='dt-center', targets=c(5:(length(Design)-1)),
                                                  visible=FALSE)),  #as if indexing starts from 0
                             buttons=list(I('colvis'), c('csv', 'excel')))
                )

The dataset is structured in 60237 genes measured in 51 samples.

2.3 Gene Annotation

if (is.null(params$GeneAnnotation)==FALSE){
  GeneAnnotation <- read.table(params$GeneAnnotationFile, sep='\t', header=TRUE)
  if(identical((RawCounts$Gene), GeneAnnotation$Gene)==FALSE){
    stop('ERROR! Gene order in count matrix and annotation matrix is not consistent. \n Please specify a data frame with the correct order.')
  }
}
if ('hgnc_symbol' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, HGNCSymbol=hgnc_symbol)
}

if ('external_gene_name' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, GeneName=external_gene_name)
}

if ('gene_biotype' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, GeneBiotype=gene_biotype)
}

if ('chromosome_name' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, Chr=chromosome_name)
}

if ('start_position' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, Start=start_position)
}

if ('end_position' %in% colnames(GeneAnnotation)){
  GeneAnnotation <- dplyr::rename(GeneAnnotation, End=end_position)
}

3. Raw Counts Distribution

Density plot for Raw Read Counts distribution: log2 values for cpm are calculated for raw read counts (before any pre-processing steps) and visualized by density plot.

DensityRaw <- readCountDensityplot(countTable=RawCounts, GeneCol ='Gene', dgelist=FALSE, 
                                   plotTitle='Raw Counts before filtering', prior.count=0.25, adjustment=0.5)
DensityRaw

4. Generation of Summarized Experiment

if(identical(colnames(RawCounts)[-1], Design$HRID)==FALSE){
    stop('ERROR! Gene order in count matrix and annotation matrix is not consistent. \n 
         Please specify a data frame with the correct order.')
  }
SE <- createSE(countTable=RawCounts, sampleMeta=Design, geneMeta=GeneAnnotation, 
               roundCounts=TRUE, showDuplicates=TRUE)
## No duplicated genes to remove

SE contains information about 60237 genes in 51 samples.


5. Removal of unwanted Biotypes

The purpose of this step is to discard RNA species that are not wanted in the library and can cause biases in the analysis.

  • The filtering is based on gene biotype from Biomart retrieved-annotation
  • Only protein-coding are selected for downstream analyses.
  • A diagnostic plot is produced to evaluate
    • the % of reads associated to the most expressed gene before the biotype selection
    • the % of removed reads after biotype selection

5.1 Biotype-selected SE

SE_Bio <- biotypeSelectSE(SE, lncRNA=FALSE, otherBios=NULL, showRemoved=TRUE)
## UPDATED library.size slot
## A total of 40337 rows out of 60237 were discarded, belonging to the following biotypes:
## 
##                          IG_C_gene                    IG_C_pseudogene 
##                                 14                                  9 
##                          IG_D_gene                          IG_J_gene 
##                                 30                                 18 
##                    IG_J_pseudogene                      IG_pseudogene 
##                                  3                                  1 
##                          IG_V_gene                    IG_V_pseudogene 
##                                143                                185 
##                             lncRNA                              miRNA 
##                              16852                               1828 
##                           misc_RNA                            Mt_rRNA 
##                               2160                                  2 
##                            Mt_tRNA             polymorphic_pseudogene 
##                                 22                                 49 
##               processed_pseudogene                         pseudogene 
##                              10136                                 18 
##                           ribozyme                               rRNA 
##                                  8                                 26 
##                    rRNA_pseudogene                             scaRNA 
##                                486                                 48 
##                              scRNA                             snoRNA 
##                                  1                                925 
##                              snRNA                               sRNA 
##                               1836                                  5 
##                                TEC                          TR_C_gene 
##                               1048                                  6 
##                          TR_D_gene                          TR_J_gene 
##                                  4                                 78 
##                    TR_J_pseudogene                          TR_V_gene 
##                                  4                                106 
##                    TR_V_pseudogene   transcribed_processed_pseudogene 
##                                 33                                498 
##     transcribed_unitary_pseudogene transcribed_unprocessed_pseudogene 
##                                138                                937 
##    translated_processed_pseudogene  translated_unprocessed_pseudogene 
##                                  2                                  1 
##                 unitary_pseudogene             unprocessed_pseudogene 
##                                 97                               2579 
##                          vault_RNA 
##                                  1
## A total of 19900 rows out of 60237 were kept, belonging to the following biotypes:
## 
## protein_coding 
##          19900

RemovedGenes <-  rowData(SE)[!(rownames(rowData(SE)) %in% rownames(rowData(SE_Bio))), ]

After the gene selection based on biotypes, the dataset is structured in 19900 genes measured in 51 samples. 40337 have been discarded.

5.2 Diagnostic Barplots

Reads associated to most expressed gene
plotCompareMax(SE, SE_Bio, PlotTitle=NULL, Interactive=TRUE)
## Genes in the first dataset:  60237
## Genes in the second dataset:  19900
Removed Reads after biotype selection
plotComparePerc(SE, SE_Bio, PlotTitle=NULL, Interactive=TRUE)
## Genes in the first dataset:  60237
## Genes in the second dataset:  19900


6. Gene expression filtering and SE normalization

6.1 Setting gene filtering thresholds

CpmFilt <- params$CpmFilt
if(is.numeric(CpmFilt)==FALSE){
  stop('ERROR! Cpm threshold for gene filtering has a non-numeric value')
}

SampleFilt <- ifelse(params$SampleFilt=='Default', dim(SE_Bio)[2]/2, params$SampleFilt)
if(is.numeric(SampleFilt)==FALSE){
  stop('ERROR! Sample threshold for gene filtering has a non-numeric value')
}

Genes having an expression lower than 2 in at least 4 samples are filtered out.

6.2 Filter out low expressed genes

  • Compute temporary cpm (without TMM normalization, not stored in SE)
  • Filter out lowly-expressed genes
SE_Bio$lib.size
##        S32829_H12_DMSO      S32830_H14_Ret_Ag      S32831_H15_Ret_Ag 
##               35613673               59279739               70826142 
##    S32833_H27_Thyr_Inh    S32837_H35_LivX_Inh    S32838_H36_LivX_Inh 
##               67049867               56634424               68940854 
##     S32839_H38_LivX_Ag     S32840_H39_LivX_Ag     S32841_H41_Estr_Ag 
##               53964028               36461411               63518569 
##     S32842_H42_Estr_Ag     S32845_H51_Andr_Ag    S32846_H52_Andr_Inh 
##               42604865               45209919               85402393 
##    S32847_H53_Andr_Inh    S32848_H54_Andr_Inh      S32849_H60_GC_Inh 
##               64432453               30728121               76458966 
##   S32853_H68_AhHyd_Inh   S32854_H69_AhHyd_Inh     S32857_H22_Thyr_Ag 
##               79053007               28308197               51578374 
##    S32858_H25_Thyr_Inh         S32860_H9_DMSO     S32861_H17_Ret_Inh 
##               69682372               54069427               68212493 
##     S32862_H18_Ret_Inh      S32864_H13_Ret_Ag     S32865_H23_Thyr_Ag 
##               35022905               27114632               36213800 
##       S35870_MTR3_DMSO   S35872_MTR5_Thyr_Inh   S35875_MTR8_LivX_Inh 
##               33548410               48558405               56970827 
##    S35876_MTR9_LivX_Ag   S35879_MTR12_Andr_Ag    S35881_MTR14_GC_Inh 
##               52549588               40765101               55221358 
## S35882_MTR15_AhHyd_Inh          MTR16_S34_CTL          MTR17_S38_CTL 
##               52843891               29163508               35691834 
##          MTR19_S41_CTL          MTR20_S48_CTL          MTR21_S29_CTL 
##               38249382               23658993               38700856 
##         MTR22_S46_DMSO         MTR23_S28_DMSO         MTR24_S51_DMSO 
##               36339476               31756335               28625488 
##         MTR25_S43_DMSO         MTR26_S33_DMSO      MTR27_S53_Thyr_Ag 
##               32823794               29031507               36025706 
##      MTR28_S50_Thyr_Ag      MTR29_S30_Andr_Ag      MTR30_S42_Andr_Ag 
##               36156859               38661303               37547288 
##        MTR31_S47_GC_Ag        MTR32_S35_GC_Ag        MTR33_S37_GC_Ag 
##               39385341               31560063               23287199 
##     MTR37_S52_AhHyd_Ag     MTR38_S44_AhHyd_Ag     MTR39_S49_AhHyd_Ag 
##               33521688               34704497               32385629
SE_Filt <- filterSE(SE_Bio, cpmTh=CpmFilt, sampleTh=SampleFilt)
## UPDATED library.size slot
SE_Filt$lib.size
##        S32829_H12_DMSO      S32830_H14_Ret_Ag      S32831_H15_Ret_Ag 
##               35590131               59200110               70742749 
##    S32833_H27_Thyr_Inh    S32837_H35_LivX_Inh    S32838_H36_LivX_Inh 
##               66985181               56599178               68890696 
##     S32839_H38_LivX_Ag     S32840_H39_LivX_Ag     S32841_H41_Estr_Ag 
##               53928092               36436873               63458999 
##     S32842_H42_Estr_Ag     S32845_H51_Andr_Ag    S32846_H52_Andr_Inh 
##               42563847               45175305               85352784 
##    S32847_H53_Andr_Inh    S32848_H54_Andr_Inh      S32849_H60_GC_Inh 
##               64387992               30707135               76398131 
##   S32853_H68_AhHyd_Inh   S32854_H69_AhHyd_Inh     S32857_H22_Thyr_Ag 
##               78994951               28288738               51515650 
##    S32858_H25_Thyr_Inh         S32860_H9_DMSO     S32861_H17_Ret_Inh 
##               69576583               54031813               68165067 
##     S32862_H18_Ret_Inh      S32864_H13_Ret_Ag     S32865_H23_Thyr_Ag 
##               35000045               27084536               36181797 
##       S35870_MTR3_DMSO   S35872_MTR5_Thyr_Inh   S35875_MTR8_LivX_Inh 
##               33523110               48524440               56939850 
##    S35876_MTR9_LivX_Ag   S35879_MTR12_Andr_Ag    S35881_MTR14_GC_Inh 
##               52493679               40731658               55176056 
## S35882_MTR15_AhHyd_Inh          MTR16_S34_CTL          MTR17_S38_CTL 
##               52800281               29145491               35666587 
##          MTR19_S41_CTL          MTR20_S48_CTL          MTR21_S29_CTL 
##               38218535               23639831               38661501 
##         MTR22_S46_DMSO         MTR23_S28_DMSO         MTR24_S51_DMSO 
##               36308979               31725384               28600095 
##         MTR25_S43_DMSO         MTR26_S33_DMSO      MTR27_S53_Thyr_Ag 
##               32806523               29004261               35999172 
##      MTR28_S50_Thyr_Ag      MTR29_S30_Andr_Ag      MTR30_S42_Andr_Ag 
##               36135434               38630270               37516633 
##        MTR31_S47_GC_Ag        MTR32_S35_GC_Ag        MTR33_S37_GC_Ag 
##               39347787               31530964               23264868 
##     MTR37_S52_AhHyd_Ag     MTR38_S44_AhHyd_Ag     MTR39_S49_AhHyd_Ag 
##               33498815               34676956               32358540

6.3 Normalize

  • Re-calculate library size and calculate TMM
  • Calculate cpm and logCpm (with normalization) and store in SE_Filt
SE_Filt <- normTmmSE(SE_Filt, useNormFactors=TRUE, priorCount=0.25)  

After filtering out lowly-expressed genes, the dataset is structured in 14213 genes measured in 51 samples.


7. Diagnostic barplots on normalized data

7.1 Barplot for library size

  • Barplot showing for each sample the initial library size; plot is centered on the mean value
  • Barplot with the same visualization after biotype selection and filtering on Cpm threshold
Library size before filtering
librarySizeBarplot(SE, PlotTitle='Before Biotype Selection', Interactive=TRUE)
Library size after filtering
#Extract ylim values from static plot to set them in the post filtering plot
b1 <- librarySizeBarplot(SE, PlotTitle='Before Biotype Selection', Interactive=FALSE)
lim_y <- layer_scales(b1)$y$range$range

librarySizeBarplot(SE_Filt, PlotTitle='After Biotype Selection and Cpm Filtering', Interactive=TRUE, ylim=lim_y)

7.2 Barplot for TMM and expressed genes

  • Barplot showing for each sample the TMM value as calculated by edgeR and used in the normalization
  • Barplot represents in each sample the number of ‘expressed’ genes. The evaluation is done on SE before the gene filtering step and applying the specified Cpm threshold separately to each sample.
TMM Barplot
tMMBarplot(SE_Filt, Interactive=TRUE)
Genes with Cpm higher than threshold
expressedGeneBarplot(SE, PlotTitle='Before Filtering', cpmth=CpmFilt, Interactive=TRUE)
## WARNING: SE object has no "cpm" assay, the function will compute it
ggsave(filename=paste0(OutputFolder, '/ExpressedGenes.pdf'), 
       plot=expressedGeneBarplot(SE, PlotTitle='Before Filtering', cpmth=CpmFilt, Interactive=FALSE),
       device='pdf', width=10, height=5)
## WARNING: SE object has no "cpm" assay, the function will compute it


8. Density plot for Count distribution after filtering

Similarly to raw counts, I represent by density plot the count distribution also after filtering.

readCountDensityplot(assays(SE_Filt)$counts, plotTitle='Filtered Dataset')

9. Sample-to-sample correlation

Correlation matrix across samples calculated on the basis of the Spearman correlation. Heatmap annotation takes into consideration sample ‘Condition’ by default.

# Set heatmap size on the basis of the number of samples
Width <- 7 + (dim(SE_Filt)[2]- 8) * 0.24
Width <- ifelse(Width<=14, Width, 14)
Height <- 5 + (dim(SE_Filt)[2]- 8) * 0.22
#interactiveCorrHeatmap(SE_Filt, plotTitle='Filtered dataset', annotation_col=NULL, annotation_colors=NULL, myPalette=NULL) 

10. Principal Component Analysis

Cols <- c('DMSO' = 'grey30', 'CTL' = 'azure3', 
                 'AhHyd_Ag'='#F8766D', 'AhHyd_Inh'='#F8766D50',
                 'Andr_Ag'='#fccb17', 'Andr_Inh'='#C49A0050',  
                 "Estr_Ag"= '#53B400', "Estr_Inh"= '#53B40050', 
                 'GC_Ag' = '#00C094', 'GC_Inh' = '#00C09450',
                 'LivX_Ag' = '#00B6EB', 'LivX_Inh' = '#00B6EB50', 
                 'Ret_Ag' = '#A58AFF', 'Ret_Inh' = '#A58AFF50', 
                 'Thyr_Ag' = '#FB61D7', 'Thyr_Inh' = '#FB61D750'
                 )
pcaSE(SE_Filt, PlotTitle='First-Second Component', components=c(1,2), Interactive=TRUE, colVec = Cols)
SE_Filt@colData@listData[["SeqRun"]] <- as.character(SE_Filt@colData@listData[["SeqRun"]])
pcaSE(SE_Filt, PlotTitle='First-Second Component', components=c(1,2), condition = "SeqRun" , Interactive=TRUE)
pcaSE3d(SE_Filt, plotTitle='First-Third Components', colVec=Cols)

11. Savings

Saved objects

  • Intial SE without any filtering
  • SE_Bio after biotype selection, but before expressed gene selection and normalization
  • Analysis image
saveRDS(SE, paste0(OutputFolder, '/', Dataset, 'SE.rds')) 
saveRDS(SE_Bio, paste0(OutputFolder, '/', Dataset, 'SE_Bio.rds')) 

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(OutputFolder, '/', Dataset, 'ExploratoryAnalysisWorkspace.RData'))
SessionInfo
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_3.4.1               SummarizedExperiment_1.28.0
##  [3] Biobase_2.58.0              GenomicRanges_1.50.2       
##  [5] GenomeInfoDb_1.34.9         IRanges_2.32.0             
##  [7] S4Vectors_0.36.1            BiocGenerics_0.44.0        
##  [9] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [11] gridExtra_2.3               DT_0.27                    
## [13] RNASeqBulkExploratory_0.2.1
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.5             sass_0.4.5             edgeR_3.40.2          
##  [4] tidyr_1.3.0            jsonlite_1.8.4         viridisLite_0.4.1     
##  [7] bslib_0.4.2            highr_0.10             GenomeInfoDbData_1.2.9
## [10] ggrepel_0.9.3          yaml_2.3.7             pillar_1.8.1          
## [13] lattice_0.20-45        glue_1.6.2             limma_3.54.1          
## [16] digest_0.6.31          RColorBrewer_1.1-3     XVector_0.38.0        
## [19] colorspace_2.1-0       htmltools_0.5.4        Matrix_1.5-3          
## [22] pkgconfig_2.0.3        pheatmap_1.0.12        zlibbioc_1.44.0       
## [25] purrr_1.0.1            scales_1.2.1           tibble_3.2.1          
## [28] generics_0.1.3         farver_2.1.1           ellipsis_0.3.2        
## [31] cachem_1.0.7           withr_2.5.0            lazyeval_0.2.2        
## [34] cli_3.6.1              magrittr_2.0.3         evaluate_0.20         
## [37] fansi_1.0.4            textshaping_0.3.6      tools_4.2.1           
## [40] data.table_1.14.8      lifecycle_1.0.3        stringr_1.5.0         
## [43] plotly_4.10.1          munsell_0.5.0          locfit_1.5-9.7        
## [46] DelayedArray_0.24.0    compiler_4.2.1         jquerylib_0.1.4       
## [49] systemfonts_1.0.4      rlang_1.1.1            grid_4.2.1            
## [52] RCurl_1.98-1.10        rstudioapi_0.14        htmlwidgets_1.6.1     
## [55] crosstalk_1.2.0        bitops_1.0-7           labeling_0.4.2        
## [58] rmarkdown_2.20         gtable_0.3.1           R6_2.5.1              
## [61] knitr_1.42             dplyr_1.1.0            fastmap_1.1.1         
## [64] utf8_1.2.3             ragg_1.2.3             stringi_1.7.12        
## [67] Rcpp_1.0.10            vctrs_0.6.2            tidyselect_1.2.0      
## [70] xfun_0.37
Date
## [1] "Fri Jul 18 13:12:55 2025"