Values of RMarkdown parameters
for (i in 1:length(params))
print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset - Value: BenchmarkingOrganoids - Class: character"
## [1] "Parameter: CountFile - Value: ~/DataDir/ExternalData/BulkData/BenchmarkingOrganoids/OrgIPSAll_SalmonGeneCounts.txt - Class: character"
## [1] "Parameter: DesignFile - Value: ~/DataDir/ExternalData/BulkData/BenchmarkingOrganoids/DesignOrgIPS.txt - Class: character"
## [1] "Parameter: GeneAnnotationFile - Value: ~/DataDir/ExternalData/BulkData/BenchmarkingOrganoids/AnnotationDF.txt - Class: character"
## [1] "Parameter: OutputFolder - Value: ~/DataDir/ExternalData/BulkData/BenchmarkingOrganoids/OutputOrgIPS/ - Class: character"
## [1] "Parameter: CpmFilt - Value: 1 - Class: integer"
## [1] "Parameter: SampleFilt - Value: 4 - Class: integer"
library(RNASeqBulkExploratory) #Our Package
library(DT)
library(gridExtra)
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(ggplot2)
library(SEtools)
## Loading required package: sechm
library(sechm)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
source("../SignatureHelper.R")
<- params$Dataset
Dataset <- params$OutputFolder
OutputFolder
if (dir.exists(OutputFolder) == FALSE) {
dir.create(OutputFolder, recursive=FALSE)
}
<- read.table(params$CountFile, sep='\t', header=TRUE)
RawCounts colSums(RawCounts[,-1])
## Sample_KOLF2C1day25_S18177 Sample_KOLF2C1D25REP2_S18667
## 51609909 34372954
## Sample_KOLF2C1day50_S18184 Sample_KOLF2C1D48REP2_S18677
## 16505426 18340209
## Sample_S19785_KOLF2C1DAY100REP1 Sample_S19794_KOLF2C1DAY101REP2
## 38535806 36897320
## Sample_MIFF1day25_S18178 Sample_MIFF1D25REP2_S18669
## 47220949 40956318
## Sample_MIFF1day46_S18185 Sample_MIFF1D48REP2_S18679
## 31856803 29465545
## Sample_S19781_MIFF1DAY100REP1 Sample_S19790_MIFF1DAY101REP2
## 26819757 33399922
## Sample_S_3391Bday22_S18179 Sample_S_3391BD25REP2_S18668
## 60053191 34401356
## Sample_S_3391Bday46_S18186 Sample_S_3391BD48REP2_S18678
## 23921452 38628297
## Sample_S19784_3391BDAY100REP1 Sample_S19793_3391BDAY101REP2
## 34652428 33829523
## Sample_A15461D25REP1_S18665 Sample_A15461D25REP2_S18666
## 36553981 38100908
## Sample_A15461D48REP1_S18675 Sample_A15461D48REP2_S18676
## 29518149 25824797
## Sample_S19779_A15461DAY100REP1 Sample_S19788_A15461DAY101REP2
## 28574158 29918356
## Sample_S20430_A15461_DAY153_REP1 Sample_S20431_KOLF2C1_DAY153_REP1
## 27249536 17705141
## Sample_S20432_3391B_DAY153_REP1 Sample_S20433_MIFF1_DAY149_REP1
## 26906984 22993777
## Sample_S20434_A15461_DAY150_REP2 Sample_S20435_KOLF2C1_DAY150_REP2
## 18833743 21503329
## Sample_S20436_3391B_DAY150_REP2 Sample_S20437_MIFF1_DAY150_REP2
## 20870677 20110136
## Sample_S20438_A15461_DAY205_REP1 Sample_S20439_KOLF2C1_DAY205_REP1
## 31606310 21779953
## Sample_S20440_3391B_DAY205_REP1 Sample_S20441_MIFF1_DAY205_REP1
## 21214965 22744354
## Sample_S20442_KOLF2C1_DAY193_REP2 Sample_S20443_3391B_DAY193_REP2
## 29948152 26832042
## Sample_S20444_MIFF1_DAY193_REP2 Sample_S19798_A15461iPSC
## 26901361 25421514
## Sample_S19797_KOLF2C1iPSC Sample_S20260_3391BiPSC
## 24903065 27715530
## Sample_S20259_MIFF1iPSC
## 35139399
<- read.table(params$DesignFile, sep='\t', header=TRUE)
Design str(Design)
## 'data.frame': 43 obs. of 13 variables:
## $ InternaUniqueID: chr "Sample_KOLF2C1day25_S18177" "Sample_KOLF2C1D25REP2_S18667" "Sample_KOLF2C1day50_S18184" "Sample_KOLF2C1D48REP2_S18677" ...
## $ HRID : chr "Sample_KOLF2C1day25_S18177" "Sample_KOLF2C1D25REP2_S18667" "Sample_KOLF2C1day50_S18184" "Sample_KOLF2C1D48REP2_S18677" ...
## $ SampleName : chr "S_KOLF2C1D25REP1" "S_KOLF2C1D25REP2" "S_KOLF2C1D50REP1" "S_KOLF2C1D48REP2" ...
## $ Speciment : chr "Organoid" "Organoid" "Organoid" "Organoid" ...
## $ Condition : chr "Day25" "Day25" "Day50" "Day50" ...
## $ Group : chr "Org_25" "Org_25" "Org_50" "Org_50" ...
## $ Line : chr "L_KOLF2C1" "L_KOLF2C1" "L_KOLF2C1" "L_KOLF2C1" ...
## $ Sex : chr "M" "M" "M" "M" ...
## $ DiffBatch : int 1 2 2 5 5 6 1 2 4 5 ...
## $ Seq : chr "PairedEnd" "PairedEnd" "PairedEnd" "PairedEnd" ...
## $ RNASelection : chr "RiboZero" "RiboZero" "RiboZero" "RiboZero" ...
## $ SeqPlatform : chr "Novaseq" "Novaseq" "Novaseq" "Novaseq" ...
## $ Run : int 180201 180420 180201 180420 180622 180622 180201 180420 180201 180420 ...
as.data.frame(lapply(Design, factor)) %>%
::datatable(class='hover', rownames=FALSE, caption='Sample identity and attributes',
DTfilter='top', escape=TRUE, extension='Buttons',
options=list(pageLength=10, dom='Bfrtip',
columnDefs=list(list(className='dt-center', targets=c(5:(length(Design)-1)),
visible=FALSE)), #as if indexing starts from 0
buttons=list(I('colvis'), c('csv', 'excel')))
)
The dataset is structured in 58288 genes measured in 43 samples.
if (is.null(params$GeneAnnotation)==FALSE){
<- read.table(params$GeneAnnotationFile, sep='\t', header=TRUE)
GeneAnnotation if(identical((RawCounts$Gene), GeneAnnotation$Gene)==FALSE){
stop('ERROR! Gene order in count matrix and annotation matrix is not consistent. \n Please specify a data frame with the correct order.')
} }
if ('hgnc_symbol' %in% colnames(GeneAnnotation)){
<- dplyr::rename(GeneAnnotation, HGNCSymbol=hgnc_symbol)
GeneAnnotation
}
if ('external_gene_name' %in% colnames(GeneAnnotation)){
<- dplyr::rename(GeneAnnotation, GeneName=external_gene_name)
GeneAnnotation
}
if ('gene_biotype' %in% colnames(GeneAnnotation)){
<- dplyr::rename(GeneAnnotation, GeneBiotype=gene_biotype)
GeneAnnotation
}
if ('chromosome_name' %in% colnames(GeneAnnotation)){
<- dplyr::rename(GeneAnnotation, Chr=chromosome_name)
GeneAnnotation
}
if ('start_position' %in% colnames(GeneAnnotation)){
<- dplyr::rename(GeneAnnotation, Start=start_position)
GeneAnnotation
}
if ('end_position' %in% colnames(GeneAnnotation)){
<- dplyr::rename(GeneAnnotation, End=end_position)
GeneAnnotation }
if(identical(colnames(RawCounts)[-1], Design$HRID)==FALSE){
stop('ERROR! Gene order in count matrix and annotation matrix is not consistent. \n
Please specify a data frame with the correct order.')
}
<- createSE(countTable=RawCounts, sampleMeta=Design, geneMeta=GeneAnnotation,
SE roundCounts=TRUE, showDuplicates=TRUE)
## The function removed 45 duplicated genes
## [1] "ENSG00000002586.18_PAR_Y" "ENSG00000124333.15_PAR_Y"
## [3] "ENSG00000124334.17_PAR_Y" "ENSG00000167393.17_PAR_Y"
## [5] "ENSG00000168939.11_PAR_Y" "ENSG00000169084.13_PAR_Y"
## [7] "ENSG00000169093.15_PAR_Y" "ENSG00000169100.13_PAR_Y"
## [9] "ENSG00000178605.13_PAR_Y" "ENSG00000182162.10_PAR_Y"
## [11] "ENSG00000182378.13_PAR_Y" "ENSG00000182484.15_PAR_Y"
## [13] "ENSG00000185203.12_PAR_Y" "ENSG00000185291.11_PAR_Y"
## [15] "ENSG00000185960.13_PAR_Y" "ENSG00000196433.12_PAR_Y"
## [17] "ENSG00000197976.11_PAR_Y" "ENSG00000198223.16_PAR_Y"
## [19] "ENSG00000205755.11_PAR_Y" "ENSG00000214717.11_PAR_Y"
## [21] "ENSG00000223274.6_PAR_Y" "ENSG00000223484.7_PAR_Y"
## [23] "ENSG00000223511.6_PAR_Y" "ENSG00000223571.6_PAR_Y"
## [25] "ENSG00000223773.7_PAR_Y" "ENSG00000225661.7_PAR_Y"
## [27] "ENSG00000226179.6_PAR_Y" "ENSG00000227159.8_PAR_Y"
## [29] "ENSG00000228410.6_PAR_Y" "ENSG00000228572.7_PAR_Y"
## [31] "ENSG00000229232.6_PAR_Y" "ENSG00000230542.6_PAR_Y"
## [33] "ENSG00000234622.6_PAR_Y" "ENSG00000234958.6_PAR_Y"
## [35] "ENSG00000236017.8_PAR_Y" "ENSG00000236871.7_PAR_Y"
## [37] "ENSG00000237040.6_PAR_Y" "ENSG00000237531.6_PAR_Y"
## [39] "ENSG00000237801.6_PAR_Y" "ENSG00000265658.6_PAR_Y"
## [41] "ENSG00000270726.6_PAR_Y" "ENSG00000275287.5_PAR_Y"
## [43] "ENSG00000277120.5_PAR_Y" "ENSG00000280767.3_PAR_Y"
## [45] "ENSG00000281849.3_PAR_Y"
SE contains information about 58243 genes in 43 samples.
We decide to select here only protein-coding genes
<- biotypeSelectSE(SE, lncRNA=FALSE, otherBios=NULL, showRemoved=TRUE)
SE_Bio ## UPDATED library.size slot
## A total of 38425 rows out of 58243 were discarded, belonging to the following biotypes:
##
## 3prime_overlapping_ncRNA antisense_RNA
## 31 5517
## bidirectional_promoter_lncRNA IG_C_gene
## 19 14
## IG_C_pseudogene IG_D_gene
## 9 37
## IG_J_gene IG_J_pseudogene
## 18 3
## IG_pseudogene IG_V_gene
## 1 144
## IG_V_pseudogene lincRNA
## 188 7493
## macro_lncRNA miRNA
## 1 1879
## misc_RNA Mt_rRNA
## 2212 2
## Mt_tRNA non_coding
## 22 3
## polymorphic_pseudogene processed_pseudogene
## 63 10233
## processed_transcript pseudogene
## 543 18
## ribozyme rRNA
## 8 543
## scaRNA scRNA
## 49 1
## sense_intronic sense_overlapping
## 904 189
## snoRNA snRNA
## 943 1900
## sRNA TEC
## 5 1066
## TR_C_gene TR_D_gene
## 6 4
## TR_J_gene TR_J_pseudogene
## 79 4
## TR_V_gene TR_V_pseudogene
## 108 30
## transcribed_processed_pseudogene transcribed_unitary_pseudogene
## 462 111
## transcribed_unprocessed_pseudogene translated_processed_pseudogene
## 828 2
## unitary_pseudogene unprocessed_pseudogene
## 95 2637
## vaultRNA
## 1
## A total of 19818 rows out of 58243 were kept, belonging to the following biotypes:
##
## protein_coding
## 19818
<- rowData(SE)[!(rownames(rowData(SE)) %in% rownames(rowData(SE_Bio))), ] RemovedGenes
After the gene selection based on biotypes, the dataset is structured in 19818 genes measured in 43 samples. 38425 have been discarded.
if(! identical(names(assays(SE_Bio)$counts), rownames(colData(SE_Bio)))){
stop('Inconsistencies in sample metadata for SE object')
}
if(! identical(names(assays(SE_Bio)$counts), colData(SE_Bio)$HRID)){
stop('Inconsistencies in sample metadata for SE object')
}
<- read.table(file='~/DataDir/ExternalData/BulkData/BenchmarkingOrganoids/GenecodeV27GeneLength.txt',
GeneLength sep='\t', header=TRUE)
<- dplyr::left_join(as.data.frame(rowData(SE_Bio))[,1:3], GeneLength, by=c('Gene'='Gene')) SelLenght
if(identical(SelLenght$Gene, rowData(SE_Bio)$Gene)){
rowData(SE_Bio)$GeneLength <- SelLenght$Length
print('Gene Lengh information added to SE')
else {
} print('Information on gene lenght is not compatible with SE object. Gene Lenght information not added to SE')
}## [1] "Gene Lengh information added to SE"
<- params$CpmFilt
CpmFilt if(is.numeric(CpmFilt)==FALSE){
stop('ERROR! Cpm threshold for gene filtering has a non-numeric value')
}
<- ifelse(params$SampleFilt=='Default', dim(SE_Bio)[2]/2, params$SampleFilt)
SampleFilt if(is.numeric(SampleFilt)==FALSE){
stop('ERROR! Sample threshold for gene filtering has a non-numeric value')
}
Genes having an expression lower than 1 in at least 4 samples are filtered out.
$lib.size
SE_Bio## Sample_KOLF2C1day25_S18177 Sample_KOLF2C1D25REP2_S18667
## 35966674 24130082
## Sample_KOLF2C1day50_S18184 Sample_KOLF2C1D48REP2_S18677
## 8946628 12691980
## Sample_S19785_KOLF2C1DAY100REP1 Sample_S19794_KOLF2C1DAY101REP2
## 29092606 27522714
## Sample_MIFF1day25_S18178 Sample_MIFF1D25REP2_S18669
## 29677523 27411278
## Sample_MIFF1day46_S18185 Sample_MIFF1D48REP2_S18679
## 17982695 18205132
## Sample_S19781_MIFF1DAY100REP1 Sample_S19790_MIFF1DAY101REP2
## 19886743 25832968
## Sample_S_3391Bday22_S18179 Sample_S_3391BD25REP2_S18668
## 41468787 22972771
## Sample_S_3391Bday46_S18186 Sample_S_3391BD48REP2_S18678
## 12161257 23437065
## Sample_S19784_3391BDAY100REP1 Sample_S19793_3391BDAY101REP2
## 24695980 25823667
## Sample_A15461D25REP1_S18665 Sample_A15461D25REP2_S18666
## 22730484 25142562
## Sample_A15461D48REP1_S18675 Sample_A15461D48REP2_S18676
## 20868997 18505787
## Sample_S19779_A15461DAY100REP1 Sample_S19788_A15461DAY101REP2
## 22659558 23980057
## Sample_S20430_A15461_DAY153_REP1 Sample_S20431_KOLF2C1_DAY153_REP1
## 16681641 5621105
## Sample_S20432_3391B_DAY153_REP1 Sample_S20433_MIFF1_DAY149_REP1
## 15688660 8033761
## Sample_S20434_A15461_DAY150_REP2 Sample_S20435_KOLF2C1_DAY150_REP2
## 8488859 13885616
## Sample_S20436_3391B_DAY150_REP2 Sample_S20437_MIFF1_DAY150_REP2
## 10806175 9503179
## Sample_S20438_A15461_DAY205_REP1 Sample_S20439_KOLF2C1_DAY205_REP1
## 18505632 12059447
## Sample_S20440_3391B_DAY205_REP1 Sample_S20441_MIFF1_DAY205_REP1
## 10981494 12433247
## Sample_S20442_KOLF2C1_DAY193_REP2 Sample_S20443_3391B_DAY193_REP2
## 17176679 18144207
## Sample_S20444_MIFF1_DAY193_REP2 Sample_S19798_A15461iPSC
## 15599116 20625329
## Sample_S19797_KOLF2C1iPSC Sample_S20260_3391BiPSC
## 18616726 20478866
## Sample_S20259_MIFF1iPSC
## 23226563
<- filterSE(SE_Bio, cpmTh=CpmFilt, sampleTh=SampleFilt)
SE_Filt ## UPDATED library.size slot
#$lib.size
<- normTmmSE(SE_Filt, useNormFactors=TRUE, priorCount=0.25)
SE_Filt ## Warning in normTmmSE(SE_Filt, useNormFactors = TRUE, priorCount = 0.25): The
## following colData columns will be recomputed: lib.size
#cpm and logcpm are calculated by normTmmSE
assays(SE_Filt)$fpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength)
assays(SE_Filt)$logfpkm <- edgeR::rpkm(SE_Filt, normalized.lib.sizes = TRUE, gene.length=rowData(SE_Filt)$GeneLength,
log=TRUE, prior.count=0.25)
After filtering out lowly-expressed genes, the dataset is structured in 16638 genes measured in 43 samples.
For the downstream interrogation, it become useful to have the gene names as SE row names, that are used by sechm for selecting genes. I also have to discard genes with duplicated gene names.
rownames(SE_Filt) <- rowData(SE_Filt)$GeneName
<- SE_Filt[!duplicated(rownames(SE_Filt)), ] SE_Filt
I use a function from SEtools to calculate the fold-change compared to the controls. This is done on logcpm.
<- as.vector(which(SE_Filt$Condition=='Day0'))
Day0 <- SEtools::log2FC(SE_Filt, fromAssay='logcpm', controls=Day0, isLog=TRUE, toAssay='log2FC') SE_Filt
metadata(SE_Filt)$anno_colors <- list(Condition = c("Day0" = "#ffff3f", "Day25" = "#eeef20", "Day50" = "#dddf00", "Day100" = "#d4d700", "Day150" = "#bfd200", "Day200" = "#aacc00"), Sex = c("F" = "#f4978e", "M" = "#277da1"))
<- read.table(file = '~/DataDir/ExternalData/Receptors/ReceptorsComplete.txt',
GS sep='\t', header=TRUE)
as.data.frame(GS) %>%
::datatable(class='hover', rownames=FALSE, caption='Hormonal receptors',
DTfilter='top', escape=TRUE, extension='Buttons',
options=list(pageLength=10, dom='Bfrtip',
buttons=list(I('colvis'), c('csv', 'excel')))
)
<- GS %>% dplyr::pull(GeneName)
Genes
! Genes %in% row.names(SE_Filt)]
Genes[## [1] "DIO1" "THRSP" "PTGER1" "PTGER2"
rowData(SE_Filt)$Signatures <- NA
for (i in which(rowData(SE_Filt)$GeneName %in% Genes)){
<- row.names(SE_Filt)[i]
gene rowData(SE_Filt)[i, 'Signatures'] <-GS[GS$GeneName==gene, "Signature"]
}
<- scales::hue_pal()(12)[1:length(unique(GS$Signature))]
colors_sig names(colors_sig) <- names(table(GS$Signature))
metadata(SE_Filt)$anno_colors$Signatures <- colors_sig
The following genes are not selected as expressed: DIO1, THRSP, PTGER1, PTGER2.
colData(SE_Filt)$Condition <- factor(colData(SE_Filt)$Condition, levels = c("Day0", "Day25", "Day50", "Day100", "Day150", "Day200"))
<- as.data.frame(colData(SE_Filt))%>%dplyr::arrange(Condition)%>%rownames() sorting
assays(SE_Filt[row.names(SE_Filt) %in% Genes, ])$logfpkm %>% quantile(seq(0,1, by=0.1))
## 0% 10% 20% 30% 40% 50%
## -10.2298046 -3.0364366 -1.8557710 -1.0502424 -0.2595567 0.5403657
## 60% 70% 80% 90% 100%
## 1.1513351 1.8766521 2.6346140 3.8837825 6.9090993
<- viridisLite::cividis(length(seq(-10, 7, by=0.2)))
ExpCols
sechm(SE_Filt[, sorting], features=Genes, assayName="logfpkm", gaps_row="Signatures", top_annotation = c("Condition", "Sex"), show_rownames = TRUE, left_annotation = "Signatures", sortRowsOn=NULL, cluster_rows=FALSE, show_colnames=TRUE,
hmcols=ExpCols, breaks=seq(-10, 7, by=0.2), column_title = "LogFpkm")
# breaks FALSE to avoid a symmetrical scale
<- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')
ScaledCols
::sechm(SE_Filt[, sorting], features=Genes, assayName="logfpkm", gaps_row="Signatures", top_annotation = c("Condition", "Sex"), show_rownames=TRUE, left_annotation = "Signatures", show_colnames=TRUE, sortRowsOn=NULL, cluster_rows=FALSE, hmcols=ScaledCols, do.scale=TRUE, breaks=0.8, column_title = "Scaled LogFpkm") sechm
::sechm(SE_Filt[, sorting], features=Genes, assayName="logcpm", gaps_row="Signatures", top_annotation = c("Condition", "Sex"), show_rownames=TRUE, left_annotation = "Signatures", show_colnames=TRUE, sortRowsOn=NULL, cluster_rows=FALSE, hmcols=ScaledCols, do.scale=TRUE, breaks=0.8, column_title = "Scaled LogCpm") sechm
sechm(SE_Filt[, sorting], features=Genes, assayName="log2FC", gaps_row="Signatures", top_annotation = c("Condition", "Sex"), left_annotation = "Signatures", show_colnames=TRUE, sortRowsOn=NULL, cluster_rows=FALSE,
show_rownames = TRUE, hmcols=ScaledCols,
do.scale=FALSE, breaks = 0.85, column_title = "LogFC versus Day0")
<- Design %>% dplyr::arrange(factor(Condition, levels = c("Day0", "Day25", "Day50", "Day100", "Day150", "Day200")))
Design_sorted $Condition <- factor(Design_sorted$Condition, levels = c("Day0", "Day25", "Day50", "Day100", "Day150", "Day200")) Design_sorted
<- assays(SE_Filt[,sorting])$logfpkm LogFpkmDF
CuratedSig list is created:
#prepare Design for functions
<- list()
CuratedSig $names <- Design_sorted$InternaUniqueID
Design_sorted# Check for missing genes
$GeneName[!(GS$GeneName %in% rownames(LogFpkmDF))] GS
## [1] "DIO1" "THRSP" "PTGER1" "PTGER2"
colnames(GS)[2] <- "Population"
#excluding the missing ones
<- GS$GeneName[(GS$GeneName %in% rownames(LogFpkmDF))] Genes
# Retrieval of expression values for gene signatures
$LogFpkm <- LogFpkmSignature(MetaData=Design_sorted, LogFpkmData=LogFpkmDF, GeneSig=Genes)
CuratedSighead(CuratedSig$LogFpkm)
## InternaUniqueID HRID SampleName
## 1 Sample_S19798_A15461iPSC Sample_S19798_A15461iPSC S_A15461iPSC
## 2 Sample_S19797_KOLF2C1iPSC Sample_S19797_KOLF2C1iPSC S_KOLF2C1iPSC
## 3 Sample_S20260_3391BiPSC Sample_S20260_3391BiPSC S_3391BiPSC
## 4 Sample_S20259_MIFF1iPSC Sample_S20259_MIFF1iPSC S_MIFF1iPSC
## 5 Sample_KOLF2C1day25_S18177 Sample_KOLF2C1day25_S18177 S_KOLF2C1D25REP1
## 6 Sample_KOLF2C1D25REP2_S18667 Sample_KOLF2C1D25REP2_S18667 S_KOLF2C1D25REP2
## Speciment Condition Group Line Sex DiffBatch Seq RNASelection
## 1 IPSC Day0 IPSC L_A15461 M 0 PairedEnd RiboZero
## 2 IPSC Day0 IPSC L_KOLF2C1 M 0 PairedEnd RiboZero
## 3 IPSC Day0 IPSC L_3391B F 0 PairedEnd RiboZero
## 4 IPSC Day0 IPSC L_MIFF1 M 0 PairedEnd RiboZero
## 5 Organoid Day25 Org_25 L_KOLF2C1 M 1 PairedEnd RiboZero
## 6 Organoid Day25 Org_25 L_KOLF2C1 M 2 PairedEnd RiboZero
## SeqPlatform Run names THRB THRA THRAP3
## 1 Novaseq 180622 Sample_S19798_A15461iPSC -2.1931085 1.5977961 6.129991
## 2 Novaseq 180622 Sample_S19797_KOLF2C1iPSC -2.2009883 1.4883257 5.837180
## 3 Novaseq 180622 Sample_S20260_3391BiPSC -1.8554995 0.6802578 6.631729
## 4 Novaseq 180622 Sample_S20259_MIFF1iPSC -0.9966187 0.9043768 6.063937
## 5 Novaseq 180201 Sample_KOLF2C1day25_S18177 -2.0723413 2.9438559 5.373216
## 6 Novaseq 180420 Sample_KOLF2C1D25REP2_S18667 -1.8742316 2.8944538 5.559790
## DIO2 DIO3 SLC16A10 SLC16A2 SLC7A5 KLF9 ESRRG
## 1 -4.875201 -2.7903240 0.2747060 3.812203 5.673317 -0.1483813 -0.0502892
## 2 -3.632907 -2.4336093 1.6034017 3.668992 6.101897 -0.2099453 -0.2796473
## 3 -4.214556 -2.2157610 1.1579555 3.718660 6.909099 -1.2267247 -0.9296575
## 4 -1.789884 -2.2440799 1.8238977 4.132668 6.519561 -0.3337586 0.2361960
## 5 -2.416884 -0.3220758 0.8562001 4.318113 3.695471 -2.2374459 -0.7802904
## 6 -4.132264 0.7504452 0.6531462 3.954587 4.341889 -2.3605386 -1.1177214
## ESRRA GPER1 ESR1 ESR2 ESRRB CYP19A1 AR
## 1 2.661603 -1.3616097 -6.065618 -0.98487209 -0.08942631 -2.8417106 1.400413
## 2 2.707611 -0.9072015 -4.653845 -0.85198271 -0.85279263 -2.3290379 2.015931
## 3 2.331885 -2.0783794 -5.751873 -1.13108063 -1.77821989 -2.5126797 1.712929
## 4 2.191880 0.4128708 -4.393556 0.08782778 -1.11602014 -0.8411801 2.088872
## 5 1.560931 -1.2243860 -5.573497 -2.09730282 -1.94184400 -5.7957463 0.309950
## 6 2.180292 -0.3571294 -6.277148 -2.02593066 -1.83416159 -5.5616470 0.388066
## RBP4 RARA RARB RARG RXRA RXRB RXRG
## 1 -3.19396229 1.9705281 -0.9569565 3.0347948 2.064013 3.006055 -2.584866
## 2 -4.44385171 1.2900800 -0.8097196 3.7099950 1.384433 3.151959 -4.485757
## 3 -3.71250016 0.3480331 -0.9104001 2.9734325 1.190648 2.560231 -7.715392
## 4 -2.66625711 0.7393304 0.1208454 3.1015035 1.488644 2.946460 -5.591418
## 5 1.59554310 2.7886136 1.2190075 0.9593060 2.263009 3.303279 2.478974
## 6 -0.02694891 3.3419877 0.8549062 0.4866657 2.557999 3.459517 3.317292
## AHR NR3C1 NR1H2 NR1H3 PTGER3 PTGER4 PPARA
## 1 1.212590 -0.126158327 2.627867 2.337231 -9.555549 -1.9412456 0.5327247
## 2 1.719962 0.005060046 2.387083 1.956165 -3.761861 -0.7039488 0.9382047
## 3 1.736231 0.210646760 2.090848 1.568786 -3.516814 -1.4218143 0.3927880
## 4 1.756420 0.520267204 1.686202 1.823277 -3.695037 -1.3014188 1.3796725
## 5 1.905715 1.508664534 2.257890 1.300398 -2.932664 -5.3628160 0.6873919
## 6 1.980878 0.915311385 2.848942 1.513940 -2.849518 -3.0788318 0.6117272
## PPARD PPARG PGR VDR
## 1 1.490878 -1.124317 -7.131813 -1.6503913
## 2 1.467065 -1.364259 -6.079195 -0.8580292
## 3 1.006283 -2.348425 -7.964269 -1.5646958
## 4 1.274859 -1.256692 -5.614358 -0.8064014
## 5 1.761708 -2.255845 -6.421668 -2.7437145
## 6 2.097961 -1.687226 -6.129092 -3.1604402
# Calculation of mean expression across stages
$LogFpkmMean <- SigLogFpkmMean(CuratedSig$LogFpkm, StartCol=15, GroupCol='Condition', MetaInfo= GS, Arrange='Population')
CuratedSig## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(StartCol)
##
## # Now:
## data %>% select(all_of(StartCol))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'GeneName'. You can override using the
## `.groups` argument.
$LogFpkmMean
CuratedSig## # A tibble: 210 × 5
## # Groups: GeneName [35]
## GeneName Condition Count ExpMean Population
## <chr> <fct> <int> <dbl> <chr>
## 1 AHR Day0 4 1.61 AhHyd
## 2 AHR Day25 8 1.36 AhHyd
## 3 AHR Day50 8 1.07 AhHyd
## 4 AHR Day100 8 2.09 AhHyd
## 5 AHR Day150 8 2.51 AhHyd
## 6 AHR Day200 7 1.97 AhHyd
## 7 AR Day0 4 1.80 Androgen
## 8 AR Day25 8 0.756 Androgen
## 9 AR Day50 8 0.147 Androgen
## 10 AR Day100 8 -0.661 Androgen
## # … with 200 more rows
print(paste('MeanLog2Fpkm Genes', dim(CuratedSig$LogFpkmMean)[1]))
## [1] "MeanLog2Fpkm Genes 210"
print(paste('Min Value', round(min(CuratedSig$LogFpkmMean$ExpMean), 2)))
## [1] "Min Value -8.06"
print(paste('Max Value', round(max(CuratedSig$LogFpkmMean$ExpMean), 2)))
## [1] "Max Value 6.3"
<- scales::hue_pal()(12)
ColVec
colnames(CuratedSig$LogFpkmMean)[2] <- "Stage"
$Lolli <- LolliPopCols(Data=CuratedSig$LogFpkmMean, PointSize=2.5,
CuratedSigSegSize=1.5, yLow=0, yHigh=7, CeilDown=0,
CeilUp=7, unit='LogFpkm', cols=ColVec,
Shape=15, GeneSize=5, filterZero=FALSE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
$Lolli CuratedSig
quantile(CuratedSig$LogFpkmMean$ExpMean)
## 0% 25% 50% 75% 100%
## -8.056777 -1.446061 0.550154 2.083088 6.300968
# reference stage for each dataset
# Retrieval of expression values for gene signatures
$LogFpkmDelta <- DeltaLogFpkmMean(CuratedSig$LogFpkmMean, GroupCol='Stage', DiscardStage="Day0",
CuratedSigRefStage="SDay25", Arrange='Population', Th=-1.5, Substitute=NA)
$LogFpkmDelta
CuratedSig## # A tibble: 140 × 4
## # Groups: GeneName [35]
## GeneName Population Stage ExpMean
## <chr> <chr> <chr> <dbl>
## 1 AHR AhHyd D2_SDay50_SDay25 -0.291
## 2 AHR AhHyd D3_SDay100_SDay25 0.726
## 3 AHR AhHyd D4_SDay150_SDay25 1.15
## 4 AHR AhHyd D5_SDay200_SDay25 0.607
## 5 AR Androgen D2_SDay50_SDay25 -0.609
## 6 AR Androgen D3_SDay100_SDay25 -1.42
## 7 AR Androgen D4_SDay150_SDay25 -1.10
## 8 AR Androgen D5_SDay200_SDay25 -1.07
## 9 CYP19A1 Estrogen D2_SDay50_SDay25 NA
## 10 ESR1 Estrogen D2_SDay50_SDay25 NA
## # … with 130 more rows
print(paste('DeltaLogFpkm Genes', dim(CuratedSig$LogFpkmDelta)[1]))
## [1] "DeltaLogFpkm Genes 140"
print(paste('Min Value', round(min(CuratedSig$LogFpkmDelta$ExpMean, na.rm=T),2)))
## [1] "Min Value -2.39"
print(paste('Max Value', round(max(CuratedSig$LogFpkmDelta$ExpMean, na.rm=T),2)))
## [1] "Max Value 3.22"
$LogFpkmDelta$Stage[CuratedSig$LogFpkmDelta$Stage=="D2_SDay50_SDay25"] <- "Day50 vs Day25"
CuratedSig$LogFpkmDelta$Stage[CuratedSig$LogFpkmDelta$Stage=="D3_SDay100_SDay25"] <- "Day100 vs Day25"
CuratedSig$LogFpkmDelta$Stage[CuratedSig$LogFpkmDelta$Stage=="D4_SDay150_SDay25"] <- "Day150 vs Day25"
CuratedSig$LogFpkmDelta$Stage[CuratedSig$LogFpkmDelta$Stage=="D5_SDay200_SDay25"] <- "Day200 vs Day25"
CuratedSig
$LogFpkmDelta$Stage <- factor(CuratedSig$LogFpkmDelta$Stage, levels= c("Day50 vs Day25", "Day100 vs Day25", "Day150 vs Day25", "Day200 vs Day25")) CuratedSig
$LolliDelta <- LolliPopCols(CuratedSig$LogFpkmDelta, PointSize=4, SegSize=1.5,
CuratedSigcols=ColVec, yLow=-3, yHigh=4, CeilUp=4, CeilDown=-3,
unit='DeltaLogFpkm', Shape=18, GeneSize=5, filterZero=FALSE)
$LolliDelta
CuratedSig## Warning: Removed 28 rows containing missing values (`geom_point()`).
## Warning: Removed 28 rows containing missing values (`geom_segment()`).
$LogFpkmDeltaII <- DeltaLogFpkmMean(CuratedSig$LogFpkmMean, GroupCol='Stage', DiscardStage=NULL,
CuratedSigRefStage="SDay0", Arrange='Population', Th=-1.5, Substitute=NA)
$LogFpkmDeltaII
CuratedSig## # A tibble: 175 × 4
## # Groups: GeneName [35]
## GeneName Population Stage ExpMean
## <chr> <chr> <chr> <dbl>
## 1 AHR AhHyd D2_SDay25_SDay0 -0.246
## 2 AHR AhHyd D3_SDay50_SDay0 -0.536
## 3 AHR AhHyd D4_SDay100_SDay0 0.480
## 4 AHR AhHyd D5_SDay150_SDay0 0.901
## 5 AHR AhHyd D6_SDay200_SDay0 0.362
## 6 AR Androgen D2_SDay25_SDay0 -1.05
## 7 AR Androgen D3_SDay50_SDay0 -1.66
## 8 AR Androgen D4_SDay100_SDay0 -2.47
## 9 AR Androgen D5_SDay150_SDay0 -2.15
## 10 AR Androgen D6_SDay200_SDay0 -2.12
## # … with 165 more rows
print(paste('DeltaLogFpkm Genes', dim(CuratedSig$LogFpkmDeltaII)[1]))
## [1] "DeltaLogFpkm Genes 175"
print(paste('Min Value', round(min(CuratedSig$LogFpkmDeltaII$ExpMean, na.rm=T),2)))
## [1] "Min Value -5.05"
print(paste('Max Value', round(max(CuratedSig$LogFpkmDeltaII$ExpMean, na.rm=T),2)))
## [1] "Max Value 7.44"
$LogFpkmDeltaII$Stage[CuratedSig$LogFpkmDeltaII$Stage=="D2_SDay25_SDay0"] <- "Day25 vs Day0"
CuratedSig$LogFpkmDeltaII$Stage[CuratedSig$LogFpkmDeltaII$Stage=="D3_SDay50_SDay0"] <- "Day50 vs Day0"
CuratedSig$LogFpkmDeltaII$Stage[CuratedSig$LogFpkmDeltaII$Stage=="D4_SDay100_SDay0"] <- "Day100 vs Day0"
CuratedSig$LogFpkmDeltaII$Stage[CuratedSig$LogFpkmDeltaII$Stage=="D5_SDay150_SDay0"] <- "Day150 vs Day0"
CuratedSig$LogFpkmDeltaII$Stage[CuratedSig$LogFpkmDeltaII$Stage=="D6_SDay200_SDay0"] <- "Day200 vs Day0"
CuratedSig
$LogFpkmDeltaII$Stage <- factor(CuratedSig$LogFpkmDeltaII$Stage, levels= c("Day25 vs Day0", "Day50 vs Day0", "Day100 vs Day0", "Day150 vs Day0", "Day200 vs Day0")) CuratedSig
$LolliDeltaII <- LolliPopCols(CuratedSig$LogFpkmDeltaII, PointSize=4, SegSize=1.5,
CuratedSigcols=ColVec, yLow=-4.5, yHigh=8, CeilUp=8, CeilDown=-4.5,
unit='DeltaLogFpkm', Shape=18, GeneSize=5, filterZero=FALSE)
$LolliDeltaII
CuratedSig## Warning: Removed 27 rows containing missing values (`geom_point()`).
## Warning: Removed 27 rows containing missing values (`geom_segment()`).
Saving in Pdf
ggsave(plot=CuratedSig$Lolli, filename=paste0(OutputFolder, 'Cheroni_LogFpkm.pdf'), width=10, height=10)
ggsave(plot=CuratedSig$Lolli, filename=paste0(OutputFolder, 'Cheroni_LogFpkm.png'), width=10, height=10)
ggsave(plot=CuratedSig$LolliDelta, filename=paste0(OutputFolder, 'Cheroni_DeltaLogFpkm_vs25.pdf'), width=10, height=10)
## Warning: Removed 28 rows containing missing values (`geom_point()`).
## Warning: Removed 28 rows containing missing values (`geom_segment()`).
ggsave(plot=CuratedSig$LolliDelta, filename=paste0(OutputFolder, 'Cheroni_DeltaLogFpkm_vs25.png'), width=10, height=10)
## Warning: Removed 28 rows containing missing values (`geom_point()`).
## Removed 28 rows containing missing values (`geom_segment()`).
<- sessionInfo()
SessionInfo <- date()
Date save.image(paste0(OutputFolder, '/', Dataset, 'EndpointsReceptorsExplorationOrganoidsWorkspace.RData'))
SessionInfo
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] dplyr_1.1.0 tidyr_1.3.0
## [3] SEtools_1.12.0 sechm_1.6.0
## [5] ggplot2_3.4.1 SummarizedExperiment_1.28.0
## [7] Biobase_2.58.0 GenomicRanges_1.50.2
## [9] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [11] S4Vectors_0.36.1 BiocGenerics_0.44.0
## [13] MatrixGenerics_1.10.0 matrixStats_0.63.0
## [15] gridExtra_2.3 DT_0.27
## [17] RNASeqBulkExploratory_0.2.1
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.16 colorspace_2.1-0 rjson_0.2.21
## [4] ellipsis_0.3.2 circlize_0.4.15 XVector_0.38.0
## [7] GlobalOptions_0.1.2 clue_0.3-64 rstudioapi_0.14
## [10] farver_2.1.1 bit64_4.0.5 AnnotationDbi_1.60.0
## [13] fansi_1.0.4 splines_4.2.1 codetools_0.2-19
## [16] doParallel_1.0.17 cachem_1.0.7 geneplotter_1.76.0
## [19] knitr_1.42 jsonlite_1.8.4 annotate_1.76.0
## [22] cluster_2.1.4 png_0.1-8 pheatmap_1.0.12
## [25] compiler_4.2.1 httr_1.4.5 Matrix_1.5-3
## [28] fastmap_1.1.1 limma_3.54.1 cli_3.6.1
## [31] htmltools_0.5.4 tools_4.2.1 gtable_0.3.1
## [34] glue_1.6.2 GenomeInfoDbData_1.2.9 V8_4.2.2
## [37] Rcpp_1.0.10 jquerylib_0.1.4 vctrs_0.6.2
## [40] Biostrings_2.66.0 nlme_3.1-162 crosstalk_1.2.0
## [43] iterators_1.0.14 xfun_0.37 stringr_1.5.0
## [46] openxlsx_4.2.5.2 lifecycle_1.0.3 XML_3.99-0.13
## [49] ca_0.71.1 edgeR_3.40.2 zlibbioc_1.44.0
## [52] scales_1.2.1 TSP_1.2-2 ragg_1.2.3
## [55] parallel_4.2.1 RColorBrewer_1.1-3 ComplexHeatmap_2.14.0
## [58] yaml_2.3.7 curl_5.0.0 memoise_2.0.1
## [61] sass_0.4.5 stringi_1.7.12 RSQLite_2.3.0
## [64] highr_0.10 randomcoloR_1.1.0.1 genefilter_1.80.3
## [67] foreach_1.5.2 seriation_1.4.1 zip_2.2.2
## [70] BiocParallel_1.32.5 shape_1.4.6 systemfonts_1.0.4
## [73] rlang_1.1.1 pkgconfig_2.0.3 bitops_1.0-7
## [76] evaluate_0.20 lattice_0.20-45 purrr_1.0.1
## [79] labeling_0.4.2 htmlwidgets_1.6.1 bit_4.0.5
## [82] tidyselect_1.2.0 magrittr_2.0.3 DESeq2_1.38.3
## [85] R6_2.5.1 generics_0.1.3 DelayedArray_0.24.0
## [88] DBI_1.1.3 mgcv_1.8-41 pillar_1.8.1
## [91] withr_2.5.0 survival_3.5-3 KEGGREST_1.38.0
## [94] RCurl_1.98-1.10 tibble_3.2.1 crayon_1.5.2
## [97] utf8_1.2.3 rmarkdown_2.20 GetoptLong_1.0.5
## [100] locfit_1.5-9.7 grid_4.2.1 sva_3.46.0
## [103] data.table_1.14.8 blob_1.2.3 digest_0.6.31
## [106] xtable_1.8-4 textshaping_0.3.6 munsell_0.5.0
## [109] viridisLite_0.4.1 registry_0.5-1 bslib_0.4.2
Date
## [1] "Tue Jul 22 18:57:59 2025"