library(GenomicRanges)
library(plyranges)
library(biomaRt)
library(ChIPseeker)
library(org.Hs.eg.db)
library(annotatr)
Making txdb object from GTF file
<- params$OutputFolder
OutputFolder
<- "~/DataDir/3.TwistBedAnn/Input/gencode.v35.annotation.gtf.gz"
GTFFile <- GenomicFeatures::makeTxDbFromGFF(GTFFile, format="gtf")
txdb_v35 ## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type stop_codon. This information was
## ignored.
## OK
Importing bed file with regions targeted by Twist kit
<- params$TwistBed
TwistBed <- as.data.frame(read.table(TwistBed,header = FALSE, sep="\t",stringsAsFactors=FALSE, quote="")) Twistbed
colnames(Twistbed) <- c("chr", "start", "end", "ann")
Remember that position form bedGraph files are 0-based while R works
with 1-based position and therefore also the packages we will go to use,
this is the reason why I set starts.in.df.are.0based = TRUE
in creating the GRanges objects, it converts 0-based to 1-based
coordinates.
<- makeGRangesFromDataFrame(Twistbed, keep.extra.columns = TRUE,
Twist_Granges start.field = "start", end.field = "end", starts.in.df.are.0based = TRUE)
ChIPseeker
DMRs are now annotated to their associated genes and genomic regions
using the annotatePeaks
function of the
ChIPseeker
R package, using a TxDb object. The GenomicFeatures
package uses TxDb
objects to store transcript metadata.
This class maps the 5’ and 3’ untranslated regions (UTRs), protein
coding sequences (CDSs) and exons for a set of mRNA transcripts to their
associated genome. All TxDb
objects are backed by a SQLite
database that manages genomic locations and the relationships between
pre-processed mRNA transcripts, exons, protein coding sequences, and
their related gene identifiers.
<- annotatePeak(Twist_Granges, TxDb=txdb_v35, tssRegion=c(-3000, 3000), verbose=TRUE, addFlankGeneInfo = TRUE)
peakAnno ## >> preparing features information... 2024-09-27 06:56:08 PM
## >> identifying nearest features... 2024-09-27 06:56:08 PM
## >> calculating distance from peak to TSS... 2024-09-27 06:56:13 PM
## >> assigning genomic annotation... 2024-09-27 06:56:13 PM
## >> adding flank feature information from peaks... 2024-09-27 06:56:36 PM
## >> assigning chromosome lengths 2024-09-27 06:59:09 PM
## >> done... 2024-09-27 06:59:09 PM
plotAnnoPie(peakAnno)
plotAnnoBar(peakAnno)
vennpie(peakAnno)
plotDistToTSS(peakAnno,
title="Distribution of transcription factor-binding loci\nrelative to TSS")
<- "https://aug2020.archive.ensembl.org"
Host <- "hsapiens"
Specie <- biomaRt::listMarts(host=Host)[1,1]
BioMart <- biomaRt::listMarts(host=Host)[1,2]
Version <- biomaRt::useMart(host=Host, biomart=BioMart, version=Version,
Mart dataset=paste0(Specie,'_gene_ensembl'))
= unique(as.data.frame(peakAnno)$geneId) #Genes assigned to the regions by chipseeker
GeneVector
= c('ensembl_gene_id', 'ensembl_gene_id_version', 'hgnc_symbol', 'external_gene_name', 'gene_biotype',
Attributes 'description', 'chromosome_name', 'start_position', 'end_position')
<- biomaRt::getBM(mart=Mart, filters='ensembl_gene_id_version', values=GeneVector, attributes=Attributes, uniqueRows=TRUE) GeneAnnotation
<- GeneAnnotation[GeneAnnotation$chromosome_name %in% c(1:22, "X", "Y", "MT"),]
GeneAnnotation <- GeneAnnotation %>% dplyr::filter(!duplicated(ensembl_gene_id_version))
GeneAnnotation
if(length(unique(GeneAnnotation$ensembl_gene_id_version)) != length(GeneAnnotation$ensembl_gene_id_version)){
stop('ERROR: not-unique EnsemblID in annotation!')
}
!GeneVector %in% GeneAnnotation$ensembl_gene_id_version] #ChipSeeker annotation that are not present in the biomart
GeneVector[## [1] "ENSG00000228572.7_PAR_Y" "ENSG00000182378.14_PAR_Y" "ENSG00000178605.13_PAR_Y" "ENSG00000226179.6_PAR_Y" "ENSG00000167393.18_PAR_Y"
## [6] "ENSG00000275287.5_PAR_Y" "ENSG00000280767.3_PAR_Y" "ENSG00000234958.6_PAR_Y" "ENSG00000229232.6_PAR_Y" "ENSG00000185960.14_PAR_Y"
## [11] "ENSG00000225661.7_PAR_Y" "ENSG00000205755.12_PAR_Y" "ENSG00000198223.16_PAR_Y" "ENSG00000185291.12_PAR_Y" "ENSG00000169100.14_PAR_Y"
## [16] "ENSG00000236871.7_PAR_Y" "ENSG00000236017.8_PAR_Y" "ENSG00000169093.16_PAR_Y" "ENSG00000182162.11_PAR_Y" "ENSG00000197976.12_PAR_Y"
## [21] "ENSG00000196433.13_PAR_Y" "ENSG00000223511.7_PAR_Y" "ENSG00000234622.6_PAR_Y" "ENSG00000169084.15_PAR_Y" "ENSG00000223571.6_PAR_Y"
## [26] "ENSG00000214717.12_PAR_Y" "ENSG00000277120.5_PAR_Y" "ENSG00000223773.7_PAR_Y" "ENSG00000230542.6_PAR_Y" "ENSG00000002586.20_PAR_Y"
## [31] "ENSG00000237801.6_PAR_Y" "ENSG00000237040.6_PAR_Y" "ENSG00000124333.16_PAR_Y" "ENSG00000228410.6_PAR_Y" "ENSG00000124334.17_PAR_Y"
## [36] "ENSG00000270726.6_PAR_Y"
<- dplyr::left_join(as.data.frame(peakAnno), GeneAnnotation, by=c('geneId' = 'ensembl_gene_id_version'))
AnnotationFinal
<- AnnotationFinal %>% rename("ensembl_gene_id_version"="geneId") AnnotationFinal
Here I save gene info
<- AnnotationFinal[, c("ensembl_gene_id_version", "ensembl_gene_id", "hgnc_symbol", "gene_biotype")] %>% dplyr::distinct()%>%filter(!is.na(hgnc_symbol))%>%filter(!hgnc_symbol%in%"")
Genes_df rownames(Genes_df) <- NULL
length(Genes_df$hgnc_symbol)
## [1] 36016
length(unique(Genes_df$hgnc_symbol))
## [1] 36011
$hgnc_symbol%in%Genes_df[duplicated(Genes_df$hgnc_symbol), "hgnc_symbol"],]
Genes_df[Genes_df## ensembl_gene_id_version ensembl_gene_id hgnc_symbol gene_biotype
## 3309 ENSG00000285053.1 ENSG00000285053 TBCE protein_coding
## 3310 ENSG00000284770.2 ENSG00000284770 TBCE protein_coding
## 20449 ENSG00000237940.3 ENSG00000237940 LINC01238 lncRNA
## 20451 ENSG00000261186.2 ENSG00000261186 LINC01238 lncRNA
## 21924 ENSG00000206195.11 ENSG00000206195 DUXAP8 lncRNA
## 21926 ENSG00000271672.1 ENSG00000271672 DUXAP8 transcribed_processed_pseudogene
## 24581 ENSG00000284862.3 ENSG00000284862 CCDC39 protein_coding
## 24582 ENSG00000145075.13 ENSG00000145075 CCDC39 lncRNA
## 30308 ENSG00000272655.2 ENSG00000272655 POLR2J4 transcribed_unprocessed_pseudogene
## 30309 ENSG00000214783.9 ENSG00000214783 POLR2J4 lncRNA
saveRDS(AnnotationFinal, paste0(OutputFolder, "TwistAnnotated.rds"))
date()
## [1] "Fri Sep 27 18:59:23 2024"
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
## [6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] annotatr_1.24.0 org.Hs.eg.db_3.16.0 AnnotationDbi_1.60.2 ChIPseeker_1.34.1 biomaRt_2.54.1
## [6] plyranges_1.18.0 scales_1.2.1 plotly_4.10.2 DSS_2.46.0 BiocParallel_1.32.6
## [11] methylSig_1.10.0 dmrseq_1.18.1 DT_0.28 data.table_1.14.8 tidyr_1.3.0
## [16] dplyr_1.1.2 ggrepel_0.9.3 ggplot2_3.4.2 bsseq_1.34.0 SummarizedExperiment_1.28.0
## [21] Biobase_2.58.0 MatrixGenerics_1.10.0 matrixStats_1.0.0 GenomicRanges_1.50.2 GenomeInfoDb_1.34.9
## [26] IRanges_2.32.0 S4Vectors_0.36.2 BiocGenerics_0.44.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 R.utils_2.12.2 tidyselect_1.2.0
## [4] RSQLite_2.3.1 htmlwidgets_1.6.2 grid_4.2.1
## [7] scatterpie_0.1.8 munsell_0.5.0 codetools_0.2-19
## [10] withr_2.5.0 colorspace_2.1-0 GOSemSim_2.24.0
## [13] filelock_1.0.2 highr_0.10 knitr_1.43
## [16] rstudioapi_0.15.0 DOSE_3.24.2 labeling_0.4.2
## [19] GenomeInfoDbData_1.2.9 polyclip_1.10-4 bit64_4.0.5
## [22] farver_2.1.1 rhdf5_2.42.1 treeio_1.23.1
## [25] vctrs_0.6.3 generics_0.1.3 xfun_0.39
## [28] BiocFileCache_2.6.1 regioneR_1.30.0 R6_2.5.1
## [31] graphlayouts_0.8.4 locfit_1.5-9.7 gridGraphics_0.5-1
## [34] bitops_1.0-7 rhdf5filters_1.10.1 cachem_1.0.8
## [37] fgsea_1.24.0 DelayedArray_0.24.0 promises_1.2.0.1
## [40] BiocIO_1.8.0 ggraph_2.1.0 enrichplot_1.18.4
## [43] gtable_0.3.3 tidygraph_1.2.3 rlang_1.1.1
## [46] splines_4.2.1 rtracklayer_1.58.0 lazyeval_0.2.2
## [49] BiocManager_1.30.25 yaml_2.3.7 reshape2_1.4.4
## [52] GenomicFeatures_1.50.4 crosstalk_1.2.0 httpuv_1.6.11
## [55] qvalue_2.30.0 tools_4.2.1 ggplotify_0.1.0
## [58] gplots_3.1.3 ellipsis_0.3.2 jquerylib_0.1.4
## [61] RColorBrewer_1.1-3 Rcpp_1.0.11 plyr_1.8.8
## [64] sparseMatrixStats_1.10.0 progress_1.2.2 zlibbioc_1.44.0
## [67] purrr_1.0.1 RCurl_1.98-1.12 prettyunits_1.1.1
## [70] viridis_0.6.2 cowplot_1.1.1 bumphunter_1.40.0
## [73] magrittr_2.0.3 patchwork_1.1.2 hms_1.1.3
## [76] mime_0.12 evaluate_0.21 xtable_1.8-4
## [79] HDO.db_0.99.1 XML_3.99-0.14 gridExtra_2.3
## [82] compiler_4.2.1 tibble_3.2.1 KernSmooth_2.23-22
## [85] shadowtext_0.1.2 crayon_1.5.2 R.oo_1.25.0
## [88] htmltools_0.5.5 ggfun_0.0.9 later_1.3.1
## [91] tzdb_0.4.0 aplot_0.1.10 DBI_1.1.3
## [94] tweenr_2.0.2 dbplyr_2.3.3 MASS_7.3-60
## [97] rappdirs_0.3.3 boot_1.3-28.1 Matrix_1.6-0
## [100] readr_2.1.4 permute_0.9-7 cli_3.6.1
## [103] R.methodsS3_1.8.2 igraph_1.5.0 pkgconfig_2.0.3
## [106] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicAlignments_1.34.1 xml2_1.3.5
## [109] foreach_1.5.2 ggtree_3.6.2 bslib_0.5.0
## [112] rngtools_1.5.2 XVector_0.38.0 yulab.utils_0.0.6
## [115] doRNG_1.8.6 stringr_1.5.0 digest_0.6.33
## [118] Biostrings_2.66.0 rmarkdown_2.23 fastmatch_1.1-3
## [121] tidytree_0.4.2 DelayedMatrixStats_1.20.0 restfulr_0.0.15
## [124] curl_5.0.1 shiny_1.7.4.1 Rsamtools_2.14.0
## [127] gtools_3.9.4 rjson_0.2.21 lifecycle_1.0.3
## [130] nlme_3.1-162 outliers_0.15 jsonlite_1.8.7
## [133] Rhdf5lib_1.20.0 viridisLite_0.4.2 limma_3.54.2
## [136] BSgenome_1.66.3 fansi_1.0.4 pillar_1.9.0
## [139] lattice_0.21-8 plotrix_3.8-2 KEGGREST_1.38.0
## [142] fastmap_1.1.1 httr_1.4.6 GO.db_3.16.0
## [145] interactiveDisplayBase_1.36.0 glue_1.6.2 png_0.1-8
## [148] iterators_1.0.14 BiocVersion_3.16.0 bit_4.0.5
## [151] ggforce_0.4.1 stringi_1.7.12 sass_0.4.7
## [154] HDF5Array_1.26.0 blob_1.2.4 AnnotationHub_3.6.0
## [157] caTools_1.18.2 memoise_2.0.1 renv_1.0.9
## [160] ape_5.7-1