1. Environment setting

library(GenomicRanges)
library(plyranges)
library(biomaRt)
library(ChIPseeker)
library(org.Hs.eg.db)
library(annotatr)

Making txdb object from GTF file

OutputFolder <- params$OutputFolder

GTFFile <- "~/DataDir/3.TwistBedAnn/Input/gencode.v35.annotation.gtf.gz"
txdb_v35 <- GenomicFeatures::makeTxDbFromGFF(GTFFile, format="gtf")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type stop_codon. This information was
## ignored.
## OK

Importing bed file with regions targeted by Twist kit

TwistBed <- params$TwistBed
Twistbed <- as.data.frame(read.table(TwistBed,header = FALSE, sep="\t",stringsAsFactors=FALSE, quote=""))

colnames(Twistbed) <- c("chr", "start", "end", "ann")

Remember that position form bedGraph files are 0-based while R works with 1-based position and therefore also the packages we will go to use, this is the reason why I set starts.in.df.are.0based = TRUE in creating the GRanges objects, it converts 0-based to 1-based coordinates.

Twist_Granges <- makeGRangesFromDataFrame(Twistbed, keep.extra.columns = TRUE, 
                                                start.field = "start", end.field = "end", starts.in.df.are.0based = TRUE)

2. Genetic annotation and visualization with `ChIPseeker`

DMRs are now annotated to their associated genes and genomic regions using the annotatePeaks function of the ChIPseekerR package, using a TxDb object. The GenomicFeatures package uses TxDb objects to store transcript metadata. This class maps the 5’ and 3’ untranslated regions (UTRs), protein coding sequences (CDSs) and exons for a set of mRNA transcripts to their associated genome. All TxDb objects are backed by a SQLite database that manages genomic locations and the relationships between pre-processed mRNA transcripts, exons, protein coding sequences, and their related gene identifiers.

peakAnno <- annotatePeak(Twist_Granges, TxDb=txdb_v35, tssRegion=c(-3000, 3000), verbose=TRUE, addFlankGeneInfo = TRUE) 
## >> preparing features information...      2024-09-27 06:56:08 PM 
## >> identifying nearest features...        2024-09-27 06:56:08 PM 
## >> calculating distance from peak to TSS...   2024-09-27 06:56:13 PM 
## >> assigning genomic annotation...        2024-09-27 06:56:13 PM 
## >> adding flank feature information from peaks...     2024-09-27 06:56:36 PM 
## >> assigning chromosome lengths           2024-09-27 06:59:09 PM 
## >> done...                    2024-09-27 06:59:09 PM

plotAnnoPie(peakAnno)

plotAnnoBar(peakAnno)

vennpie(peakAnno)

plotDistToTSS(peakAnno,
              title="Distribution of transcription factor-binding loci\nrelative to TSS")

3. Gene Annotation with biomaRt

Host <- "https://aug2020.archive.ensembl.org"
Specie <- "hsapiens"
BioMart <- biomaRt::listMarts(host=Host)[1,1]
Version <- biomaRt::listMarts(host=Host)[1,2]
Mart <- biomaRt::useMart(host=Host, biomart=BioMart, version=Version, 
                         dataset=paste0(Specie,'_gene_ensembl'))

GeneVector = unique(as.data.frame(peakAnno)$geneId) #Genes assigned to the regions by chipseeker

Attributes = c('ensembl_gene_id', 'ensembl_gene_id_version', 'hgnc_symbol', 'external_gene_name', 'gene_biotype',
               'description', 'chromosome_name', 'start_position', 'end_position')

GeneAnnotation <- biomaRt::getBM(mart=Mart, filters='ensembl_gene_id_version', values=GeneVector, attributes=Attributes, uniqueRows=TRUE)

GeneAnnotation <- GeneAnnotation[GeneAnnotation$chromosome_name %in% c(1:22, "X", "Y", "MT"),]
GeneAnnotation <- GeneAnnotation %>% dplyr::filter(!duplicated(ensembl_gene_id_version))  

if(length(unique(GeneAnnotation$ensembl_gene_id_version)) != length(GeneAnnotation$ensembl_gene_id_version)){
  stop('ERROR: not-unique EnsemblID in annotation!')
} 

GeneVector[!GeneVector %in% GeneAnnotation$ensembl_gene_id_version] #ChipSeeker annotation that are not present in the biomart
##  [1] "ENSG00000228572.7_PAR_Y"  "ENSG00000182378.14_PAR_Y" "ENSG00000178605.13_PAR_Y" "ENSG00000226179.6_PAR_Y"  "ENSG00000167393.18_PAR_Y"
##  [6] "ENSG00000275287.5_PAR_Y"  "ENSG00000280767.3_PAR_Y"  "ENSG00000234958.6_PAR_Y"  "ENSG00000229232.6_PAR_Y"  "ENSG00000185960.14_PAR_Y"
## [11] "ENSG00000225661.7_PAR_Y"  "ENSG00000205755.12_PAR_Y" "ENSG00000198223.16_PAR_Y" "ENSG00000185291.12_PAR_Y" "ENSG00000169100.14_PAR_Y"
## [16] "ENSG00000236871.7_PAR_Y"  "ENSG00000236017.8_PAR_Y"  "ENSG00000169093.16_PAR_Y" "ENSG00000182162.11_PAR_Y" "ENSG00000197976.12_PAR_Y"
## [21] "ENSG00000196433.13_PAR_Y" "ENSG00000223511.7_PAR_Y"  "ENSG00000234622.6_PAR_Y"  "ENSG00000169084.15_PAR_Y" "ENSG00000223571.6_PAR_Y" 
## [26] "ENSG00000214717.12_PAR_Y" "ENSG00000277120.5_PAR_Y"  "ENSG00000223773.7_PAR_Y"  "ENSG00000230542.6_PAR_Y"  "ENSG00000002586.20_PAR_Y"
## [31] "ENSG00000237801.6_PAR_Y"  "ENSG00000237040.6_PAR_Y"  "ENSG00000124333.16_PAR_Y" "ENSG00000228410.6_PAR_Y"  "ENSG00000124334.17_PAR_Y"
## [36] "ENSG00000270726.6_PAR_Y"

AnnotationFinal <- dplyr::left_join(as.data.frame(peakAnno), GeneAnnotation, by=c('geneId' = 'ensembl_gene_id_version')) 

AnnotationFinal <- AnnotationFinal %>% rename("ensembl_gene_id_version"="geneId")

Here I save gene info

Genes_df <- AnnotationFinal[, c("ensembl_gene_id_version", "ensembl_gene_id", "hgnc_symbol", "gene_biotype")] %>% dplyr::distinct()%>%filter(!is.na(hgnc_symbol))%>%filter(!hgnc_symbol%in%"")
rownames(Genes_df) <- NULL

length(Genes_df$hgnc_symbol)
## [1] 36016

length(unique(Genes_df$hgnc_symbol))
## [1] 36011

Genes_df[Genes_df$hgnc_symbol%in%Genes_df[duplicated(Genes_df$hgnc_symbol), "hgnc_symbol"],]
##       ensembl_gene_id_version ensembl_gene_id hgnc_symbol                       gene_biotype
## 3309        ENSG00000285053.1 ENSG00000285053        TBCE                     protein_coding
## 3310        ENSG00000284770.2 ENSG00000284770        TBCE                     protein_coding
## 20449       ENSG00000237940.3 ENSG00000237940   LINC01238                             lncRNA
## 20451       ENSG00000261186.2 ENSG00000261186   LINC01238                             lncRNA
## 21924      ENSG00000206195.11 ENSG00000206195      DUXAP8                             lncRNA
## 21926       ENSG00000271672.1 ENSG00000271672      DUXAP8   transcribed_processed_pseudogene
## 24581       ENSG00000284862.3 ENSG00000284862      CCDC39                     protein_coding
## 24582      ENSG00000145075.13 ENSG00000145075      CCDC39                             lncRNA
## 30308       ENSG00000272655.2 ENSG00000272655     POLR2J4 transcribed_unprocessed_pseudogene
## 30309       ENSG00000214783.9 ENSG00000214783     POLR2J4                             lncRNA

4. Saving and session info

saveRDS(AnnotationFinal, paste0(OutputFolder, "TwistAnnotated.rds"))

date()

## [1] "Fri Sep 27 18:59:23 2024"

sessionInfo()

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8   
##  [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] annotatr_1.24.0             org.Hs.eg.db_3.16.0         AnnotationDbi_1.60.2        ChIPseeker_1.34.1           biomaRt_2.54.1             
##  [6] plyranges_1.18.0            scales_1.2.1                plotly_4.10.2               DSS_2.46.0                  BiocParallel_1.32.6        
## [11] methylSig_1.10.0            dmrseq_1.18.1               DT_0.28                     data.table_1.14.8           tidyr_1.3.0                
## [16] dplyr_1.1.2                 ggrepel_0.9.3               ggplot2_3.4.2               bsseq_1.34.0                SummarizedExperiment_1.28.0
## [21] Biobase_2.58.0              MatrixGenerics_1.10.0       matrixStats_1.0.0           GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
## [26] IRanges_2.32.0              S4Vectors_0.36.2            BiocGenerics_0.44.0        
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                              R.utils_2.12.2                          tidyselect_1.2.0                       
##   [4] RSQLite_2.3.1                           htmlwidgets_1.6.2                       grid_4.2.1                             
##   [7] scatterpie_0.1.8                        munsell_0.5.0                           codetools_0.2-19                       
##  [10] withr_2.5.0                             colorspace_2.1-0                        GOSemSim_2.24.0                        
##  [13] filelock_1.0.2                          highr_0.10                              knitr_1.43                             
##  [16] rstudioapi_0.15.0                       DOSE_3.24.2                             labeling_0.4.2                         
##  [19] GenomeInfoDbData_1.2.9                  polyclip_1.10-4                         bit64_4.0.5                            
##  [22] farver_2.1.1                            rhdf5_2.42.1                            treeio_1.23.1                          
##  [25] vctrs_0.6.3                             generics_0.1.3                          xfun_0.39                              
##  [28] BiocFileCache_2.6.1                     regioneR_1.30.0                         R6_2.5.1                               
##  [31] graphlayouts_0.8.4                      locfit_1.5-9.7                          gridGraphics_0.5-1                     
##  [34] bitops_1.0-7                            rhdf5filters_1.10.1                     cachem_1.0.8                           
##  [37] fgsea_1.24.0                            DelayedArray_0.24.0                     promises_1.2.0.1                       
##  [40] BiocIO_1.8.0                            ggraph_2.1.0                            enrichplot_1.18.4                      
##  [43] gtable_0.3.3                            tidygraph_1.2.3                         rlang_1.1.1                            
##  [46] splines_4.2.1                           rtracklayer_1.58.0                      lazyeval_0.2.2                         
##  [49] BiocManager_1.30.25                     yaml_2.3.7                              reshape2_1.4.4                         
##  [52] GenomicFeatures_1.50.4                  crosstalk_1.2.0                         httpuv_1.6.11                          
##  [55] qvalue_2.30.0                           tools_4.2.1                             ggplotify_0.1.0                        
##  [58] gplots_3.1.3                            ellipsis_0.3.2                          jquerylib_0.1.4                        
##  [61] RColorBrewer_1.1-3                      Rcpp_1.0.11                             plyr_1.8.8                             
##  [64] sparseMatrixStats_1.10.0                progress_1.2.2                          zlibbioc_1.44.0                        
##  [67] purrr_1.0.1                             RCurl_1.98-1.12                         prettyunits_1.1.1                      
##  [70] viridis_0.6.2                           cowplot_1.1.1                           bumphunter_1.40.0                      
##  [73] magrittr_2.0.3                          patchwork_1.1.2                         hms_1.1.3                              
##  [76] mime_0.12                               evaluate_0.21                           xtable_1.8-4                           
##  [79] HDO.db_0.99.1                           XML_3.99-0.14                           gridExtra_2.3                          
##  [82] compiler_4.2.1                          tibble_3.2.1                            KernSmooth_2.23-22                     
##  [85] shadowtext_0.1.2                        crayon_1.5.2                            R.oo_1.25.0                            
##  [88] htmltools_0.5.5                         ggfun_0.0.9                             later_1.3.1                            
##  [91] tzdb_0.4.0                              aplot_0.1.10                            DBI_1.1.3                              
##  [94] tweenr_2.0.2                            dbplyr_2.3.3                            MASS_7.3-60                            
##  [97] rappdirs_0.3.3                          boot_1.3-28.1                           Matrix_1.6-0                           
## [100] readr_2.1.4                             permute_0.9-7                           cli_3.6.1                              
## [103] R.methodsS3_1.8.2                       igraph_1.5.0                            pkgconfig_2.0.3                        
## [106] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicAlignments_1.34.1                xml2_1.3.5                             
## [109] foreach_1.5.2                           ggtree_3.6.2                            bslib_0.5.0                            
## [112] rngtools_1.5.2                          XVector_0.38.0                          yulab.utils_0.0.6                      
## [115] doRNG_1.8.6                             stringr_1.5.0                           digest_0.6.33                          
## [118] Biostrings_2.66.0                       rmarkdown_2.23                          fastmatch_1.1-3                        
## [121] tidytree_0.4.2                          DelayedMatrixStats_1.20.0               restfulr_0.0.15                        
## [124] curl_5.0.1                              shiny_1.7.4.1                           Rsamtools_2.14.0                       
## [127] gtools_3.9.4                            rjson_0.2.21                            lifecycle_1.0.3                        
## [130] nlme_3.1-162                            outliers_0.15                           jsonlite_1.8.7                         
## [133] Rhdf5lib_1.20.0                         viridisLite_0.4.2                       limma_3.54.2                           
## [136] BSgenome_1.66.3                         fansi_1.0.4                             pillar_1.9.0                           
## [139] lattice_0.21-8                          plotrix_3.8-2                           KEGGREST_1.38.0                        
## [142] fastmap_1.1.1                           httr_1.4.6                              GO.db_3.16.0                           
## [145] interactiveDisplayBase_1.36.0           glue_1.6.2                              png_0.1-8                              
## [148] iterators_1.0.14                        BiocVersion_3.16.0                      bit_4.0.5                              
## [151] ggforce_0.4.1                           stringi_1.7.12                          sass_0.4.7                             
## [154] HDF5Array_1.26.0                        blob_1.2.4                              AnnotationHub_3.6.0                    
## [157] caTools_1.18.2                          memoise_2.0.1                           renv_1.0.9                             
## [160] ape_5.7-1

Genomic Annotation of Twist bed file with target regions

Gaja Matassa

Date: September 27, 2024

1. Environment setting

2. Genetic annotation and visualization with `ChIPseeker`

3. Gene Annotation with biomaRt

4. Saving and session info

Genomic Annotation of Twist bed file with target regions

Gaja Matassa

Date: September 27, 2024

1. Environment setting

2. Genetic annotation and visualization with ChIPseeker

3. Gene Annotation with biomaRt

4. Saving and session info

2. Genetic annotation and visualization with `ChIPseeker`