diff --git a/NEWS.md b/NEWS.md index 94ccfb9..4237c4f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# Version 0.9.9 + +- Updated NCBI gene information +- Added `cds_start` to GENCODE transcripts + # Version 0.9.7 - fixed bug in Entrez gene identifiers (grch37) diff --git a/R/sysdata.rda b/R/sysdata.rda index 685ba1c..91d5c81 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/data-raw/custom_gene_aliases.tsv b/data-raw/custom_gene_aliases.tsv index a148da3..f3e50f0 100644 --- a/data-raw/custom_gene_aliases.tsv +++ b/data-raw/custom_gene_aliases.tsv @@ -15,6 +15,7 @@ KIT c-Kit 3815 KIT c-kit 3815 MYC c-MYC 4609 MTOR mTOR 2475 +RPTOR RAPTOR 57521 MYCN N-MYC 4613 ROS1 ROS-1 6098 CD274 B7-H1 29126 diff --git a/data-raw/data-raw.R b/data-raw/data-raw.R index f2411e9..81bec69 100644 --- a/data-raw/data-raw.R +++ b/data-raw/data-raw.R @@ -270,7 +270,7 @@ for(vbump in c('major','minor','patch')){ bump_version_level <- "patch" #version_bump <- version_bumps[[bump_version_level]] -version_bump <- "0.9.7" +version_bump <- "0.9.9" gd_records <- list() db_id_ref <- data.frame() diff --git a/data-raw/utils_gencode_annotation.R b/data-raw/utils_gencode_annotation.R index f5d3a08..50e6baf 100644 --- a/data-raw/utils_gencode_annotation.R +++ b/data-raw/utils_gencode_annotation.R @@ -74,7 +74,21 @@ gencode_get_transcripts <- #https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.chr_patch_hapl_scaff.annotation.gtf.gz options(timeout = 10000000) - gencode_gtf_transcripts <- valr::read_gtf(remote_gtf_all) |> + gencode_gtf_all <- valr::read_gtf(remote_gtf_all) + + cds_start_positions <- gencode_gtf_all |> + dplyr::filter(.data$type == "start_codon" & phase == 0) |> + dplyr::mutate(cds_start = dplyr::if_else( + .data$strand == "-", + as.numeric(.data$end), + as.numeric(.data$start) + )) |> + dplyr::select(c("transcript_id", "cds_start")) |> + dplyr::rename( + ensembl_transcript_id_full = "transcript_id") |> + dplyr::distinct() + + gencode_gtf_transcripts <- gencode_gtf_all |> dplyr::filter(type == "transcript") if(gencode_version == 19){ @@ -147,7 +161,11 @@ gencode_get_transcripts <- )) |> dplyr::filter(!is.na(ensembl_gene_id) & !is.na(ensembl_transcript_id)) |> - dplyr::distinct() + dplyr::distinct() |> + dplyr::left_join( + cds_start_positions, + by = "ensembl_transcript_id_full" + ) # lgr::lgr$info(paste0( @@ -309,6 +327,7 @@ gencode_get_transcripts <- end, transcript_start, transcript_end, + cds_start, strand, ensembl_gene_id, ensembl_gene_id_full,