Skip to content

Commit

Permalink
adding cds_start
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Sep 13, 2024
1 parent d5c4cb2 commit c2ebb1c
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 3 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Version 0.9.9

- Updated NCBI gene information
- Added `cds_start` (strand-aware genomic position of the start codon) to GENCODE transcripts

# Version 0.9.7

- Fixed bug in Entrez gene identifiers (grch37)
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
1 change: 1 addition & 0 deletions data-raw/custom_gene_aliases.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ KIT c-Kit 3815
KIT c-kit 3815
MYC c-MYC 4609
MTOR mTOR 2475
RPTOR RAPTOR 57521
MYCN N-MYC 4613
ROS1 ROS-1 6098
CD274 B7-H1 29126
Expand Down
2 changes: 1 addition & 1 deletion data-raw/data-raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ for(vbump in c('major','minor','patch')){

bump_version_level <- "patch"
#version_bump <- version_bumps[[bump_version_level]]
version_bump <- "0.9.7"
version_bump <- "0.9.9"

gd_records <- list()
db_id_ref <- data.frame()
Expand Down
23 changes: 21 additions & 2 deletions data-raw/utils_gencode_annotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,21 @@ gencode_get_transcripts <-
#https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.chr_patch_hapl_scaff.annotation.gtf.gz

options(timeout = 10000000)
gencode_gtf_transcripts <- valr::read_gtf(remote_gtf_all) |>
gencode_gtf_all <- valr::read_gtf(remote_gtf_all)

cds_start_positions <- gencode_gtf_all |>
dplyr::filter(.data$type == "start_codon" & phase == 0) |>
dplyr::mutate(cds_start = dplyr::if_else(
.data$strand == "-",
as.numeric(.data$end),
as.numeric(.data$start)
)) |>
dplyr::select(c("transcript_id", "cds_start")) |>
dplyr::rename(
ensembl_transcript_id_full = "transcript_id") |>
dplyr::distinct()

gencode_gtf_transcripts <- gencode_gtf_all |>
dplyr::filter(type == "transcript")

if(gencode_version == 19){
Expand Down Expand Up @@ -147,7 +161,11 @@ gencode_get_transcripts <-
)) |>
dplyr::filter(!is.na(ensembl_gene_id) &
!is.na(ensembl_transcript_id)) |>
dplyr::distinct()
dplyr::distinct() |>
dplyr::left_join(
cds_start_positions,
by = "ensembl_transcript_id_full"
)

#
lgr::lgr$info(paste0(
Expand Down Expand Up @@ -309,6 +327,7 @@ gencode_get_transcripts <-
end,
transcript_start,
transcript_end,
cds_start,
strand,
ensembl_gene_id,
ensembl_gene_id_full,
Expand Down

0 comments on commit c2ebb1c

Please sign in to comment.