-
Notifications
You must be signed in to change notification settings - Fork 0
/
geneID_converter.r
157 lines (142 loc) · 6.03 KB
/
geneID_converter.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# geneID_converter.r
# Ashley Conard
# Last Modified: Aug. 9, 2019
# Resource: function get.symbolIDsDm TAKEN FROM https://www.researchgate.net/publication/308990864_R_function_getsymbolIDsDm_to_convert_Uniprot_Flybase_etc_to_gene_symbol
# args = commandArgs(trailingOnly=TRUE)
# if (length(args)==0) {
# stop("Type: /usr/local/bin/Rscript clusterProfiler.r /PATH/TO/INPUT_OUTPUT_DIR/ (Pass in Input/Output Directory - will be the same - e.g. /Users/ashleymaeconard/Desktop/RESULTS/Feb4_min5_clusters/) OVERLAP_NOT_EXPERIMENTS (set to 1 to run overlap comparison, 0 otherwise)", call.=FALSE)
# } else if (length(args) == 2) {
# cat("Passed in:", args,"\n")
# } else{
# stop("Pass in 1) Input/Output Directory (e.g. /Users/ashleymaeconard/Desktop/RESULTS/Feb4_min5_clusters/) 2) OVERLAP_NOT_EXPERIMENTS (set to 1 to run overlap comparison, 0 otherwise)")
# }
get.symbolIDsDm <- function(id, id.type) {
  # ************************************************
  # Convert Drosophila melanogaster gene identifiers to gene symbols.
  #
  # get.symbolIDs function programmed by
  #   Benjamin Tovar | February 25, 2014
  #   http://tata-box-blog.blogspot.de/2014/02/convert-ensembl-unigene-uniprot-and.html
  # modified to get.symbolIDsDm to work with Drosophila melanogaster by
  #   Christoph Metzendorf | October 11, 2016
  #
  # ARGUMENTS
  #   id      = vector of the original IDs, for example (FlyBase IDs):
  #             c("FBgn0040373", "FBgn0040372", "FBgn0261446")
  #   id.type = single string naming the type of the original IDs; one of
  #             "ensembl", "uniprot", "refseq", "unigene", "flybase"
  #             (case-sensitive).
  #
  # VALUE
  #   Character vector of gene symbols, same length and order as `id`.
  #   IDs that cannot be mapped yield NA. If `id.type` is not supported,
  #   a message is printed and a single NA is returned.
  #
  # DEPENDENCIES
  #   Bioconductor package org.Dm.eg.db, which can be installed with:
  #     install.packages("BiocManager")
  #     BiocManager::install("org.Dm.eg.db")
  #
  # USAGE EXAMPLES (the same pattern works for every supported id.type)
  #   library(org.Dm.eg.db)
  #   flybase <- toTable(org.Dm.egFLYBASE)
  #   res <- get.symbolIDsDm(flybase[1:100, 2], "flybase")
  #
  #   ensembl <- toTable(org.Dm.egENSEMBL)
  #   res <- get.symbolIDsDm(ensembl[1:100, 2], "ensembl")
  #   # likewise: org.Dm.egUNIPROT / "uniprot", org.Dm.egREFSEQ / "refseq",
  #   #           org.Dm.egUNIGENE / "unigene"
  # ************************************************

  # Validate id.type first so an unsupported request fails fast, before the
  # (slow) annotation tables are loaded.
  supported_types <- c("ensembl", "uniprot", "refseq", "unigene", "flybase")
  type_ok <- is.character(id.type) && length(id.type) == 1L &&
    !is.na(id.type) && id.type %in% supported_types
  if (!type_ok) {
    cat("** ERROR: DATABASE TYPE NOT SELECTED | TRY: ensembl OR unigene OR uniprot OR refseq OR flybase **", date(), "\n")
    return(NA)
  }

  # LOAD THE ANNOTATION INFORMATION
  cat("Note: Running function get.symbolIDsDm from Christoph Metzendorf.\n")
  cat("1) Loading annotation library", date(), "\n")
  # library() (unlike require()) stops with an error if the package is missing.
  library(org.Dm.eg.db)
  cat("2) Loading annotation for symbol IDs", date(), "\n")
  symbol <- toTable(org.Dm.egSYMBOL)

  cat("3) Loading annotation for", id.type, "IDs", date(), "\n")
  # switch() evaluates only the selected branch, so only the requested
  # mapping object is touched.
  annotation <- switch(id.type,
    ensembl = toTable(org.Dm.egENSEMBL),
    uniprot = toTable(org.Dm.egUNIPROT),
    refseq  = toTable(org.Dm.egREFSEQ),
    unigene = toTable(org.Dm.egUNIGENE),
    flybase = toTable(org.Dm.egFLYBASE)
  )

  # Column 1 of each table is the gene_id, column 2 the external ID / symbol.
  # match() returns the first hit per ID and NA when there is none, so
  # unmapped IDs propagate as NA instead of raising an error. Gene IDs are
  # compared as-is (no numeric round-trip, which could reformat e.g. 100000
  # as "1e+05" and break the lookup).
  gene_ids <- annotation[match(id, annotation[, 2], incomparables = NA), 1]
  symbol[match(gene_ids, symbol[, 1], incomparables = NA), 2]
}