-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_genet.jl
226 lines (159 loc) · 8.1 KB
/
parse_genet.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
using DataFrames, CSV, Distributions, HDF5, LinearAlgebra
##read in the reference file and subset to a specific chromosome
function parse_ref(ref_file, chrom)
print("... parse reference file: $ref_file ...")
df = DataFrame(CSV.File(ref_file))
ref_dict = filter(:CHR => n -> n == chrom, df)
n_snp = nrow(ref_dict)
print("... $n_snp SNPs on chromosome $chrom read from $ref_file ...")
return ref_dict
end
function parse_bim(bim_file, chrom)
print("... parse bim file: $bim_file.bim ...")
df = DataFrame(CSV.File("$bim_file.bim", header = false))
#Assign appropriate names to the columns
#like, yes, this doesn't check to see if your column order is valid but also:
# 1. this is a script, not a production grade system
# 2. If i were a user i would simply have a correctly formatted bim file
rename!(df, [:CHR, :SNP, :POS ,:BP, :A1 , :A2])
#select SNPs on our chromosome of interest
vld_dict = select(filter(:CHR => n -> n == chrom, df), :SNP, :A1,:A2)
n_snp = nrow(vld_dict)
print("... $n_snp SNPs on chromosome $chrom read from $bim_file.bim ...")
return vld_dict
end
#function to plug into filter to make sure both allelles are valid
function valid_allelles(a1, a2)::Bool
valid_a1 = a1 in ["A","C","G","T"]
valid_a2 = a2 in ["A","C","G","T"]
valid_a1 && valid_a2
end
#function to generate valid pairs for allelles
#TODO: find a clever way to refactor this
function pair_allelles(a)
a2 = Vector{String}()
for i in 1:length(a)
if a[i] == "T"
push!(a2, "A")
end
if a[i] == "A"
push!(a2, "T")
end
if a[i] == "G"
push!(a2, "C")
end
if a[i] == "C"
push!(a2, "G")
end
end
return a2
end
function parse_sumstats(ref_dict, vld_dict, sst_file, n_subj)
print("... parse sumstats file: $sst_file ...")
#filter for valid allelles--no real need to split this across two lines
df = filter([:A1, :A2] => valid_allelles, DataFrame(CSV.File(sst_file)))
#SNPs and allelles in our GWAS summary statistics
sst_dict = select(df, :SNP, :A1, :A2)
##our "truth" is the validation dict, which has what we think is the right SNP/a1/a2
##here, for each snp we create dictionaries for A1/A2, A2/A1, A1/(correct pairing for A1),A2/correct pairing for A2
##We could just merge on "SNP" and take the .bim ordering--by not doing so,
## we imply that some of the snp/a1/a2 orderings in the bim file don't
## appear in the 4 sets of combinations we see above. I don't know how realistic this is
ref_snp = [select(ref_dict, :SNP,:A1,:A2);
select(rename(ref_dict, Dict(:A1 => :A2, :A2 => :A1)), :SNP,:A1,:A2);
DataFrame(SNP = ref_dict[:,:SNP], A1 = ref_dict[:,:A1], A2 = pair_allelles(ref_dict[:,:A1]));
DataFrame(SNP = ref_dict[:,:SNP],A1 = pair_allelles(ref_dict[:,:A2]), A2 = ref_dict[:,:A2])]
sst_snp = [select(sst_dict, :SNP,:A1,:A2);
select(rename(sst_dict, Dict(:A1 => :A2, :A2 => :A1)), :SNP,:A1,:A2);
DataFrame(SNP = sst_dict[:,:SNP], A1 = sst_dict[:,:A1], A2 = pair_allelles(sst_dict[:,:A1]));
DataFrame(SNP = sst_dict[:,:SNP],A1 = pair_allelles(sst_dict[:,:A2]), A2 = sst_dict[:,:A2])]
comm_snp = innerjoin(innerjoin(vld_dict, ref_snp, on = [:SNP, :A1, :A2]),sst_snp, on = [:SNP, :A1, :A2])
n_snp = nrow(comm_snp)
#convert the common snp dict to a vector of tuples
comm_snp = Tuple.(eachrow(comm_snp))
print("... $n_snp common SNPs in the reference, sumstats, and validation set ...")
#N subjects in GWAS
n_sqrt = sqrt(n_subj)
df[!,:A1_map] = pair_allelles(df[:,:A1])
df[!,:A2_map] = pair_allelles(df[:,:A2])
df[!, :BETA_STD] .= 0.0
Ndist = Normal(0,1)
if "OR" in names(df)
df[!, :BETA] = log.(df[:,:OR])
end
#for each SNP/(A1,A2) pairing in the sumstats file, see if it appears in our list of
# valid SNP choices
# would just be much faster to see if the sst snps are in the valiation dict
# this is to get flipped SNPs (probably not the most efficient way)
in_comm1 = Tuple.(eachrow(DataFrame(SNP = df[:,:SNP], A1 = df[:,:A1], A2 = df[:,:A2]))) .∈ Ref(comm_snp)
in_comm2 = Tuple.(eachrow(DataFrame(SNP = df[:,:SNP], A1 = df[:,:A1_map], A2 = df[:,:A2_map]))) .∈ Ref(comm_snp)
##standardize beta coefficients
df[in_comm1 .| in_comm2, :BETA_STD] = sign.(df[in_comm1 .| in_comm2, :BETA]) .* abs.(quantile.(Ndist, max.(df[in_comm1 .| in_comm2, :P]./2, 1e-323))) ./ n_sqrt
#standardize beta coefficients for flipped paits
in_comm3 = Tuple.(eachrow(DataFrame(SNP = df[:,:SNP], A1 = df[:,:A2], A2 = df[:,:A1]))) .∈ Ref(comm_snp)
in_comm4 = Tuple.(eachrow(DataFrame(SNP = df[:,:SNP], A1 = df[:,:A2_map], A2 = df[:,:A1_map]))) .∈ Ref(comm_snp)
df[in_comm3 .| in_comm4, :BETA_STD] = -1 .* sign.(df[in_comm3 .| in_comm4, :BETA]) .* abs.(quantile.(Ndist, max.(df[in_comm3 .| in_comm4, :P]./2, 1e-323))) ./ n_sqrt
#subset to valid SNPs
df = df[in_comm1 .| in_comm4 .| in_comm2 .| in_comm3,:]
#subset the reference LD dict to valid SNPs
in_sst = ref_dict[:,:SNP] .∈ Ref(df[:,:SNP])
#get the information from the reference dict and join it to the effect sizes
sst_dict = innerjoin(select(ref_dict, :CHR, :SNP, :BP,:A1,:A2, :MAF), select(df, :SNP, :BETA_STD), on = :SNP)
sst_dict[!,:A1_map] = pair_allelles(sst_dict[:,:A1])
sst_dict[!,:A2_map] = pair_allelles(sst_dict[:,:A2])
sst_dict[!,:tmp] = copy(sst_dict[:,:A2])
sst_dict[!, :FLP] .= 0
#detect flipped allelle sets
in_comm1 = Tuple.(eachrow(DataFrame(SNP = sst_dict[:,:SNP], A1 = sst_dict[:,:A1], A2 = sst_dict[:,:A2]))) .∈ Ref(comm_snp)
in_comm2 = Tuple.(eachrow(DataFrame(SNP = sst_dict[:,:SNP], A1 = sst_dict[:,:A1_map], A2 = sst_dict[:,:A2_map]))) .∈ Ref(comm_snp)
in_comm3 = Tuple.(eachrow(DataFrame(SNP = sst_dict[:,:SNP], A1 = sst_dict[:,:A2], A2 = sst_dict[:,:A1]))) .∈ Ref(comm_snp)
in_comm4 = Tuple.(eachrow(DataFrame(SNP = sst_dict[:,:SNP], A1 = sst_dict[:,:A2_map], A2 = sst_dict[:,:A1_map]))) .∈ Ref(comm_snp)
sst_dict[in_comm2,:A1] = sst_dict[in_comm2,:A1_map]
sst_dict[in_comm2,:A2] = sst_dict[in_comm2,:A2_map]
sst_dict[in_comm1 .| in_comm2, :FLP] .= 1
sst_dict[in_comm3,:tmp] = sst_dict[in_comm3,:A1]
sst_dict[in_comm3,:A1] = sst_dict[in_comm3,:A2]
sst_dict[in_comm3,:A2] = sst_dict[in_comm3,:tmp]
sst_dict[in_comm4,:A1] = sst_dict[in_comm2,:A2_map]
sst_dict[in_comm4,:A2] = sst_dict[in_comm2,:A1_map]
sst_dict[in_comm3 .| in_comm4, :FLP] .= -1
sst_dict[in_comm3 .| in_comm4, :MAF] = 1. .- sst_dict[in_comm3 .| in_comm4, :MAF]
sst_dict[!,:BETA] = sst_dict[:, :BETA_STD]
ret_df = select(sst_dict, :CHR, :SNP, :BP, :A1, :A2, :MAF, :BETA, :FLP)
return ret_df
end
function parse_ldblk(ldblk_dir, sst_dict, chrom)
print("... parse reference LD on chromosome $chrom ...")
if occursin("1kg", ldblk_dir)
chr_name = "$ldblk_dir/ldblk_1kg_chr$chrom.hdf5"
end
if occursin("ukbb", ldblk_dir)
chr_name = "$ldblk_dir/ldblk_ukbb_chr$chrom.hdf5"
end
hdf_chr = h5open(chr_name, "r")
n_blk = length(hdf_chr)
###NOTE: THIS LD BLOCK IS NOT THE SAME AS THE PYTHON VERSION--check block 1
ld_blk = [read(hdf_chr["blk_$i"]["ldblk"]) for i in 1:n_blk]
snp_blk = [read(hdf_chr["blk_$i"]["snplist"]) for i in 1:n_blk]
idx_blks = [[j for j in 1:length(i) if i[j] in sst_dict[:,:SNP]] for i in snp_blk]
blk_size = length.(idx_blks)
mm = 0
for i in 1:n_blk
if idx_blks[i] != []
flip = sst_dict[(mm+1):(mm + blk_size[i]), :FLP]
#negative correlations for flipped alleles
ld_blk[i] = ld_blk[i][idx_blks[i], idx_blks[i]] .* (flip * flip')
U, S, V = svd(ld_blk[i])
##reconstruct the matrix, take the average of the reconstructed values and
##original
h = V*Diagonal(S)*V'
ld_blk[i] = (ld_blk[i] .+ h) ./ 2
mm = mm + blk_size[i]
end
#empty blocks
if idx_blks[i] == []
ld_blk[i] = Array{Float64}(undef, 0,0)
end
end
return ld_blk, blk_size
end