-
Notifications
You must be signed in to change notification settings - Fork 36
/
make_10x_clones_file.py
494 lines (408 loc) · 19.8 KB
/
make_10x_clones_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
from basic import *
import parse_tsv
from collections import Counter
from itertools import chain
import sys
def show(tcr):
"For debugging"
if type( tcr[0] ) is str:
return ' '.join(tcr[:3])
else:
return ' '.join( show(x) for x in tcr )
# yes, this is very silly, should just add pandas dependency
def parse_csv_file( csvfile ):
header = None
all_info = []
for line in open(csvfile,'rU'):
l = parse_tsv.safely_split_csv_line( line[:-1] )
if header is None:
header = l
else:
all_info.append( dict( zip( header,l ) ) )
return header, all_info
def read_tcr_data(
organism,
contig_annotations_csvfile,
consensus_annotations_csvfile,
include_gammadelta = False,
allow_unknown_genes = False,
verbose = False
):
""" Parse tcr data, only taking 'productive' tcrs
Returns:
clonotype2tcrs, clonotype2barcodes
"""
from all_genes import all_genes
expected_gene_names = all_genes[organism].keys()
#from cdr3s_human import all_align_fasta
gene_suffix = '*01' # may not be used
# read the contig annotations-- map from clonotypes to barcodes
# barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
# AAAGATGGTCTTCTCG-1,True,AAAGATGGTCTTCTCG-1_contig_1,True,695,TRB,TRBV5-1*01,TRBD2*02,TRBJ2-3*01,TRBC2*01,True,True,CASSPLAGYAADTQYF,TGCGCCAGCAGCCCCCTAGCGGGATACGCAGCAGATACGCAGTATTTT,9427,9,clonotype14,clonotype14_consensus_1
assert exists( contig_annotations_csvfile )
_, lines = parse_csv_file(contig_annotations_csvfile)
clonotype2barcodes = {}
clonotype2tcrs_backup = {} ## in case we dont have a consensus_annotations_csvfile
for l in lines:
bc = l['barcode']
clonotype = l['raw_clonotype_id']
if clonotype =='None':
if l['productive'] not in [ 'None','False' ]:
assert l['productive'] == 'True'
#print 'clonotype==None: unproductive?',l['productive']
continue
if clonotype not in clonotype2barcodes:
clonotype2barcodes[clonotype] = []
if bc in clonotype2barcodes[clonotype]:
pass
#print 'repeat barcode'
else:
clonotype2barcodes[clonotype].append( bc )
if not clonotype:
print 'empty clonotype id:', l
continue
assert clonotype
## experimenting here ########################################3
if l['productive'].lower() != 'true':
continue
if l['cdr3'].lower() == 'none' or l['cdr3_nt'].lower() == 'none':
continue
chain = l['chain']
if chain not in ['TRA','TRB']:
continue
ab = chain[2]
if clonotype not in clonotype2tcrs_backup:
clonotype2tcrs_backup[ clonotype ] = {'A':Counter(), 'B':Counter() }
# stolen from below
vg = l['v_gene']
if '*' not in vg:
vg += gene_suffix
if 'DV' in vg and vg not in expected_gene_names:
#print 'DV?',vg
vg = vg[:vg.index('DV')]+'/'+vg[vg.index('DV'):]
jg = l['j_gene']
if '*' not in jg:
jg += gene_suffix
if vg not in expected_gene_names:
print 'unrecognized V gene:', organism, vg
if not allow_unknown_genes:
continue
if jg not in expected_gene_names:
print 'unrecognized J gene:', organism, jg
if not allow_unknown_genes:
continue
#assert vg in all_align_fasta[organism]
#assert jg in all_align_fasta[organism]
tcr_chain = ( vg, jg, l['cdr3'], l['cdr3_nt'].lower() )
clonotype2tcrs_backup[clonotype][ab][tcr_chain] += int(l['umis'])
for id in clonotype2tcrs_backup:
for ab in 'AB':
for t1,count1 in clonotype2tcrs_backup[id][ab].iteritems():
for t2, count2 in clonotype2tcrs_backup[id][ab].iteritems():
if t2<=t1:continue
if t1[3] == t2[3]:
print 'repeat??', count1, count2, t1, t2
if consensus_annotations_csvfile is None:
clonotype2tcrs = clonotype2tcrs_backup
else:
## now read details on the individual chains for each clonotype
# ==> tcr/human/JCC176_TX2_TCR_consensus_annotations.csv <==
# clonotype_id,consensus_id,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis
# clonotype100,clonotype100_consensus_1,550,TRB,TRBV24-1*01,TRBD1*01,TRBJ2-7*01,TRBC2*01,True,True,CATSDPGQGGYEQYF,TGTGCCACCAGTGACCCCGGACAGGGAGGATACGAGCAGTACTTC,8957,9
assert exists(consensus_annotations_csvfile)
_, lines = parse_csv_file( consensus_annotations_csvfile )
## first get clonotypes with one alpha and one beta
clonotype2tcrs = {}
for l in lines:
if l['productive'] == 'True':
id = l['clonotype_id']
if id not in clonotype2tcrs:
# dictionaries mapping from tcr to umi-count
clonotype2tcrs[id] = { 'A':Counter(), 'B':Counter() } #, 'G':[], 'D': [] }
assert id in clonotype2barcodes
ch = l['chain']
if not ch.startswith('TR'):
print 'skipline:', consensus_annotations_csvfile, ch, l['v_gene'], l['j_gene']
continue
ab = ch[2]
if ab not in 'AB':
print 'skipline:', consensus_annotations_csvfile, ch, l['v_gene'], l['j_gene']
continue
vg = l['v_gene']
if '*' not in vg:
vg += gene_suffix
if 'DV' in vg and vg not in expected_gene_names:
#print 'DV?',vg
vg = vg[:vg.index('DV')]+'/'+vg[vg.index('DV'):]
jg = l['j_gene']
if '*' not in jg:
jg += gene_suffix
# if vg in tcr_gene_remap[organism]:
# vg = tcr_gene_remap[organism][vg]
# if jg in tcr_gene_remap[organism]:
# jg = tcr_gene_remap[organism][jg]
if vg not in expected_gene_names:
print 'unrecognized V gene:', organism, vg
if not allow_unknown_genes:
continue
if jg not in expected_gene_names:
print 'unrecognized J gene:', organism, jg
if not allow_unknown_genes:
continue
#assert vg in all_align_fasta[organism]
#assert jg in all_align_fasta[organism]
tcr_chain = ( vg, jg, l['cdr3'], l['cdr3_nt'].lower() )
if tcr_chain not in clonotype2tcrs[id][ab]:
umis = int( l['umis'] )
clonotype2tcrs[id][ab][ tcr_chain ] = umis
old_umis = clonotype2tcrs_backup[id][ab][tcr_chain]
if umis != old_umis:
print 'diff_umis:',umis, old_umis, id,ab,tcr_chain
else:
print 'repeat?',id,ab,tcr_chain
else:
if l['productive'] not in [ 'None','False' ]:
print 'unproductive?',l['productive']
if verbose:
idl1 = sorted( clonotype2tcrs_backup.keys())
idl2 = sorted( clonotype2tcrs.keys())
print 'same ids:', len(idl1), len(idl2), idl1==idl2
for id in clonotype2tcrs_backup:
if id in clonotype2tcrs:
for ab in 'AB':
tl1 = sorted(clonotype2tcrs_backup[id][ab].keys())
tl2 = sorted(clonotype2tcrs[id][ab].keys())
if tl1 != tl2:
print 'diffids:',id,ab,tl1,tl2
return clonotype2tcrs, clonotype2barcodes
def make_clones_file( organism, outfile, clonotype2tcrs, clonotype2barcodes ):
''' Make a clones file with information parsed from the 10X csv files
organism is one of ['mouse','human']
outfile is the name of the clones file to be created
'''
tmpfile = outfile+'.tmp' # a temporary intermediate file
bc_mapfile = outfile+'.barcode_mapping.tsv'
outmap = open(bc_mapfile,'w')
outmap.write('clone_id\tbarcodes\n')
outfields = 'clone_id subject clone_size va_gene ja_gene vb_gene jb_gene cdr3a cdr3a_nucseq cdr3b cdr3b_nucseq'\
.split()
extra_fields = 'alpha_umi beta_umi num_alphas num_betas'.split()
outfields += extra_fields
out = open(tmpfile,'w')
out.write('\t'.join( outfields )+'\n' )
for clonotype in sorted(clonotype2tcrs.keys()):
tcrs = clonotype2tcrs[clonotype]
if len(tcrs['A']) >= 1 and len(tcrs['B']) >= 1:
atcrs = tcrs['A'].most_common()
btcrs = tcrs['B'].most_common()
if len(atcrs)>1:
print 'multiple alphas, picking top umi:',' '.join( str(x) for _,x in atcrs )
if len(btcrs)>1:
print 'multiple betas, picking top umi:',' '.join( str(x) for _,x in btcrs )
atcr, atcr_umi = atcrs[0]
btcr, btcr_umi = btcrs[0]
outl = {}
outl['clone_id'] = clonotype
outl['subject'] = 'UNK_S'
outl['clone_size'] = len(clonotype2barcodes[clonotype])
outl['va_gene'] = atcr[0]
outl['ja_gene'] = atcr[1]
outl['cdr3a'] = atcr[2]
outl['cdr3a_nucseq'] = atcr[3]
outl['alpha_umi'] = str(atcr_umi)
outl['num_alphas'] = str(len(atcrs))
outl['vb_gene'] = btcr[0]
outl['jb_gene'] = btcr[1]
outl['cdr3b'] = btcr[2]
outl['cdr3b_nucseq'] = btcr[3]
outl['beta_umi'] = str(btcr_umi)
outl['num_betas'] = str(len(btcrs))
out.write( make_tsv_line(outl,outfields)+'\n' )
outmap.write('{}\t{}\n'.format(clonotype,','.join(clonotype2barcodes[clonotype])))
out.close()
outmap.close()
python_exe = sys.executable
cmd = '{} {}/file_converter.py --input_format clones --output_format clones --input_file {} --output_file {} --organism {} --clobber --epitope UNK_E --extra_fields {} '\
.format( python_exe, paths.path_to_scripts, tmpfile, outfile, organism, ' '.join(extra_fields) )
print cmd
system(cmd)
def setup_filtered_clonotype_dicts( clonotype2tcrs, clonotype2barcodes, min_repeat_count_fraction = 0.33):
''' returns new_clonotype2tcrs, new_clonotype2barcodes
'''
# get count of how many cells support each pairing
#
pairing_counts = Counter()
for cid, tcrs in clonotype2tcrs.iteritems():
size = len(clonotype2barcodes[cid])
for t1 in tcrs['A']:
for t2 in tcrs['B']:
pairing_counts[ (t1,t2) ] += size
# this is no longer going to be a bijection!
chain_partner = {}
valid_ab_pairings = set()
for ( (t1,t2), count ) in list(pairing_counts.most_common()): # make a copy for sanity
if t1 in chain_partner or t2 in chain_partner:
t1_p2 = chain_partner.get(t1,None)
t2_p2 = chain_partner.get(t2,None)
oldcount1 = pairing_counts[ (t1,t1_p2) ]
oldcount2 = pairing_counts[ (t2_p2,t2) ]
#print 'repeat:', count, oldcount1, oldcount2,\
# t1, t2, t1_p2, t2_p2
if count >= min_repeat_count_fraction * max(oldcount1,oldcount2):
# take it anyway -- second alpha or genuinely shared alpha?
print 'take_rep_partners:', count, oldcount1, oldcount2, t1, t2, t1_p2, t2_p2
# dont overwrite the old pairing... might not do either of these!
if t1 not in chain_partner:
chain_partner[t1] = t2
if t2 not in chain_partner:
chain_partner[t2] = t1
valid_ab_pairings.add( (t1, t2 ) )
else:
print 'skip_rep_partners:', count, oldcount1, oldcount2, t1, t2, t1_p2, t2_p2
else: # neither chain already in chain_partner
#
# NOTE: removed the code checking for TIES!!!!!!!!!!!
print 'norep_partners:', count, t1, t2
chain_partner[t1] = t2
chain_partner[t2] = t1
valid_ab_pairings.add( ( t1, t2 ) )
# now let's revisit the clonotypes
pairs_tuple2clonotypes = {}
ab_counts = Counter() # for diagnostics
for (clone_size, cid) in reversed( sorted( (len(y), x) for x,y in clonotype2barcodes.iteritems() ) ):
if cid not in clonotype2tcrs:
print 'WHOAH missing tcrs for clonotype', clone_size, cid, clonotype2barcodes[cid]
continue
tcrs = clonotype2tcrs[cid]
was_good_clone = len(tcrs['A']) >= 1 and len(tcrs['B']) >= 1
pairs = []
for atcr in tcrs['A']:
for btcr in tcrs['B']:
if ( atcr,btcr ) in valid_ab_pairings:
pairs.append( (atcr,btcr) )
if pairs:
pairs_tuple = None
if len(pairs)>1:
alphas = set( x[0] for x in pairs )
betas = set( x[1] for x in pairs )
ab_counts[ (len(alphas),len(betas) ) ] += clone_size
if len(alphas) == 2 and len(betas) == 1: ## only allow double-alphas
assert len(pairs) == 2
pairs_tuple = tuple(x[1] for x in reversed(sorted( [ (pairing_counts[x],x) for x in pairs ] )))
assert len( pairs_tuple)==2
# confirm ordered by counts
assert pairing_counts[pairs_tuple[0]] >= pairing_counts[pairs_tuple[1]]
else:
ab_counts[ (1,1) ] += clone_size
pairs_tuple = tuple(pairs)
assert len(pairs_tuple) == 1
if pairs_tuple:
pairs_tuple2clonotypes.setdefault( pairs_tuple, [] ).append( cid )
else:
print 'SKIPCLONE:', was_good_clone, cid, clone_size, pairs, 'bad_pairs'
else:
print 'SKIPCLONE:', was_good_clone, cid, clone_size, 'no_valid_pairs'
## reorder pairs_tuples in the case of ties, using umis
reorder = []
for pt, clonotypes in pairs_tuple2clonotypes.iteritems():
assert len(pt) in [1,2]
if len(pt) == 2:
assert pt[0][1] == pt[1][1] # same beta chain
at1, bt = pt[0]
at2, _ = pt[1]
count1, count2 = pairing_counts[(at1,bt)], pairing_counts[(at2,bt)]
assert count1 >= count2
if count1 == count2:
# no way to figure out which is the primary alpha!
# look at umis?
c1 = sum( clonotype2tcrs[x]['A'][at1] for x in clonotypes )
c2 = sum( clonotype2tcrs[x]['A'][at2] for x in clonotypes )
print 'alphatie:', count1, count2, c1, c2, show(at1), show(at2), show(bt)
if c2 > c1:
reorder.append( pt )
for pt in reorder:
print 'reorder:', show(pt)
assert len(pt) == 2
rpt = (pt[1], pt[0])
assert pt in pairs_tuple2clonotypes and rpt not in pairs_tuple2clonotypes
pairs_tuple2clonotypes[rpt] = pairs_tuple2clonotypes[pt][:]
del pairs_tuple2clonotypes[pt]
## look for len1 pairs_tuples that overlap with two different len2 pairs_tuples
merge_into_pairs = []
for pt1 in pairs_tuple2clonotypes:
if len(pt1) == 1:
overlaps = []
for pt2 in pairs_tuple2clonotypes:
if len(pt2) == 2 and pt2[0] == pt1[0]:
overlaps.append( pt2 )
if len(overlaps)>1:
print 'badoverlaps:', len(overlaps), show(pt1), show(overlaps)
elif len(overlaps)==1:
pt2 = overlaps[0]
merge_into_pairs.append( (pt1,pt2) )
for pt1, pt2 in merge_into_pairs:
print 'mergeinto:', show(pt1), show(pt2)
pairs_tuple2clonotypes[pt2].extend( pairs_tuple2clonotypes[pt1] )
del pairs_tuple2clonotypes[pt1]
## look for pairs_tuples that will give the same clones file line
for pt1, clonotypes in pairs_tuple2clonotypes.iteritems():
for pt2 in pairs_tuple2clonotypes:
if pt1 < pt2 and pt1[0] == pt2[0]:
print 'overlap:', len(pt1), len(pt2), pt1, pt2
## now setup new clonotype2tcrs, clonotype2barcodes mappings
new_clonotype2tcrs = {}
new_clonotype2barcodes = {}
for pairs_tuple, clonotypes in pairs_tuple2clonotypes.iteritems():
c0 = clonotypes[0]
if len(clonotypes)>1:
print 'merging:', ' '.join(clonotypes)
tcrs = {'A':Counter(), 'B':Counter()}
for (atcr,btcr) in pairs_tuple:
tcrs['A'][atcr] += pairing_counts[(atcr, btcr)]
tcrs['B'][btcr] += pairing_counts[(atcr, btcr)]
if len(pairs_tuple)==2:
a1, a2 = pairs_tuple[0][0], pairs_tuple[1][0]
if tcrs['A'][a1] == tcrs['A'][a2]:
tcrs['A'][a1] += 1 # ensure the desired order
else:
assert tcrs['A'][a1] > tcrs['A'][a2]
assert len(tcrs['A']) in [1,2]
assert len(tcrs['B']) == 1
assert c0 not in new_clonotype2tcrs
new_clonotype2tcrs[c0] = tcrs
new_clonotype2barcodes[c0] = list(chain( *(clonotype2barcodes[x] for x in clonotypes)))
# print 'new:', new_clonotype2barcodes[c0]
# for c in clonotypes:
# print 'old:', clonotype2barcodes[c]
# print len(new_clonotype2barcodes[c0]), sum( len(clonotype2barcodes[x]) for x in clonotypes)
assert len(new_clonotype2barcodes[c0]) == sum( len(clonotype2barcodes[x]) for x in clonotypes)
print 'ab_counts:', ab_counts.most_common()
old_good_clonotypes = [ x for x,y in clonotype2tcrs.iteritems() if len(y['A']) >= 1 and len(y['B']) >= 1 ]
old_num_barcodes = sum( len(clonotype2barcodes[x]) for x in old_good_clonotypes )
new_num_barcodes = sum( len(x) for x in new_clonotype2barcodes.values() )
print 'old_num_barcodes:', old_num_barcodes, 'new_num_barcodes:', new_num_barcodes
return new_clonotype2tcrs, new_clonotype2barcodes
#######################################################################################################
#######################################################################################################
#######################################################################################################
#######################################################################################################
#######################################################################################################
#######################################################################################################
#######################################################################################################
if __name__ == '__main__':
with Parser(locals()) as p:
p.str('filtered_contig_annotations_csvfile').shorthand('f').required()
p.str('consensus_annotations_csvfile').shorthand('c')
p.str('clones_file').shorthand('o').required()
p.str('organism').required()
p.flag('clobber')
p.flag('stringent')
if exists(clones_file) and not clobber:
print 'ERROR -- clones_file {} already exists; use --clobber to overwrite'
exit(1)
assert organism in ['human','mouse']
clonotype2tcrs, clonotype2barcodes = read_tcr_data( organism, filtered_contig_annotations_csvfile,
consensus_annotations_csvfile )
if stringent:
clonotype2tcrs, clonotype2barcodes = setup_filtered_clonotype_dicts( clonotype2tcrs, clonotype2barcodes )
make_clones_file( organism, clones_file, clonotype2tcrs, clonotype2barcodes )