diff --git a/scripts/check_circ_type.py b/scripts/check_circ_type.py deleted file mode 100644 index 9a32014..0000000 --- a/scripts/check_circ_type.py +++ /dev/null @@ -1,69 +0,0 @@ -import sys -import re - -igtf = sys.argv[1] -circf = sys.argv[2] - -tid2exons = [] -tids = [] - -with open(igtf) as gf: - pre_tid = '' - i = -1 - for l in gf: - ll = l.strip().split() - if l[0] == '#' or ll[2] != 'exon': - continue - ch = ll[0] - st = ll[3] - end = ll[4] - tid = ll[13][1:-2] - #print ch,st,end,tid - - if tid != pre_tid: - i += 1 - tid2exons.append([]) - tid2exons[i].append((ch, st, end)) - tids.append(tid) - else: - tid2exons[i].append((ch, st, end)) - - pre_tid = tid - -print >>sys.stderr, 'Finished loading GTF, loaded {} transcripts. {}'.format(len(tid2exons), len(tids)) - -with open(circf) as cf: - for l in cf: - ll = l.strip().split() - ch = ll[0] - st = '{}'.format(int(ll[1]) + 1) - end = ll[2] - tid = ll[6].split('.')[0] - - ind = -1 - for i in range(len(tid2exons)): - if tids[i] == tid: - ind = i - - if ind < 0: - print >>sys.stderr, 'tid {} not found!'.format(tid) - - st_found = False - en_found = False - for e in tid2exons[ind]: - (ech, est, een) = e - if ech == ch and est == st: - st_found = True - if ech == ch and een == end: - en_found = True - - if st_found and en_found: - typ = "BothIn" - elif st_found or en_found: - typ = "OneEnd" - else: - typ = "None" - - print '{}\t{}\t{}\t{}'.format(ch, st, end, typ) - - diff --git a/scripts/convert_ciri.py b/scripts/circ_simulation_eval/convert_ciri.py similarity index 100% rename from scripts/convert_ciri.py rename to scripts/circ_simulation_eval/convert_ciri.py diff --git a/scripts/find_TP.py b/scripts/circ_simulation_eval/find_TP.py similarity index 100% rename from scripts/find_TP.py rename to scripts/circ_simulation_eval/find_TP.py diff --git a/scripts/missed_circ_classify.py b/scripts/circ_simulation_eval/missed_circ_classify.py similarity index 100% rename from scripts/missed_circ_classify.py rename to scripts/circ_simulation_eval/missed_circ_classify.py diff --git a/scripts/parse_simul_log.py b/scripts/circ_simulation_eval/parse_simul_log.py similarity index 100% rename from scripts/parse_simul_log.py rename to scripts/circ_simulation_eval/parse_simul_log.py diff --git a/scripts/read_dist.py b/scripts/circ_simulation_eval/read_dist.py similarity index 100% rename from scripts/read_dist.py rename to scripts/circ_simulation_eval/read_dist.py diff --git a/scripts/simulator_log_parser.py b/scripts/circ_simulation_eval/simulator_log_parser.py similarity index 100% rename from scripts/simulator_log_parser.py rename to scripts/circ_simulation_eval/simulator_log_parser.py diff --git a/scripts/compare_circExplorer.py b/scripts/compare_circExplorer.py deleted file mode 100644 index 32335ef..0000000 --- a/scripts/compare_circExplorer.py +++ /dev/null @@ -1,60 +0,0 @@ -import sys - -my_report = sys.argv[1] -ce_report = sys.argv[2] -out_name = sys.argv[3] - -my_calls = {} -ce_calls = {} - -def load_my_file(my_report): - with open(my_report) as mf: - for l in mf: - ll = l.strip().split() - chrom = ll[0] - spos = int(ll[1]) - 1 - epos = int(ll[2]) - support = int(ll[3]) - - my_calls[(chrom, spos, epos)] = support - -def load_ce_file(ce_report): - with open(ce_report) as ce: - for l in ce: - ll = l.strip().split() - chrom = ll[0] - spos = int(ll[1]) - epos = int(ll[2]) - lll = ll[3].split('/') - support = int(lll[1]) - #support = int(ll[3]) - - ce_calls[(chrom, spos, epos)] = support - -# c1 - c2 -def compare(c1, c2, outf): - for k in c1.keys(): - if k not in c2.keys(): - (c, s, e) = k - print >>outf, '{}\t{}\t{}\t{}'.format(c, s, e, c1[k]) - -def union(c1, c2, outf): - for k in c1.keys(): - if k in c2.keys(): - (c, s, e) = k - print >>outf, '{}\t{}\t{}\t{}\t{}'.format(c, s, e, c1[k], c2[k]) - -load_my_file(my_report) -load_ce_file(ce_report) - -us_adv = open('{}.us.adv'.format(out_name), 'w') -compare(my_calls, ce_calls, us_adv) -us_adv.close() - -ce_adv = open('{}.ce.adv'.format(out_name), 'w') -compare(ce_calls, my_calls, ce_adv) -ce_adv.close() - -both = open('{}.both'.format(out_name), 'w') -union(my_calls, ce_calls, both) -both.close() diff --git a/scripts/compare_multiple_tools/Makefile b/scripts/compare_multiple_tools/Makefile new file mode 100644 index 0000000..1f9d9e3 --- /dev/null +++ b/scripts/compare_multiple_tools/Makefile @@ -0,0 +1,25 @@ +all: OPTIMIZE_FLAGS build +build: cleanexe mydiff +clean: cleanobj + +CXX ?= g++ + +INCS := +CXXFLAGS := -Wall $(INCS) -std=c++11 +LDFLAGS := + +MYOBJ = diff.o + + +mydiff: $(MYOBJ) + $(CXX) -w $(MYOBJ) -o $@ ${LDFLAGS} ${LIBS} + +cleanobj: + @rm -f $(MYOBJ) + +cleanexe: + @rm -f mydiff + +OPTIMIZE_FLAGS: + $(eval CXXFLAGS = $(CXXFLAGS) -O3) + diff --git a/scripts/compare_multiple_tools/diff.cpp b/scripts/compare_multiple_tools/diff.cpp new file mode 100644 index 0000000..59973fd --- /dev/null +++ b/scripts/compare_multiple_tools/diff.cpp @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include + +using namespace std; + +void separate_by_comma(string s, vector & v) { + v.clear(); + if (s.length() == 0) + return; + + std::stringstream ss(s); + + while (ss.good()) { + string substr; + getline(ss, substr, ','); + v.push_back(substr); + } +} + +// b: before +// a: after +void get_difference(string b, string a) { + vector vb; + vector va; + + separate_by_comma(b, vb); + separate_by_comma(a, va); + + sort(vb.begin(), vb.end()); + sort(va.begin(), va.end()); + + vector b_a(vb.size()); + vector a_b(va.size()); + + auto it_b_a = std::set_difference(vb.begin(), vb.end(), va.begin(), va.end(), b_a.begin()); + auto it_a_b = std::set_difference(va.begin(), va.end(), vb.begin(), vb.end(), a_b.begin()); + + b_a.resize(it_b_a - b_a.begin()); + a_b.resize(it_a_b - a_b.begin()); + + cout << "\t"; + if (b_a.begin() == b_a.end()) + cout << "-"; + for (auto it = b_a.begin(); it != b_a.end(); ++it) + cout << *it << ","; + + cout << "\t"; + if (a_b.begin() == a_b.end()) + cout << "-"; + for (auto it = a_b.begin(); it != a_b.end(); ++it) + cout << *it << ","; +} + +int main(int argc, char** argv) { + --argc; + + if (argc < 2) + cerr << "Usage: " << argv[0] << " FILE1 FILE2 [FILE3 FILE4]\n"; + + ifstream fin1; + ifstream fin2; + ifstream fin3; + ifstream fin4; + + fin1.open(argv[1]); + fin2.open(argv[2]); + fin3.open(argv[3]); + fin4.open(argv[4]); + + string a, b, c, rs; + int d; + map > mymap; + map > rsmap; + while (fin1 >> a >> b >> c >> d >> rs) { + mymap[a+'-'+b+'-'+c] = {d, 0, 0, 0}; + rsmap[a+'-'+b+'-'+c] = {rs, ""}; + } + + while (fin2 >> a >> b >> c >> d >> rs) { + if (mymap.find(a+'-'+b+'-'+c) != mymap.end()) { + mymap[a+'-'+b+'-'+c][1] = d; + rsmap[a+'-'+b+'-'+c].second = rs; + } + else { + mymap[a+'-'+b+'-'+c] = {0, d, 0, 0}; + rsmap[a+'-'+b+'-'+c] = {"", rs}; + } + } + + while (fin3 >> a >> b >> c >> d) { + if (mymap.find(a+'-'+b+'-'+c) != mymap.end()) { + mymap[a+'-'+b+'-'+c][2] = d; + } + } + + int ce2b; + while (fin4 >> a >> ce2b >> c >> d) { + b = to_string(ce2b+1); + if (mymap.find(a+'-'+b+'-'+c) != mymap.end()) { + mymap[a+'-'+b+'-'+c][3] = d; + } + } + + map >::iterator it; + int increased = 0, decreased = 0, not_changed = 0, disapeared = 0, newly_added = 0; + + auto rsit = rsmap.begin(); + for (auto it = mymap.begin(); it != mymap.end(); ++it) { + int bb = it->second[0]; + int aa = it->second[1]; + + int ciri = it->second[2]; + int ce2 = it->second[3]; + + if (bb - aa == 0) { + not_changed++; + cout << "[NC]\t"; + } + else if (bb < aa) { + if (bb == 0) { + newly_added++; + cout << "[NA]\t"; + } + else { + increased++; + cout << "[++]\t"; + } + } + else { + if (aa == 0) { + disapeared++; + cout << "[DI]\t"; + } + else { + decreased++; + cout << "[--]\t"; + } + } + + cout << it->first << "\t" << bb << "\t" << aa; + if (argc > 2) + cout << "\t" << ciri << "\t" << ce2; + + get_difference(rsit->second.first, rsit->second.second); + + cout << endl; + + ++rsit; + + } + + fin1.close(); + fin2.close(); + fin3.close(); + fin4.close(); + + return 0; +} diff --git a/scripts/compare_sam.py b/scripts/compare_sam.py deleted file mode 100644 index 1b31365..0000000 --- a/scripts/compare_sam.py +++ /dev/null @@ -1,107 +0,0 @@ -import sys - -sam_fname = sys.argv[1] -myout_fname = sys.argv[2] - -SAMSIZE = 40000000 -LOCDIFTH = 10 - -sam_locs = [ 0, 0 ] -sam_locs[0] = [ ] -sam_locs[1] = [ ] - -exact_locs = [ ] -scat = [ ] - -for i in range(SAMSIZE): - sam_locs[0].append([]) - sam_locs[1].append([]) - exact_locs.append(-1) - scat.append(0) - -is_r1 = 0 -read = 0 - -print "Loading sam file..." -with open(sam_fname) as sf: - for l in sf: - ll = l.strip().split() - rname = int(ll[0].split('.')[1]) - chrom = ll[2] - if chrom == "*": - continue - - read += 1 - if (read % 200000) == 0: - print "# {} lines".format(read) - flag = int(ll[1]) - spos = int(ll[3]) - - cat = ll[-1] - scat[rname] = int(cat[-1]) - - is_r1 = (flag & 64) / 64 - sam_locs[is_r1][rname].append( (chrom, spos) ) - -#print "R1: ", sam_locs[1] -#print "R2: ", sam_locs[0] - -print "Size: ", len(sam_locs[0]) - sam_locs[0].count([]), len(sam_locs[1]) - sam_locs[1].count([]) - -def closest_vacinity(chrom, p, l): - return min( [ abs (x - p) for (ch, x) in l if ch == chrom ] ) - -# score of each mate: -# 4: exact match -# 1: in vacinity -# 0: not matched -def set_score(chrom, p, l): - cl = 2 * LOCDIFTH - - try: - cl = closest_vacinity(chrom, p, l) - except ValueError: - cl = LOCDIFTH + 1 - - if cl == 0: #Exact match - return 4 - elif cl <= LOCDIFTH: - return 1 - else: - return 0 - -def check_loc(rname, chrom, p1, p2): - s1 = 0 - s2 = 0 - - s1 += set_score(chrom, p1, sam_locs[1][rname]) - s1 += set_score(chrom, p2, sam_locs[0][rname]) - - s2 += set_score(chrom, p2, sam_locs[1][rname]) - s2 += set_score(chrom, p1, sam_locs[0][rname]) - - return max(s1, s2) - -read = 0 -print "Loading my output file..." -with open(myout_fname) as mf: - for l in mf: - read += 1 - if (read % 200000) == 0: - print "# {} reads".format(read) - - ll = l.strip().split() - rname = int(ll[0].split('.')[1]) - chrom = ll[1] - pos1 = int(ll[2]) - pos2 = int(ll[10]) - mtype = int(ll[-1]) - - exact_locs[rname] = max(exact_locs[rname], check_loc(rname, chrom, pos1, pos2)) - -print "Printing results..." -for rname in range(len(exact_locs)): - if rname % 100000 == 0: - print "# {} reads".format(rname) - if exact_locs[rname] != -1: - print>> sys.stderr, rname, exact_locs[rname], "XC:i:{}".format(scat[rname]) diff --git a/scripts/count_exon.py b/scripts/count_exon.py deleted file mode 100644 index 18cf495..0000000 --- a/scripts/count_exon.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys - -gtf_fname = sys.argv[1] - -genes = {} -enums = [] - -max_enum = 0 - -with open(gtf_fname) as gf: - for l in gf: - if l[0] == '#': - continue - ll = l.strip().split() - if ll[2] == "gene": - enums.append(max_enum) - max_enum = 0 - if ll[2] != "exon": - continue - exon_num = int(ll[13][1:-2]) - max_enum = max(max_enum, exon_num) - -enums.append(max_enum) - -print enums -print len(enums) -print sum(enums) diff --git a/scripts/count_n.py b/scripts/count_n.py deleted file mode 100644 index 36b3300..0000000 --- a/scripts/count_n.py +++ /dev/null @@ -1,25 +0,0 @@ -import sys - -seq_fname = sys.argv[1] - -high_th = 50 - -with open(seq_fname) as sf: - rname = '?' - seq = '' - highn = 0 - for l in sf: - if l[0] == '>': - if rname != l[1:]: - if rname != '?': - print '{}\t{}\t{}\t{}'.format(rname.strip(), len(seq), seq.count('N'), 100.0 * seq.count('N') / len(seq)) - if 100.0 * seq.count('N') / len(seq) >= high_th: - highn += 1 - rname = l[1:] - seq = '' - continue - seq += l.strip() - print '{}\t{}\t{}\t{}'.format(rname.strip(), len(seq), seq.count('N'), 100.0 * seq.count('N') / len(seq)) - if 100.0 * seq.count('N') / len(seq) >= high_th: - highn += 1 - print '>= {}% N:\t{}'.format(high_th, highn) diff --git a/scripts/err_sc_dist.py b/scripts/evaluation/err_sc_dist.py similarity index 100% rename from scripts/err_sc_dist.py rename to scripts/evaluation/err_sc_dist.py diff --git a/scripts/evaluate.py b/scripts/evaluation/evaluate.py similarity index 100% rename from scripts/evaluate.py rename to scripts/evaluation/evaluate.py diff --git a/scripts/evaluate2.py b/scripts/evaluation/evaluate2.py similarity index 100% rename from scripts/evaluate2.py rename to scripts/evaluation/evaluate2.py diff --git a/scripts/filter_inversion.py b/scripts/filter_inversion.py deleted file mode 100644 index b2cc130..0000000 --- a/scripts/filter_inversion.py +++ /dev/null @@ -1,75 +0,0 @@ -import sys -import re - -def get_len(cigar): - mysum = 0 - ind = 0 - lens = re.split("M|D|N|=|X|I|S|H|P", cigar) - if lens[-1] == '': - del lens[-1] - - #print cigar - #print lens - for x in lens: - ind += len(x) - if cigar[ind] in 'MDN=X': - mysum += int(x) - ind += 1 - return mysum - -chimeric_sam = sys.argv[1] # sorted -out_file = sys.argv[2] - -inv_rname = open(out_file + '.rname', 'w') -inv_sam = open(out_file + '.inv.sam', 'w') -bsj_sam = open(out_file + '.bsj.sam', 'w') - -with open(chimeric_sam) as sf: - mates = { } - for l in sf: - if l[0] == '@': - continue - - ll = l.strip().split() - rname = ll[0] - flag = int(ll[1]) - chrom = ll[2] - spos = int(ll[3]) - cigar = ll[5] - seq = ll[9] - - rlen = get_len(cigar) - #print rlen - - if rname not in mates.keys(): - mates[rname] = [ (flag, chrom, seq, l.strip()) ] - else: - mates[rname].append( (flag, chrom, seq, l.strip()) ) - -for rname in mates.keys(): - primaries = [] - for x in mates[rname]: - if x[0] < 256: - primaries.append(x) - - inver = True - for x in mates[rname]: - if x[0] < 256: - continue - - for pri in primaries: - if x[2] == pri[2]: - inver = False - - if inver: - print 'INV', rname - print >> inv_rname, rname - for y in mates[rname]: - print >> inv_sam, y[3] - - else: - print 'ORD', rname - print >> inv_rname, rname - for y in mates[rname]: - print >> inv_sam, y[3] - diff --git a/scripts/filter_nonexon.py b/scripts/filter_nonexon.py deleted file mode 100644 index e7a2d62..0000000 --- a/scripts/filter_nonexon.py +++ /dev/null @@ -1,111 +0,0 @@ -import sys -import re - -th = 9 - -def intersect(chrom, beg, end, echrom, ebeg, eend): - if chrom != echrom: - if beg < ebeg: - return 1 - else: - return -1 - if end < ebeg: - return -1 - if beg > eend: - return 1 - - if beg < ebeg: - if ebeg - beg < th: - return 0 - else: - return -1 - if end > eend: - if end - eend < th: - return 0 - else: - return 1 - return 0 - -def check_record(chrom, beg, end, last_chrom, last_beg, last_end, gtff): - int_res = intersect(chrom, beg, end, last_chrom, last_beg, last_end) - #print 'LAST check:', int_res, chrom, beg, end, last_chrom, last_beg, last_end - if int_res == -1: - return False, (last_chrom, last_beg, last_end) - - if int_res == 0: - return True, (last_chrom, last_beg, last_end) - - for l in gtff: - if l[0] == '#': - continue - ll = l.strip().split() - if ll[2] != 'exon': - continue - - echrom = ll[0] - ebeg = int(ll[3]) - eend = int(ll[4]) - - int_res = intersect(chrom, beg, end, echrom, ebeg, eend) - #print 'XXXX:', int_res, chrom, beg, end, echrom, ebeg, eend - - if int_res == -1: - return False, (echrom, ebeg, eend) - - if int_res == 0: - return True, (echrom, ebeg, eend) - - return False, ('-1', -1, -1) - -def get_len(cigar): - mysum = 0 - ind = 0 - lens = re.split("M|D|N|=|X|I|S|H|P", cigar) - if lens[-1] == '': - del lens[-1] - - #print cigar - #print lens - for x in lens: - ind += len(x) - if cigar[ind] in 'MDN=X': - mysum += int(x) - ind += 1 - return mysum - -chimeric_sam = sys.argv[1] -gtf_file = sys.argv[2] -out_file = sys.argv[3] - -nonexon_rname = open(out_file + '.rname', 'w') # sorted -nonexon_sam = open(out_file + '.sam', 'w') # sorted - -with open (gtf_file) as gtff: - with open(chimeric_sam) as sf: - last_chrom = '0' - last_beg = 1000000000 - last_end = 1000000001 - for l in sf: - if l[0] == '@': - continue - - ll = l.strip().split() - rname = ll[0] - chrom = ll[2] - spos = int(ll[3]) - cigar = ll[5] - rlen = get_len(cigar) - #print rlen - - res, (ch, b, e) = check_record(chrom, spos, spos + rlen - 1, last_chrom, last_beg, last_end, gtff) - if not res: - print >> nonexon_rname, rname - print 'NO', rname - print >> nonexon_sam, l.strip() - - else: - print 'YES', rname - - last_chrom = ch - last_beg = b - last_end = e diff --git a/scripts/gene_model.py b/scripts/gene_model.py deleted file mode 100755 index 79afda1..0000000 --- a/scripts/gene_model.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -#-*- encoding:utf-8 -*- -# - - -import sys, os, math -from operator import itemgetter - -class Usage(Exception): - def __init__(self, msg): - self.msg = msg - -def usage(): - print '\nUsage: python ' + sys.argv[0] + ' GTF' - sys.exit(-1) - -#################### -def overlap_interval( s1, e1, s2, e2): - ol = 0 - if ( s1 <= s2 and e2 <= s2 ): - ol = e2 - s2 + 1 - elif ( s2 <= s1 and e1 <= e2 ): - ol = e1 - s1 + 1 - elif ( s2 <= s1 and s1 <= e2 ): - ol = e2 - s1 + 1 - elif ( s1 <= s2 and s2 <= e1 ): - ol = e1 - s2 + 1 - return ol - -########## Load GTF with versioon >=75 ########## -##### -# chr -> list of sorted genes: start, end, list_of_isoforms -# an isoform is a list of exons -def load_GTF( gtf_file ): - gene_dict = {} # from chromosome to a list of sorted gene record - gene_list = [] # sorted exon record for a gene - - # tmp struc for exon for a gene - coor_dict = {} - coor_list = [] - iso_list = [] # for promotor - - gene_name = "" - ref = "" - start = 0 - end = 0 - - sr = open( gtf_file, 'r' ) - for line in sr: - if ("#" == line[0]): - continue - z_list = line.split("\t") - # ignoring all pacthed records - if ( 2 < len(z_list[0])): - continue - - if ( ref != z_list[0] ): # storing the list of all genes into a ref entry - if 0 < len(coor_list): - iso_list.append( [s, e, coor_list ]) - if 0 < len(iso_list): - iso_list.sort( key = itemgetter(0,1) ) - gene_list.append( [ ref, start, end, gene_name, iso_list ]) - if ( 0 < len(gene_list) ): - gene_list.sort( key=itemgetter(1,2) ) - gene_dict[ ref ] = gene_list - print "Loading Chromosome " + ref + " with " + str(len(gene_list)) + " genes" - - gene_list = [] - iso_list = [] - coor_list = [] - start = 0 - end = 0 - ref = z_list[0] - print "Starting Chromosome " +ref - - if ("gene" == z_list[2]): - if 0 < len(coor_list): - iso_list.append( [ s, e, coor_list ] ) - if 0 < len(iso_list): - iso_list.sort( key = itemgetter(0,1) ) - gene_list.append( [ ref, start, end, gene_name, iso_list ]) - coor_list = [] - iso_list = [] - coor_dict = {} - - gene_name = line.split("\"")[5] - start = int(z_list[3]) - end = int(z_list[4]) - - if ("transcript" == z_list[2]): - if 0 < len(coor_list): - iso_list.append( [ s, e, coor_list ] ) - s = int(z_list[3]) - e = int(z_list[4]) - coor_list = [] - elif ("exon" == z_list[2]): - coor_list.append( [ int(z_list[3]), int(z_list[4]) ] ) - - sr.close() - - # Last exon - if 0 < len(coor_list): - iso_list.append( [ s, e, coor_list ] ) - if 0 < len(iso_list): - iso_list.sort( key = itemgetter(0,1) ) - #iso_list.sort(key = itemgetter(1,2) ) - gene_list.append( [ ref, start, end, gene_name, iso_list ]) - if ( 0 < len(gene_list) ): - gene_list.sort( key=itemgetter(1,2) ) - gene_dict[ ref ] = gene_list - print "Loading Chromosome " + ref + " with " + str(len(gene_list)) + " genes" - - return gene_dict -######################### -def main(): - args = sys.argv[1:] - if len(args) != 1: - usage() - - gene_model = load_GTF( sys.argv[1] ) - -######################### -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/gmc.py b/scripts/gmc.py deleted file mode 100644 index df31ff1..0000000 --- a/scripts/gmc.py +++ /dev/null @@ -1,88 +0,0 @@ -import sys - -gtf_fname = sys.argv[1] -junc_fname = sys.argv[2] - -th = 5 -exons = [ ] -starts = { } -ends = { } - -def valid_start_bsj(exons, chrom, start, end): - for exon in exons: - if chrom != exon[0]: - continue - if abs(exon[1] - start) <= th: - return True - return False - -def valid_end_bsj(exons, chrom, start, end): - for exon in exons: - if chrom != exon[0]: - continue - if abs(exon[2] - end) <= th: - return True - return False - -def find_target(mlist, beg, end, target): - if end - beg <= 0: - return False - mid = (beg + end) / 2 - if target > mlist[mid] + th: - return find_target(mlist, mid+1, end, target) - if target < mlist[mid] - th: - return find_target(mlist, beg, mid, target) - - # abs(mlist[mid] - target) < th - return True - -with open(gtf_fname) as gf: - for l in gf: - if l[0] == '#': - continue - - l = l.strip().split() - type = l[2] - if type != 'exon': - continue - - chrom = l[0] - if chrom not in starts.keys(): - starts[chrom] = [] - if chrom not in ends.keys(): - ends[chrom] = [] - - start = int(l[3]) - end = int(l[4]) - exons.append((chrom, start, end)) - starts[chrom].append(start) - ends[chrom].append(end) - -#print exons -for ch in starts.keys(): - starts[ch].sort() - ends[ch].sort() - -with open(junc_fname) as jf: - line = 0 - for ll in jf: - if line % 1000 == 0: - print >>sys.stderr, line - line += 1 - ll = ll.strip() - l = ll.split() - if len(l) < 9: - continue - chrom = l[1] - start = int(l[2]) - end = int(l[3]) - if not find_target(ends[chrom], 0, len(ends[chrom]), end): - continue - - chrom = l[5] - start = int(l[6]) - end = int(l[7]) - #if valid_start_bsj(exons, chrom, start, end): - if find_target(starts[chrom], 0, len(starts[chrom]), start): - print ll - diff --git a/scripts/gtf_mod.py b/scripts/gtf_mod.py deleted file mode 100644 index 18c38b9..0000000 --- a/scripts/gtf_mod.py +++ /dev/null @@ -1,64 +0,0 @@ -import re -import sys - -fname = sys.argv[1] -lens = [0, - 0, #chr 1 - 81195210 + 78077248 + 59128983 + 3, - 81195210 + 78077248 + 59128983 + 243199373 + 63025520 + 48129895 + 51304566 + 7, - 81195210 + 78077248 + 59128983 + 243199373 + 63025520 + 48129895 + 51304566 + 198022430 + 8, - 0, - 180915260 + 1, - 180915260 + 171115067 + 2, - 180915260 + 171115067 + 159138663 + 3, - 180915260 + 171115067 + 159138663 + 146364022 + 4, - 249250621 + 1, - 249250621 + 135534747 + 2, - 249250621 + 135534747 + 135006516 + 3, - 249250621 + 135534747 + 135006516 + 133851895 + 4, - 249250621 + 135534747 + 135006516 + 133851895 + 115169878 + 5, - 249250621 + 135534747 + 135006516 + 133851895 + 115169878 + 107349540 + 6, - 249250621 + 135534747 + 135006516 + 133851895 + 115169878 + 107349540 + 102531392 + 7, - 0, - 81195210 + 1, - 81195210 + 78077248 + 2, - 81195210 + 78077248 + 59128983 + 243199373 + 4, - 81195210 + 78077248 + 59128983 + 243199373 + 63025520 + 5, - 81195210 + 78077248 + 59128983 + 243199373 + 63025520 + 48129895 + 6, - 180915260 + 171115067 + 159138663 + 146364022 + 141213431 + 5, #MT - 180915260 + 171115067 + 159138663 + 146364022 + 141213431 + 16569 + 6, #X - 180915260 + 171115067 + 159138663 + 146364022 + 141213431 + 16569 + 155270560 + 7 #Y - ] - -ch2int = { str(x): x for x in range(1, 23) } -ch2int["MT"] = 23 -ch2int["X"] = 24 -ch2int["Y"] = 25 -#print ch2int - -with open(fname, "r") as f: - for l in f: - l = l.strip(); - if l[0] == "#": - print l - continue - - ll = re.split("\t", l) - - if ll[0] not in ch2int.keys(): - print l - continue - ch = ch2int[ll[0]] - - st = int(ll[3]) - end = int(ll[4]) - new_contig = "" - if ch in [1, 10, 11, 12, 13, 14, 15, 16]: - new_contig = "1" - elif ch in [2, 3, 4, 17, 18, 19, 20, 21, 22]: - new_contig = "2" - else: - new_contig = "3" - - print "\t".join( [ new_contig, ll[1], ll[2], str(st + lens[ch]), str(end + lens[ch]) ] + ll[5:] ) - diff --git a/scripts/make_bed.py b/scripts/make_bed.py deleted file mode 100644 index a9daa2b..0000000 --- a/scripts/make_bed.py +++ /dev/null @@ -1,38 +0,0 @@ -import sys -import re - -def get_len(cigar): - mysum = 0 - ind = 0 - lens = re.split("M|D|N|=|X|I|S|H|P", cigar) - if lens[-1] == '': - del lens[-1] - - #print cigar - #print lens - for x in lens: - ind += len(x) - if cigar[ind] in 'MDN=X': - mysum += int(x) - ind += 1 - return mysum - -chimeric_sam = sys.argv[1] -bed_out = sys.argv[2] - -bedf = open(bed_out, 'w') - -with open(chimeric_sam) as sf: - for l in sf: - if l[0] == '@': - continue - - ll = l.strip().split() - rname = ll[0] - chrom = ll[2] - spos = int(ll[3]) - cigar = ll[5] - rlen = get_len(cigar) - #print rlen - - print >>bedf, '{}\t{}\t{}\t{}'.format(chrom, spos-2, spos+rlen, rname)