diff --git a/script/bigg2len.py b/script/bigg2len.py new file mode 100644 index 0000000..18064c7 --- /dev/null +++ b/script/bigg2len.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import argparse +import os,sys,inspect + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(os.path.dirname(currentdir)) +sys.path.insert(0,parentdir) + +from trackcluster.tracklist import read_bigg, write_bigg + +parser=argparse.ArgumentParser() +parser.add_argument("-b", "--biggfile", + help="the bigg bed file") +parser.add_argument("-o", "--out", default="mapped_len.txt", + help="the output file name") + +args = parser.parse_args(args=None if sys.argv[1:] else ['--help']) + +# make a file using the functions +outfile=args.out + +bigg_l=read_bigg(args.biggfile) +with open (outfile, "w") as fw: + for bigg in bigg_l: + bigg.get_exon() + fw.write(bigg.name+"\t"+str(bigg.exonlen)+"\t"+bigg.geneName+"\t"+bigg.ttype+"\n") + + diff --git a/script/trackrun.py b/script/trackrun.py index 6014885..1cf6bf7 100644 --- a/script/trackrun.py +++ b/script/trackrun.py @@ -315,7 +315,7 @@ def test(self): if args.install: ## test installed packages, samtools, bedtools if is_bin_in("samtools") and is_bin_in("bedtools") and is_bin_in("minimap2"): - logger.info("Pass") + logger.info("samtools, bedtools, minimap2, Pass") else: logger.info("Check if samtools, bedtools and minimap2 are in $PATH") diff --git a/setup.py b/setup.py index 24167e1..9e2eec7 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ 'script/bam2bigg.py', 'script/bigg2b.py', 'script/gff2bigg.py', - 'script/biggmutant.py'], + 'script/biggmutant.py', + 'script/bigg2len.py'], ) diff --git a/test/flow_test.py b/test/flow_test.py index 575e237..eda32d5 100644 --- a/test/flow_test.py +++ b/test/flow_test.py @@ -117,7 +117,7 @@ def test_prepare_run_gene_novel(self): :return: """ # parameters - wkdir="/t1/shoudong_488/test/tracktest" + wkdir="/t1/shoudong_488/test/trackall" prefix="488_aba_1" gff_bed="../gene.bed_s" nano_bed="../488_aba_1_s.bed" diff --git a/test/utils_test.py b/test/utils_test.py index 58973f0..316ed71 100644 --- a/test/utils_test.py +++ b/test/utils_test.py @@ -38,6 +38,7 @@ def test_is_bin_in(self): def test_is_package_installed(self): print(is_package_installed("Bio")) + print(is_package_installed("os")) def test_summary(self): logger = log_summary() diff --git a/trackcluster/__init__.py b/trackcluster/__init__.py index 8cb2778..c9142c4 100644 --- a/trackcluster/__init__.py +++ b/trackcluster/__init__.py @@ -1,5 +1,5 @@ # For relative imports to work in Python 3.6 import os, sys sys.path.append(os.path.dirname(os.path.realpath(__file__))) -__version__="0.1.4" +__version__="0.1.6" __all__ = [] diff --git a/trackcluster/flow.py b/trackcluster/flow.py index 82d984d..0d796e2 100644 --- a/trackcluster/flow.py +++ b/trackcluster/flow.py @@ -5,7 +5,7 @@ from trackcluster.utils import myexe, is_bin_in, get_file_prefix,del_files,parmap, name2file, file2name, list2file, file2list from trackcluster.convert import sam_to_bigGenePred from trackcluster.batch import process_one_junction_corrected_try, process_one_subsample_try -from trackcluster.tracklist import read_bigg, write_bigg, cat_bed, bigg_count_write_native, is_a_read +from trackcluster.tracklist import read_bigg, write_bigg, cat_bed, bigg_count_write_native, is_a_read, list_to_dic from trackcluster.pre import wrapper_bedtools_intersect2_select, tracklist_add_gene,get_gendic_bedinter,group_bigg_by_gene, wrapper_bedtools_merge, mergedbed2bigg, wrapper_bedtools_subtract from trackcluster.post import flow_desc, flow_class4 @@ -83,17 +83,25 @@ def flow_bamconvert(wkdir,bamfile,out,prefix,score=30): def flow_add_gene(wkdir, prefix, bigg_gff_file, bigg_nano_file, f1=0.01, f2=0.05): os.chdir(wkdir) - ### get two parts, the gene part and the novel part + + # make sure one read one track + bigg_raw=read_bigg(bigg_nano_file) + bigg_dedup=list(list_to_dic(bigg_raw).values()) + print("raw bigg number: {}; after dedup:{}".format(len(bigg_raw), len(bigg_dedup))) + outbed=prefix+"_dedup.bed" + write_bigg(bigg_dedup, outbed) + + ### get two parts, the gene part and the novel part, + # the gene part outfile = prefix + "inter.bed" # write the outfile to disk - wrapper_bedtools_intersect2_select(bigg_nano_file, bigg_gff_file, outfile=outfile, + wrapper_bedtools_intersect2_select(outbed, bigg_gff_file, outfile=outfile, fraction_bed1=f1, fraction_bed2=f2) read_gene = get_gendic_bedinter(outfile) print("read number in genes:", len(read_gene)) - bigg_nano = read_bigg(bigg_nano_file) - bigg_new = tracklist_add_gene(bigg_nano, read_gene) + bigg_new = tracklist_add_gene(bigg_dedup, read_gene) # cleanup del_files([outfile]) diff --git a/trackcluster/pre.py b/trackcluster/pre.py index 2f86419..e06acec 100644 --- a/trackcluster/pre.py +++ b/trackcluster/pre.py @@ -63,10 +63,10 @@ def wrapper_bedtools_merge(bigg_file, out): # merge with strandness, report to col5 # bedtools 2.26, can use -s -c 4 -o count # bedtools 2.30, must use -s c 6 -o distinct,count - cmd="bedtools merge -nonamecheck -s -c 6 -o distinct,count -i {biggfile} > {out}".format( + cmd="bedtools sort -i {biggfile} > {biggfile}_s && bedtools merge -nonamecheck -s -c 6 -o distinct,count -i {biggfile}_s > {out}".format( biggfile=bigg_file, out=out ) - _=myexe(cmd) + myexe(cmd) return out @@ -207,6 +207,9 @@ def tracklist_add_gene(bigg_nano, read_gene): :param bigg_nano: read track to bed added :param read_gene: :return: bigg_nano with genName has mutiple gene1||gene2||gene3 + + The gene name is used to define the "fusion" geneļ¼Œso need to make sure one read only have one track + """ bigg_new=[] for bigg in bigg_nano: diff --git a/trackcluster/tracklist.py b/trackcluster/tracklist.py index 3146044..282a1fb 100644 --- a/trackcluster/tracklist.py +++ b/trackcluster/tracklist.py @@ -87,11 +87,31 @@ def write_bigg(bigg_list, out="bigg_new.bed"): def list_to_dic(bigg_list): + """ + remove the duplicated mapping in bigg_list + if minimap2 do not force one mapping region for one read, there could be multiple mapping region for each reads + which could make the too many fusion isoforms + + the non-dup bigg_list can be get with list(bigg_dic.values()) + used in flow_addgene + + :param bigg_list: + :return: + """ bigg_dic=OrderedDict() - for i in bigg_list: - bigg_dic[i.name]=i + for bigg in bigg_list: + bigg.get_exon() + try: + bigg_previous=bigg_dic[bigg.name] + if bigg_previous.exonlen>=bigg.exonlen: + pass + else: + bigg_dic[bigg.name]=bigg + except KeyError: + bigg_dic[bigg.name] = bigg + return bigg_dic diff --git a/trackcluster/utils.py b/trackcluster/utils.py index 15beaec..f68489e 100644 --- a/trackcluster/utils.py +++ b/trackcluster/utils.py @@ -102,13 +102,20 @@ def is_bin_in(cmd_name): return False -def is_package_installed(package_name): - try: - import package_name - except ImportError: - return False - return True +def is_package_installed(name): + import importlib.util + import sys + if name in sys.modules: + return True + elif (spec := importlib.util.find_spec(name)) is not None: + # If you choose to perform the actual import ... + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module + spec.loader.exec_module(module) + return True + else: + return False def set_tmp(wkdir=None): """