Merge pull request #11 from Runsheng/dev

Dev
Runsheng · Jan 25, 2023 · 514e634 · 514e634
2 parents 88b7123 + 6025fa3
commit 514e634
Show file tree

Hide file tree

Showing 10 changed files with 88 additions and 19 deletions.
diff --git a/script/bigg2len.py b/script/bigg2len.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+import os,sys,inspect
+
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(os.path.dirname(currentdir))
+sys.path.insert(0,parentdir)
+
+from trackcluster.tracklist import read_bigg, write_bigg
+
+parser=argparse.ArgumentParser()
+parser.add_argument("-b", "--biggfile",
+                    help="the bigg bed file")
+parser.add_argument("-o", "--out", default="mapped_len.txt",
+                    help="the output file name")
+
+args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
+
+# make a file using the functions
+outfile=args.out
+
+bigg_l=read_bigg(args.biggfile)
+with open (outfile, "w") as fw:
+    for bigg in bigg_l:
+        bigg.get_exon()
+        fw.write(bigg.name+"\t"+str(bigg.exonlen)+"\t"+bigg.geneName+"\t"+bigg.ttype+"\n")
+
+
diff --git a/script/trackrun.py b/script/trackrun.py
@@ -315,7 +315,7 @@ def test(self):
 
         if args.install: ## test installed packages, samtools, bedtools
             if is_bin_in("samtools") and is_bin_in("bedtools") and is_bin_in("minimap2"):
-                logger.info("Pass")
+                logger.info("samtools, bedtools, minimap2, Pass")
             else:
                 logger.info("Check if samtools, bedtools and minimap2 are in $PATH")
 

diff --git a/setup.py b/setup.py
@@ -22,6 +22,7 @@
              'script/bam2bigg.py',
              'script/bigg2b.py',
              'script/gff2bigg.py',
-             'script/biggmutant.py'],
+             'script/biggmutant.py',
+             'script/bigg2len.py'],
 
 )
diff --git a/test/flow_test.py b/test/flow_test.py
@@ -117,7 +117,7 @@ def test_prepare_run_gene_novel(self):
         :return:
         """
         # parameters
-        wkdir="/t1/shoudong_488/test/tracktest"
+        wkdir="/t1/shoudong_488/test/trackall"
         prefix="488_aba_1"
         gff_bed="../gene.bed_s"
         nano_bed="../488_aba_1_s.bed"

diff --git a/test/utils_test.py b/test/utils_test.py
@@ -38,6 +38,7 @@ def test_is_bin_in(self):
 
     def test_is_package_installed(self):
         print(is_package_installed("Bio"))
+        print(is_package_installed("os"))
 
     def test_summary(self):
         logger = log_summary()

diff --git a/trackcluster/__init__.py b/trackcluster/__init__.py
@@ -1,5 +1,5 @@
 # For relative imports to work in Python 3.6
 import os, sys
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-__version__="0.1.4"
+__version__="0.1.6"
 __all__ = []
diff --git a/trackcluster/flow.py b/trackcluster/flow.py
@@ -5,7 +5,7 @@
 from trackcluster.utils import myexe, is_bin_in, get_file_prefix,del_files,parmap, name2file, file2name, list2file, file2list
 from trackcluster.convert import sam_to_bigGenePred
 from trackcluster.batch import process_one_junction_corrected_try, process_one_subsample_try
-from trackcluster.tracklist import read_bigg, write_bigg, cat_bed, bigg_count_write_native, is_a_read
+from trackcluster.tracklist import read_bigg, write_bigg, cat_bed, bigg_count_write_native, is_a_read, list_to_dic
 
 from trackcluster.pre import wrapper_bedtools_intersect2_select, tracklist_add_gene,get_gendic_bedinter,group_bigg_by_gene, wrapper_bedtools_merge, mergedbed2bigg, wrapper_bedtools_subtract
 from trackcluster.post import flow_desc, flow_class4
@@ -83,17 +83,25 @@ def flow_bamconvert(wkdir,bamfile,out,prefix,score=30):
 def flow_add_gene(wkdir, prefix, bigg_gff_file, bigg_nano_file, f1=0.01, f2=0.05):
     os.chdir(wkdir)
 
-    ### get two parts, the gene part and the novel part
+
+    # make sure one read one track
+    bigg_raw=read_bigg(bigg_nano_file)
+    bigg_dedup=list(list_to_dic(bigg_raw).values())
+    print("raw bigg number: {}; after dedup:{}".format(len(bigg_raw), len(bigg_dedup)))
+    outbed=prefix+"_dedup.bed"
+    write_bigg(bigg_dedup, outbed)
+
+    ### get two parts, the gene part and the novel part,
+
     # the gene part
     outfile = prefix + "inter.bed"
     # write the outfile to disk
-    wrapper_bedtools_intersect2_select(bigg_nano_file, bigg_gff_file, outfile=outfile,
+    wrapper_bedtools_intersect2_select(outbed, bigg_gff_file, outfile=outfile,
                                        fraction_bed1=f1, fraction_bed2=f2)
     read_gene = get_gendic_bedinter(outfile)
     print("read number in genes:", len(read_gene))
 
-    bigg_nano = read_bigg(bigg_nano_file)
-    bigg_new = tracklist_add_gene(bigg_nano, read_gene)
+    bigg_new = tracklist_add_gene(bigg_dedup, read_gene)
 
     # cleanup
     del_files([outfile])

diff --git a/trackcluster/pre.py b/trackcluster/pre.py
@@ -63,10 +63,10 @@ def wrapper_bedtools_merge(bigg_file, out):
     # merge with strandness, report to col5
     # bedtools 2.26, can use -s -c 4 -o count
     # bedtools 2.30, must use -s c 6 -o distinct,count
-    cmd="bedtools merge -nonamecheck -s  -c 6 -o distinct,count -i {biggfile} > {out}".format(
+    cmd="bedtools sort -i {biggfile} > {biggfile}_s && bedtools merge -nonamecheck -s  -c 6 -o distinct,count -i {biggfile}_s > {out}".format(
         biggfile=bigg_file, out=out
     )
-    _=myexe(cmd)
+    myexe(cmd)
 
     return out
 
@@ -207,6 +207,9 @@ def tracklist_add_gene(bigg_nano, read_gene):
     :param bigg_nano: read track to bed added
     :param read_gene:
     :return: bigg_nano with genName has mutiple gene1||gene2||gene3
+
+    The gene name is used to define the "fusion" gene，so need to make sure one read only have one track
+
     """
     bigg_new=[]
     for bigg in bigg_nano:

diff --git a/trackcluster/tracklist.py b/trackcluster/tracklist.py
@@ -87,11 +87,31 @@ def write_bigg(bigg_list, out="bigg_new.bed"):
 
 
 def list_to_dic(bigg_list):
+    """
+    remove the duplicated mapping in bigg_list
+    if minimap2 do not force one mapping region for one read, there could be multiple mapping region for each reads
+    which could make the too many fusion isoforms
+
+    the non-dup bigg_list can be get with list(bigg_dic.values())
+    used in flow_addgene
+
+    :param bigg_list:
+    :return:
+    """
 
     bigg_dic=OrderedDict()
 
-    for i in bigg_list:
-        bigg_dic[i.name]=i
+    for bigg in bigg_list:
+        bigg.get_exon()
+        try:
+            bigg_previous=bigg_dic[bigg.name]
+            if bigg_previous.exonlen>=bigg.exonlen:
+                pass
+            else:
+                bigg_dic[bigg.name]=bigg
+        except KeyError:
+            bigg_dic[bigg.name] = bigg
+
     return bigg_dic
 
 

diff --git a/trackcluster/utils.py b/trackcluster/utils.py
@@ -102,13 +102,20 @@ def is_bin_in(cmd_name):
         return False
 
 
-def is_package_installed(package_name):
-    try:
-        import package_name
-    except ImportError:
-        return False
-    return True
+def is_package_installed(name):
+    import importlib.util
+    import sys
 
+    if name in sys.modules:
+        return True
+    elif (spec := importlib.util.find_spec(name)) is not None:
+        # If you choose to perform the actual import ...
+        module = importlib.util.module_from_spec(spec)
+        sys.modules[name] = module
+        spec.loader.exec_module(module)
+        return True
+    else:
+        return False
 
 def set_tmp(wkdir=None):
     """