Merge pull request #10 from Runsheng/dev

Dev
Runsheng · Nov 2, 2022 · 88b7123 · 88b7123
2 parents cb5a0be + 3a77678
commit 88b7123
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 59 deletions.
diff --git a/Readme.md b/Readme.md
@@ -1,7 +1,7 @@
 # TrackCluster
 ![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green)
 
-Trackcluster is an isoform calling and quantification pipeline for Nanopore direct-RNA long reads.
+Trackcluster is an isoform calling and quantification pipeline for long RNA/cDNA reads.
 
 ## Table of Contents
 
@@ -14,16 +14,20 @@ Trackcluster is an isoform calling and quantification pipeline for Nanopore dire
 Hint: the ongoing development can be found in the ["dev"](https://github.com/Runsheng/trackcluster/tree/dev) branch.
 
 ## <a name="overview"></a>Overview
-A pipeline for reference-based identification of isoform calling using Nanopore direct-RNA long reads. This pipeline was designed to use **only** long and nosisy reads to make a valid transcriptome. An indicator for the intact 5' could be very helpful to the pipeline, i.e, the splicing leader in the mRNA of nematodes. 
-
-It is recommended to combine all samples together to generate a new transcriptome reference. After this process, the expression of isoforms in each sample can be fetched by providing an "name:sample" table. 
+A pipeline for reference-based isoform identification and quantification using long reads. This pipeline was designed to use **only** long and nosisy reads to make a valid transcriptome. An indicator for the intact 5' could be very helpful to the pipeline, i.e, the splicing leader in the mRNA of nematodes. 
 
 The major input/output for this pipeline is "bigg"--["bigGenePred"](https://github.com/Runsheng/trackcluster/blob/master/test/bigGenePred.as) format. 
 
 ## <a name="requirements"></a>Requirements
 
 1. developed on python 3.9, tested on python 3.6 and above (or 2.7.10+), should work with most of the py3 versions
 2. samtools V2.0+ , bedtools V2.24+  and minimap2 V2.24+ in your $PATH
+```bash
+# install the external bins with conda
+conda install -c bioconda samtools
+conda install -c bioconda bedtools
+conda install -c bioconda minimap2
+```
 
 ## Installation
 ```bash
@@ -83,6 +87,6 @@ trackrun.py desc --isoform reads_gene.bed --reference ref.bed # will generated r
 
 
 ## Citation
-Please kindly cite our paper in Genome Research if you use trackcluster in your work.
+Please kindly cite our paper for using trackcluster in your work.
 
 Li, R., Ren, X., Ding, Q., Bi, Y., Xie, D. and Zhao, Z., 2020. Direct full-length RNA sequencing reveals unexpected transcriptome complexity during *Caenorhabditis elegans* development. **Genome research**, 30(2), pp.287-298.
diff --git a/script/trackrun.py b/script/trackrun.py
@@ -22,7 +22,7 @@
 from trackcluster.utils import is_bin_in, is_package_installed, get_file_prefix
 from trackcluster import __version__
 from trackcluster.flow import flow_clusterj_all_gene_novel, flow_cluster_all_gene_novel, \
-    flow_count, flow_desc_annotation, flow_add_gene
+    flow_count, flow_desc_annotation, flow_add_gene, flow_mapping
 
 logger = logging.getLogger('summary')
 logger.setLevel(logging.INFO)
@@ -61,6 +61,37 @@ def __init__(self):
             exit(1)
         getattr(self, args.command)()
 
+    def map(self):
+        parser = argparse.ArgumentParser(
+            description="minimap2 mapping/samtools process wrapper, will generate prefix_s.bam"
+                        "trackrun.py -t 32 -g gemone.fa -f sample.fastq"
+        )
+        parser.add_argument("-d", "--folder", default=os.getcwd(),
+                            help="the folder contains all the seperated tracks in different locus/genes, default is the current dir")
+
+        parser.add_argument("-f", "--fastq", help="the fastq file ")
+        parser.add_argument("-g", "--genome", help="reference genome file")
+        parser.add_argument("-t", "--thread", default=32, type=int,
+                            help="the max thread used to run some of the process")
+        parser.add_argument("-p", "--prefix", default=None,
+                            help="prefix of output file, default is the prefix from --fastq")
+
+
+        arg_use = sys.argv[2:]
+        if len(arg_use) >= 4:
+            args = parser.parse_args(arg_use)
+        else:
+            parser.print_help()
+            sys.exit(1)
+        if args.prefix is None:
+            args.prefix=get_file_prefix(args.fastq,sep=".")
+
+        flow_mapping(wkdir=args.folder,
+                     ref_file=args.genome,
+                     fastq_file=args.fastq,
+                     prefix=args.prefix,
+                     core=args.thread)
+
 
     def addgene(self):
         parser = argparse.ArgumentParser(
@@ -226,7 +257,8 @@ def count(self):
                    prefix=args.prefix,
                    nano_bed=args.sample,
                    isoform_bed=args.isoform,
-                   gff_bed=args.reference)
+                   gff_bed=args.reference,
+                   )
 
     def desc(self):
         parser = argparse.ArgumentParser(
@@ -243,6 +275,8 @@ def desc(self):
                                  "be treated as the same.")
         parser.add_argument("-p", "--prefix", default=None,
                             help="prefix of output file, default is the prefix from --isoform")
+        parser.add_argument("-t", "--thread", default=10, type=int,
+                            help="the max thread used to run some of the process")
 
         arg_use=sys.argv[2:]
         if len(arg_use)>=4:
@@ -258,7 +292,8 @@ def desc(self):
                              isoform_bed=args.isoform,
                              gff_bed=args.reference,
                              offset=args.offset,
-                             prefix=args.prefix)
+                             prefix=args.prefix,
+                             core=args.thread)
 
     def test(self):
         """

diff --git a/test/bigGenePred.as b/test/bigGenePred.as
@@ -1,25 +1,25 @@
 table bigGenePred
 "bigGenePred gene models"
    (
-   string chrom;       "Reference sequence chromosome or scaffold"
-   uint   chromStart;  "Start position in chromosome"
-   uint   chromEnd;    "End position in chromosome"
-   string name;        "Name or ID of item, ideally both human readable and unique"
-   uint score;         "Score (0-1000)"
-   char[1] strand;     "+ or - for strand"
-   uint thickStart;    "Start of where display should be thick (start codon)"
-   uint thickEnd;      "End of where display should be thick (stop codon)"
-   uint reserved;       "RGB value (use R,G,B string in input file)"
-   int blockCount;     "Number of blocks"
-   int[blockCount] blockSizes; "Comma separated list of block sizes"
-   int[blockCount] chromStarts; "Start positions relative to chromStart"
-   string name2;       "Alternative/human readable name"
-   string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
-   string cdsEndStat;   "Status of CDS end annotation (none, unknown, incomplete, or complete)"
-   int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon"
-   string type;        "Transcript type"
-   string geneName;    "Primary identifier for gene"
-   string geneName2;   "Alternative/human readable gene name"
-   string geneType;    "Gene type"
+    string chrom; "Reference sequence chromosome or scaffold"
+    uint chromStart; "Start position in chromosome"
+    uint chromEnd; "End position in chromosome"
+    string name; "Name or ID of item, ideally both human readable and unique"
+    uint score; "Score (0-1000)"
+    char[1] strand; "+ or - for strand"
+    uint thickStart; "Start of where display should be thick (start codon)"
+    uint thickEnd; "End of where display should be thick (stop codon)"
+    uint reserved; "RGB value (use R,G,B string in input file)"
+    int blockCount; "Number of blocks"
+    int[blockCount] blockSizes; "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+    string name2; "Alternative/human readable name, |used to store subreads or coverage"
+    string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
+    string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)"
+    int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon"
+    string type; "Transcript type"
+    string geneName; "Primary identifier for gene"
+    string geneName2; "Alternative/human readable gene name, |used to store sample information such as stage or group"
+    string geneType; "Gene type"
    )
 
diff --git a/trackcluster/flow.py b/trackcluster/flow.py
@@ -36,12 +36,12 @@ def flow_mapping(wkdir,ref_file,fastq_file,prefix, core=16):
 
     os.chdir(wkdir)
 
-    cmd_map="minimap2 -ax splice -k14 -uf -t {core} {ref} {fastq_file} | samtools view -bS -F260 -q 30 > {prefix}.bam".format(
+    cmd_map="minimap2 -ax splice -k14 -uf -t {core} {ref} {fastq_file} | samtools view -bS -F260 -F2048 -q 30 > {prefix}.bam".format(
     prefix = prefix, core = core, fastq_file = fastq_file, ref = ref_file)
     print(cmd_map)
     myexe(cmd_map)
 
-    cmd_sam2="samtools sort -/@{core} {prefix}.bam >{prefix}_s.bam".format(prefix=prefix, core=core)
+    cmd_sam2="samtools sort -@{core} {prefix}.bam >{prefix}_s.bam".format(prefix=prefix, core=core)
     print(cmd_sam2)
     myexe(cmd_sam2)
 
@@ -237,8 +237,8 @@ def flow_count(wkdir, prefix, nano_bed, isoform_bed, gff_bed):
             line_count=[x/gsum*coverage for x in line_count]
         # the mutiple name part
         genename_l=bigg.geneName.split("||")
-        line_prefix=deque([bigg.name, coverage])
         for genename in genename_l:
+            line_prefix = deque([bigg.name, coverage])
             line_prefix.appendleft(genename)
             line_prefix.extend(line_count)
             expression_list.append(line_prefix)
@@ -303,6 +303,8 @@ def flow_clusterj_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,
     bigg_isoform_file=prefix+"_isoform.bed"
     bigg_isoform_cov5_file=prefix+"_cov{}_isoform.bed".format(count_cutoff)
 
+    bigg_isoform_novelgene_file=prefix+"_isoform_novel.bed"
+
     os.chdir(wkdir)
 
     # step1
@@ -345,6 +347,12 @@ def flow_clusterj_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,
     flow_preparedir(wkdir, prefix, bigg_regionmark_f, novel_bed, genename_file=novelname_file)  # write genename_file
     flow_key_clusterj(wkdir, novelname_file, core=core, batchsize=batchsize)
 
+    # glue the novel part and write the file down
+    bigg_nisoform=cat_bed("NOVEL*/*_simple_coveragej.bed") # use ** for all file in the wkdir
+    write_bigg(bigg_nisoform,bigg_isoform_novelgene_file)
+
+
+
     return 1
 ########################################################################################################
 
@@ -450,7 +458,7 @@ def flow_cluster_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,f1=0.01
 ########################################################################################################
 ########################################################################################################
 
-def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None):
+def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None, core=10):
     """
     generate all files needed to draw the desc figure for isoform classcification
     :param wkdir:
@@ -475,19 +483,35 @@ def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None):
     # class4: [bigg0.name, class4]
     class4 = []
     desc = []
-    for k in gene2isoform.keys():
-        #print(k)
+
+    # add muti core processing for this part
+    keys=list(gene2isoform.keys())
+
+    def process_class4(k):
         nanos = gene2isoform[k]
         refs = gene2ref[k]
-
         for bigg in nanos:
             try:
                 out_class4 = flow_class4(bigg, refs, offset=offset)
-                class4.append(out_class4)
+                return out_class4
+            except IndexError:
+                return None
+    def process_desc(k):
+        nanos = gene2isoform[k]
+        refs = gene2ref[k]
+        for bigg in nanos:
+            try:
                 out_desc = flow_desc(bigg, refs, offset=offset)
-                desc.append(out_desc)
+                return out_desc
             except IndexError:
-                pass
+                return None
+    print("Running class4")
+    class4=parmap(process_class4, tqdm(keys), nprocs=core)
+    class4=[x for x in class4 if x is not None]
+    print("Running desc")
+    desc=parmap(process_desc, tqdm(keys), nprocs=core)
+    desc=[x for x in desc if x is not None]
+
     # IO part
     list2file(desc, filename=prefix+"_desc.txt", sep="\t")
     list2file(class4, filename=prefix+"_class4.txt", sep="\t")
@@ -565,26 +589,7 @@ def flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=N
             fw.write(bigg.name + "\t" + str(bigg.exonlen) + "\t" + bigg.geneName + "\n")
 
     ####### fusion part
-    fusion_d = {}
-    used = set()
-
-    for bigg in read:
-        try:
-            fusion_d[bigg.name].append(bigg.geneName)
-        except KeyError:
-            fusion_d[bigg.name] = []
-            fusion_d[bigg.name].append(bigg.geneName)
-
-    fusion_d_real = {}
-    for k, v in fusion_d.items():
-        v_s = list(set(v))
-        if len(v_s) > 1:
-            fusion_d_real[k] = v
 
-    with open(prefix+"_fusion.txt", "w") as fw:
-        for k, v in fusion_d_real.items():
-            v_str = ";".join(v)
-            fw.write(k + "\t" + v_str + "\n")
 
 ############
 def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,

diff --git a/trackcluster/pre.py b/trackcluster/pre.py
@@ -61,7 +61,9 @@ def wrapper_bedtools_merge(bigg_file, out):
     """
 
     # merge with strandness, report to col5
-    cmd="bedtools merge -nonamecheck -s  -c 4 -o count -i {biggfile} > {out}".format(
+    # bedtools 2.26, can use -s -c 4 -o count
+    # bedtools 2.30, must use -s c 6 -o distinct,count
+    cmd="bedtools merge -nonamecheck -s  -c 6 -o distinct,count -i {biggfile} > {out}".format(
         biggfile=bigg_file, out=out
     )
     _=myexe(cmd)
@@ -118,7 +120,11 @@ def mergedbed2bigg(merge_bedfile, count_cutoff=5):
     with open(merge_bedfile, "r") as f:
         for line in f.readlines():
             line_l=line.strip().split("\t")
-            chro, start, end, strand, count=line_l
+            # to deal with the bedtools merge version unmatch bug
+            # before 2.26, result for -s -c 6 -o distinct, count return 6 col chro, start, end, strand, strand, count
+            # after 2.30, return 5 col, chro, start, end, strand, count
+            chro, start, end, strand =line_l[0:4]
+            count=line_l[-1]
 
             if int(count)>=count_cutoff:
                 # generate a chro_start_end string for name