Skip to content

Commit

Permalink
Merge pull request #10 from Runsheng/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
Runsheng authored Nov 2, 2022
2 parents cb5a0be + 3a77678 commit 88b7123
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 59 deletions.
14 changes: 9 additions & 5 deletions Readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# TrackCluster
![PyPI](https://img.shields.io/pypi/v/trackcluster?color=green)

Trackcluster is an isoform calling and quantification pipeline for Nanopore direct-RNA long reads.
Trackcluster is an isoform calling and quantification pipeline for long RNA/cDNA reads.

## Table of Contents

Expand All @@ -14,16 +14,20 @@ Trackcluster is an isoform calling and quantification pipeline for Nanopore dire
Hint: the ongoing development can be found in the ["dev"](https://github.com/Runsheng/trackcluster/tree/dev) branch.

## <a name="overview"></a>Overview
A pipeline for reference-based identification of isoform calling using Nanopore direct-RNA long reads. This pipeline was designed to use **only** long and nosisy reads to make a valid transcriptome. An indicator for the intact 5' could be very helpful to the pipeline, i.e, the splicing leader in the mRNA of nematodes.

It is recommended to combine all samples together to generate a new transcriptome reference. After this process, the expression of isoforms in each sample can be fetched by providing an "name:sample" table.
A pipeline for reference-based isoform identification and quantification using long reads. This pipeline was designed to use **only** long and nosisy reads to make a valid transcriptome. An indicator for the intact 5' could be very helpful to the pipeline, i.e, the splicing leader in the mRNA of nematodes.

The major input/output for this pipeline is "bigg"--["bigGenePred"](https://github.com/Runsheng/trackcluster/blob/master/test/bigGenePred.as) format.

## <a name="requirements"></a>Requirements

1. developed on python 3.9, tested on python 3.6 and above (or 2.7.10+), should work with most of the py3 versions
2. samtools V2.0+ , bedtools V2.24+ and minimap2 V2.24+ in your $PATH
```bash
# install the external bins with conda
conda install -c bioconda samtools
conda install -c bioconda bedtools
conda install -c bioconda minimap2
```

## Installation
```bash
Expand Down Expand Up @@ -83,6 +87,6 @@ trackrun.py desc --isoform reads_gene.bed --reference ref.bed # will generated r


## Citation
Please kindly cite our paper in Genome Research if you use trackcluster in your work.
Please kindly cite our paper for using trackcluster in your work.

Li, R., Ren, X., Ding, Q., Bi, Y., Xie, D. and Zhao, Z., 2020. Direct full-length RNA sequencing reveals unexpected transcriptome complexity during *Caenorhabditis elegans* development. **Genome research**, 30(2), pp.287-298.
41 changes: 38 additions & 3 deletions script/trackrun.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from trackcluster.utils import is_bin_in, is_package_installed, get_file_prefix
from trackcluster import __version__
from trackcluster.flow import flow_clusterj_all_gene_novel, flow_cluster_all_gene_novel, \
flow_count, flow_desc_annotation, flow_add_gene
flow_count, flow_desc_annotation, flow_add_gene, flow_mapping

logger = logging.getLogger('summary')
logger.setLevel(logging.INFO)
Expand Down Expand Up @@ -61,6 +61,37 @@ def __init__(self):
exit(1)
getattr(self, args.command)()

def map(self):
parser = argparse.ArgumentParser(
description="minimap2 mapping/samtools process wrapper, will generate prefix_s.bam"
"trackrun.py -t 32 -g gemone.fa -f sample.fastq"
)
parser.add_argument("-d", "--folder", default=os.getcwd(),
help="the folder contains all the seperated tracks in different locus/genes, default is the current dir")

parser.add_argument("-f", "--fastq", help="the fastq file ")
parser.add_argument("-g", "--genome", help="reference genome file")
parser.add_argument("-t", "--thread", default=32, type=int,
help="the max thread used to run some of the process")
parser.add_argument("-p", "--prefix", default=None,
help="prefix of output file, default is the prefix from --fastq")


arg_use = sys.argv[2:]
if len(arg_use) >= 4:
args = parser.parse_args(arg_use)
else:
parser.print_help()
sys.exit(1)
if args.prefix is None:
args.prefix=get_file_prefix(args.fastq,sep=".")

flow_mapping(wkdir=args.folder,
ref_file=args.genome,
fastq_file=args.fastq,
prefix=args.prefix,
core=args.thread)


def addgene(self):
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -226,7 +257,8 @@ def count(self):
prefix=args.prefix,
nano_bed=args.sample,
isoform_bed=args.isoform,
gff_bed=args.reference)
gff_bed=args.reference,
)

def desc(self):
parser = argparse.ArgumentParser(
Expand All @@ -243,6 +275,8 @@ def desc(self):
"be treated as the same.")
parser.add_argument("-p", "--prefix", default=None,
help="prefix of output file, default is the prefix from --isoform")
parser.add_argument("-t", "--thread", default=10, type=int,
help="the max thread used to run some of the process")

arg_use=sys.argv[2:]
if len(arg_use)>=4:
Expand All @@ -258,7 +292,8 @@ def desc(self):
isoform_bed=args.isoform,
gff_bed=args.reference,
offset=args.offset,
prefix=args.prefix)
prefix=args.prefix,
core=args.thread)

def test(self):
"""
Expand Down
40 changes: 20 additions & 20 deletions test/bigGenePred.as
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
table bigGenePred
"bigGenePred gene models"
(
string chrom; "Reference sequence chromosome or scaffold"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Name or ID of item, ideally both human readable and unique"
uint score; "Score (0-1000)"
char[1] strand; "+ or - for strand"
uint thickStart; "Start of where display should be thick (start codon)"
uint thickEnd; "End of where display should be thick (stop codon)"
uint reserved; "RGB value (use R,G,B string in input file)"
int blockCount; "Number of blocks"
int[blockCount] blockSizes; "Comma separated list of block sizes"
int[blockCount] chromStarts; "Start positions relative to chromStart"
string name2; "Alternative/human readable name"
string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)"
int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon"
string type; "Transcript type"
string geneName; "Primary identifier for gene"
string geneName2; "Alternative/human readable gene name"
string geneType; "Gene type"
string chrom; "Reference sequence chromosome or scaffold"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Name or ID of item, ideally both human readable and unique"
uint score; "Score (0-1000)"
char[1] strand; "+ or - for strand"
uint thickStart; "Start of where display should be thick (start codon)"
uint thickEnd; "End of where display should be thick (stop codon)"
uint reserved; "RGB value (use R,G,B string in input file)"
int blockCount; "Number of blocks"
int[blockCount] blockSizes; "Comma separated list of block sizes"
int[blockCount] chromStarts; "Start positions relative to chromStart"
string name2; "Alternative/human readable name, |used to store subreads or coverage"
string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)"
int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon"
string type; "Transcript type"
string geneName; "Primary identifier for gene"
string geneName2; "Alternative/human readable gene name, |used to store sample information such as stage or group"
string geneType; "Gene type"
)

63 changes: 34 additions & 29 deletions trackcluster/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ def flow_mapping(wkdir,ref_file,fastq_file,prefix, core=16):

os.chdir(wkdir)

cmd_map="minimap2 -ax splice -k14 -uf -t {core} {ref} {fastq_file} | samtools view -bS -F260 -q 30 > {prefix}.bam".format(
cmd_map="minimap2 -ax splice -k14 -uf -t {core} {ref} {fastq_file} | samtools view -bS -F260 -F2048 -q 30 > {prefix}.bam".format(
prefix = prefix, core = core, fastq_file = fastq_file, ref = ref_file)
print(cmd_map)
myexe(cmd_map)

cmd_sam2="samtools sort -/@{core} {prefix}.bam >{prefix}_s.bam".format(prefix=prefix, core=core)
cmd_sam2="samtools sort -@{core} {prefix}.bam >{prefix}_s.bam".format(prefix=prefix, core=core)
print(cmd_sam2)
myexe(cmd_sam2)

Expand Down Expand Up @@ -237,8 +237,8 @@ def flow_count(wkdir, prefix, nano_bed, isoform_bed, gff_bed):
line_count=[x/gsum*coverage for x in line_count]
# the mutiple name part
genename_l=bigg.geneName.split("||")
line_prefix=deque([bigg.name, coverage])
for genename in genename_l:
line_prefix = deque([bigg.name, coverage])
line_prefix.appendleft(genename)
line_prefix.extend(line_count)
expression_list.append(line_prefix)
Expand Down Expand Up @@ -303,6 +303,8 @@ def flow_clusterj_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,
bigg_isoform_file=prefix+"_isoform.bed"
bigg_isoform_cov5_file=prefix+"_cov{}_isoform.bed".format(count_cutoff)

bigg_isoform_novelgene_file=prefix+"_isoform_novel.bed"

os.chdir(wkdir)

# step1
Expand Down Expand Up @@ -345,6 +347,12 @@ def flow_clusterj_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,
flow_preparedir(wkdir, prefix, bigg_regionmark_f, novel_bed, genename_file=novelname_file) # write genename_file
flow_key_clusterj(wkdir, novelname_file, core=core, batchsize=batchsize)

# glue the novel part and write the file down
bigg_nisoform=cat_bed("NOVEL*/*_simple_coveragej.bed") # use ** for all file in the wkdir
write_bigg(bigg_nisoform,bigg_isoform_novelgene_file)



return 1
########################################################################################################

Expand Down Expand Up @@ -450,7 +458,7 @@ def flow_cluster_all_gene_novel(wkdir, prefix,nano_bed, gff_bed, core=30,f1=0.01
########################################################################################################
########################################################################################################

def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None):
def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None, core=10):
"""
generate all files needed to draw the desc figure for isoform classcification
:param wkdir:
Expand All @@ -475,19 +483,35 @@ def flow_desc_annotation(wkdir,isoform_bed, gff_bed, offset=10, prefix=None):
# class4: [bigg0.name, class4]
class4 = []
desc = []
for k in gene2isoform.keys():
#print(k)

# add muti core processing for this part
keys=list(gene2isoform.keys())

def process_class4(k):
nanos = gene2isoform[k]
refs = gene2ref[k]

for bigg in nanos:
try:
out_class4 = flow_class4(bigg, refs, offset=offset)
class4.append(out_class4)
return out_class4
except IndexError:
return None
def process_desc(k):
nanos = gene2isoform[k]
refs = gene2ref[k]
for bigg in nanos:
try:
out_desc = flow_desc(bigg, refs, offset=offset)
desc.append(out_desc)
return out_desc
except IndexError:
pass
return None
print("Running class4")
class4=parmap(process_class4, tqdm(keys), nprocs=core)
class4=[x for x in class4 if x is not None]
print("Running desc")
desc=parmap(process_desc, tqdm(keys), nprocs=core)
desc=[x for x in desc if x is not None]

# IO part
list2file(desc, filename=prefix+"_desc.txt", sep="\t")
list2file(class4, filename=prefix+"_class4.txt", sep="\t")
Expand Down Expand Up @@ -565,26 +589,7 @@ def flow_files_output(bigg_read_file, bigg_isoform_file, bigg_ref_file, prefix=N
fw.write(bigg.name + "\t" + str(bigg.exonlen) + "\t" + bigg.geneName + "\n")

####### fusion part
fusion_d = {}
used = set()

for bigg in read:
try:
fusion_d[bigg.name].append(bigg.geneName)
except KeyError:
fusion_d[bigg.name] = []
fusion_d[bigg.name].append(bigg.geneName)

fusion_d_real = {}
for k, v in fusion_d.items():
v_s = list(set(v))
if len(v_s) > 1:
fusion_d_real[k] = v

with open(prefix+"_fusion.txt", "w") as fw:
for k, v in fusion_d_real.items():
v_str = ";".join(v)
fw.write(k + "\t" + v_str + "\n")

############
def flow_map_convert_clusterj_count(wkdir, prefix, ref_fasta, fastq_l, nano_bed, gff_bed, core=30,
Expand Down
10 changes: 8 additions & 2 deletions trackcluster/pre.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def wrapper_bedtools_merge(bigg_file, out):
"""

# merge with strandness, report to col5
cmd="bedtools merge -nonamecheck -s -c 4 -o count -i {biggfile} > {out}".format(
# bedtools 2.26, can use -s -c 4 -o count
# bedtools 2.30, must use -s c 6 -o distinct,count
cmd="bedtools merge -nonamecheck -s -c 6 -o distinct,count -i {biggfile} > {out}".format(
biggfile=bigg_file, out=out
)
_=myexe(cmd)
Expand Down Expand Up @@ -118,7 +120,11 @@ def mergedbed2bigg(merge_bedfile, count_cutoff=5):
with open(merge_bedfile, "r") as f:
for line in f.readlines():
line_l=line.strip().split("\t")
chro, start, end, strand, count=line_l
# to deal with the bedtools merge version unmatch bug
# before 2.26, result for -s -c 6 -o distinct, count return 6 col chro, start, end, strand, strand, count
# after 2.30, return 5 col, chro, start, end, strand, count
chro, start, end, strand =line_l[0:4]
count=line_l[-1]

if int(count)>=count_cutoff:
# generate a chro_start_end string for name
Expand Down

0 comments on commit 88b7123

Please sign in to comment.