forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_metaphlan.pl
executable file
·153 lines (103 loc) · 3.88 KB
/
run_metaphlan.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/perl
use warnings;
use strict;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Parallel::ForkManager;
my $metaphlan_dir='/home/mlangill/programs/metaphlan/';
my $metaphlan_script=$metaphlan_dir.'metaphlan.py';
my $metaphlan_db=$metaphlan_dir.'bowtie2db/mpa';
my $metaphlan_merge=$metaphlan_dir.'utils/merge_metaphlan_tables.py';
#location to store intermediate metaphlan files
my $metaphlan_out_dir='./metaphlan_out/';
my ($final_out_file,$parallel,$help);
my $res = GetOptions("output=s" => \$final_out_file,
"parallel:i"=>\$parallel,
"help"=>\$help,
)or pod2usage(2);
pod2usage(-verbose=>2) if $help;
pod2usage($0.': You must specify an output file.') unless defined $final_out_file;
my $cpu_count=0;
#if the option is set
if(defined($parallel)){
#option is set but with no value then use the max number of proccessors
if($parallel ==0){
#load this module dynamically
eval("use Sys::CPU;");
$cpu_count=Sys::CPU::cpu_count();
}else{
$cpu_count=$parallel;
}
}
my $pm = new Parallel::ForkManager($cpu_count);
#create output directory
system("mkdir -p $metaphlan_out_dir");
my @files=@ARGV;
my %paired_files;
my $gzipped=0;
foreach(@files){
my ($file,$dir,$suffix)=fileparse($_, qr/\.[^.]*/);
if($suffix eq '.gz'){
$gzipped=1;
}
my $name=$file;
if($file =~ /(.+)_R[1|2]_/){
$name=$1;
}
push(@{$paired_files{$name}},$_);
}
my @out_files = map ($metaphlan_out_dir.$_, keys %paired_files);
foreach my $name (keys %paired_files){
my $pid = $pm->start and next;
my $cat;
if ($gzipped){
$cat='zcat';
}else{
$cat='cat';
}
my $out_file=$metaphlan_out_dir.$name;
my $cmd=join(' ',$cat,@{$paired_files{$name}});
$cmd.=" | $metaphlan_script --input_type multifastq --bt2_ps sensitive-local --bowtie2db $metaphlan_db --no_map > $out_file";
print $cmd,"\n";
system($cmd);
$pm->finish;
}
#Wait for all samples to be processed
$pm->wait_all_children;
#merge metaphlan output
my $merge_cmd=$metaphlan_merge.' '.join(' ',@out_files).' > '.$final_out_file;
print $merge_cmd,"\n";
system($merge_cmd);
__END__
=head1 Name
run_metaphlan.pl - Provide a simpler way to run metaphlan
=head1 USAGE
run_metaphlan.pl [-p [<# proc>] -h] -o out.txt <list of fastq files>
E.g.
run_metaphlan.pl -o out.txt sample1.fastq sample2.fastq sample3.fastq
#shorter way of writing the same thing
run_metaphlan.pl -o out.txt *.fastq
#Run in parallel and use all CPUs
run_metaphlan.pl -o out.txt *.fastq -p
#Run in parallel limit to only 2 CPUs
run_metaphlan.pl -o out.txt *.fastq -p 2
#fastq files can be gzipped (note: all files must be either gzipped or not. Can't be a mix)
run_metaphlan.pl -o out.txt *.fastq.gz
#paired end files can be handled by concatentating them on the fly (files must have "_R1_" and "_R2_" within the file name)
run_metaphlan.pl -o out.txt sample1_R1_001.fastq.gz sample1_R2_001.fastq.gz sample2_R1_001.fastq.gz sample2_R2_001.fastq.gz
=head1 OPTIONS
=over 4
=item B<-d, --output <file>>
Mandatory. The name of the file for the merged data to be written to.
=item B<-p, --parallel [<# of proc>]>
Using this option without a value will use all CPUs on machine, while giving it a value will limit to that many CPUs. Without option only one CPU is used.
=item B<-h, --help>
Displays the entire help documentation.
=back
=head1 DESCRIPTION
B<run_metaphlan.pl> This script allows for easier running of the metaphlan pipeline for taxonomy assingment on metagenomic data. In particular it automates the running of multiple metagenomic samples and merges the data into a single output table. It handles combining paired end data by contatentating the files on the fly. It also handles gzipped fastq files without creating uncompressed intermediate files.
Before use make sure you have installed Bowtie2 and the metaphlan package.
=head1 AUTHOR
Morgan Langille, E<lt>morgan.g.i.langille@gmail.comE<gt>
=cut