forked from Kinggerm/GetOrganelle
-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_organelle_reads.py
executable file
·2753 lines (2619 loc) · 160 KB
/
get_organelle_reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
import datetime
import sys
import os
from copy import deepcopy
from optparse import OptionParser, OptionGroup
from VERSIONS import get_versions
# Directory containing this script; used to locate the bundled Library package.
PATH_OF_THIS_SCRIPT = os.path.split(os.path.realpath(__file__))[0]
sys.path.append(os.path.join(PATH_OF_THIS_SCRIPT, ".."))
from Library.pipe_control_func import *
from Library.seq_parser import *
# NOTE(review): duplicate of the assignment above; kept byte-identical here.
PATH_OF_THIS_SCRIPT = os.path.split(os.path.realpath(__file__))[0]
# Bundled reference locations: blast notation references and fasta seed references.
NOT_REF_PATH = os.path.join(PATH_OF_THIS_SCRIPT, "Library", "NotationReference")
SEQ_REF_PATH = os.path.join(PATH_OF_THIS_SCRIPT, "Library", "SeqReference")
import time
# Require Python 2.7+ or 3.5+; the label chosen here drives the conditional
# imports below (commands.getstatusoutput moved to subprocess in Python 3).
MAJOR_VERSION, MINOR_VERSION = sys.version_info[:2]
if MAJOR_VERSION == 2 and MINOR_VERSION >= 7:
    PYTHON_VERSION = "2.7+"
elif MAJOR_VERSION == 3 and MINOR_VERSION >= 5:
    PYTHON_VERSION = "3.5+"
else:
    sys.stdout.write("Python version have to be 2.7+ or 3.5+")
    sys.exit(0)
if PYTHON_VERSION == "2.7+":
    from commands import getstatusoutput
else:
    from subprocess import getstatusoutput
import subprocess
# Status value used to detect a missing external command — presumably the
# getstatusoutput "command not found" status per Python version (127, or
# 127 << 8 == 32512 on Python 2); TODO confirm against pipe_control_func usage.
DEAD_CODE = {"2.7+": 32512, "3.5+": 127}[PYTHON_VERSION]
# Upper bound on word_size / read_length ratio (raised for fungus_mt/anonym later).
MAX_RATIO_RL_WS = 0.75
# Absolute minimum acceptable word size for extension.
GLOBAL_MIN_WS = 49
def get_options(descriptions, version):
    """Build the option parser, parse sys.argv, and validate/normalize options.

    '-h' prints a brief help of frequently-used options and exits; '--help'
    prints the verbose help and exits. Otherwise this function validates the
    input combination, applies the scheme presets ('--safe', '--fast',
    '--memory-save', '--memory-unlimited') without overriding flags the user
    gave explicitly on the command line, adjusts organelle-type dependent
    defaults, checks external executables (bowtie2, SPAdes, blast), and
    initializes logging.

    Returns:
        (options, log, previous_attributes): the parsed options, a configured
        logger, and a LogInfo object describing the previous run when
        resuming ('--continue'), else None.
    """
    usage = "\n### Plant Plastome, Normal, 2*(1G raw data, 150 bp) reads\n" + str(os.path.basename(__file__)) + \
            " -1 sample_1.fq -2 sample_2.fq -s cp_reference.fasta -o plastome_output " \
            " -R 15 -k 75,85,95,105 -F plant_cp\n" \
            "### Plant Mitochondria\n" + str(os.path.basename(__file__)) + \
            " -1 sample_1.fq -2 sample_2.fq -s mt_reference.fasta -o mitochondria_output " \
            " -R 30 -k 65,75,85,95,105 -F plant_mt\n" \
            "### Plant Nuclear Ribosomal RNA (18S-ITS1-5.8S-ITS2-26S)\n" + str(os.path.basename(__file__)) + \
            " -1 sample_1.fq -2 sample_2.fq -o nr_output " \
            " -R 7 -k 95,105,115 -P 0 -F plant_nr"
    parser = OptionParser(usage=usage, version=version, description=descriptions, add_help_option=False)
    # group 1
    group_inout = OptionGroup(parser, "IN-OUT OPTIONS", "Options on inputs and outputs")
    group_inout.add_option("-1", dest="fq_file_1",
                           help="Input file with forward paired-end reads to be added to the pool.")
    group_inout.add_option("-2", dest="fq_file_2",
                           help="Input file with reverse paired-end reads to be added to the pool.")
    group_inout.add_option("-u", dest="unpaired_fq_files",
                           help="Input file(s) with unpaired (single-end) reads to be added to the pool. "
                                "files could be comma-separated lists such as 'seq1,seq2'.")
    group_inout.add_option("-o", dest="output_base", help="Output directory. Overwriting files if directory exists.")
    group_inout.add_option("-s", dest="seed_file", default=os.path.join(SEQ_REF_PATH, "*.fasta"),
                           help="Reference. Input fasta format file as initial seed. "
                                "A reference sequence in GetOrganelle is only used for identifying initial "
                                "organelle reads. The assembly process is purely de novo."
                                "Default: '%default' (* depends on the value followed with flag '-F')")
    group_inout.add_option("-a", dest="anti_seed",
                           help="Anti-reference. Not suggested unless what you really know what you are doing. "
                                "Input fasta format file as anti-seed, where the extension process "
                                "stop. Typically serves as excluding chloroplast reads when extending mitochondrial "
                                "reads, or the other way around. You should be cautious about using this option, "
                                "because if the anti-seed includes some word in the target but not in the seed, "
                                "the result would have gaps. For example, use the plant_mt and plant_cp from the same "
                                "plant-species as seed and anti-seed.")
    group_inout.add_option("--max-reads", dest="maximum_n_reads", type=float, default=1.5E7,
                           help="Maximum number of reads to be used per file. "
                                "Default: 1.5E7 (-F plant_cp/plant_nr/animal_mt/fungus_mt) "
                                "or 7.5E7 (-F plant_mt/anonym)")
    group_inout.add_option("--max-ignore-percent", dest="maximum_ignore_percent", type=float, default=0.01,
                           help="The maximum percent of bases to be ignore in extension, due to low quality. "
                                "Default: %default")
    group_inout.add_option("--min-quality-score", dest="min_quality_score", type=int, default=1,
                           help="Minimum quality score in extension. This value would be automatically increased "
                                "to prevent ignoring too much raw data (see --max-ignore-percent)."
                                "Default: %default ('\"' in Phred+33; 'A' in Phred+64/Solexa+64)")
    group_inout.add_option("--prefix", dest="prefix", default="",
                           help="Add extra prefix to resulting files under the output directory.")
    group_inout.add_option("--out-per-round", dest="fg_out_per_round", action="store_true", default=False,
                           help="Enable output per round. Choose to save memory but cost more time per round.")
    group_inout.add_option("--keep-temp", dest="keep_temp_files", action="store_true", default=False,
                           help="Choose to keep the running temp/index files.")
    # group 2
    group_scheme = OptionGroup(parser, "SCHEME OPTIONS", "Options on running schemes.")
    group_scheme.add_option("-F", dest="organelle_type",
                            help="This flag should be followed with plant_cp (chloroplast), plant_mt "
                                 "(plant mitochondrion), plant_nr (plant nuclear ribosomal RNA), animal_mt "
                                 "(animal mitochondrion), fungus_mt (fungus mitochondrion), "
                                 "or anonym (uncertain organelle genome type, with customized gene database "
                                 "('--genes'), which is suggested only when the above database is genetically distant "
                                 "from your sample).")
    group_scheme.add_option("--safe", dest="safe_strategy", default=False, action="store_true",
                            help="=\"-R 200 --max-reads 2E8 -J 1 -M 1 --min-quality-score -5 --max-ignore-percent 0 "
                                 "--max-n-words 4E9 -k 55,65,75,85,95,105,115,125\" "
                                 "This option is suggested for lowly-covered or nonhomogeneously-covered data (poor "
                                 "data). You can overwrite the value of a specific option listed above by adding "
                                 "that option along with the \"--safe\" flag. "
                                 "Intensive resources (time and data) would be utilized for a complete result. "
                                 "If you can already get the circular/complete result from other option combinations, "
                                 "you do not need to use this option, because the circular result by GetOrganelle is "
                                 "generally consistent under different options. If you still can not get a complete "
                                 "(say a circular plastome) under this option, you need to adjust '-w' and '-k' "
                                 "carefully to confirm whether it was the problem of the dataset. ")
    group_scheme.add_option("--fast", dest="fast_strategy", default=False, action="store_true",
                            help="=\"-R 10 --max-reads 5E6 -t 4 -J 5 -M 7 --max-n-words 3E7 --larger-auto-ws\" "
                                 "This option is suggested for homogeneously and highly covered data (very fine data). "
                                 "You can overwrite the value of a specific option listed above by adding "
                                 "that option along with the \"--fast\" flag. "
                                 "You could try GetOrganelle with this option for a list of samples and run a second "
                                 "time without this option for the rest with incomplete results. ")
    group_scheme.add_option("--memory-save", dest="memory_save", default=False, action="store_true",
                            help="=\"--out-per-round -P 0 --remove-duplicates 0\" "
                                 "You can overwrite the value of a specific option listed above by adding "
                                 "that option along with the \"--memory-save\" flag. ")
    group_scheme.add_option("--memory-unlimited", dest="memory_unlimited", default=False, action="store_true",
                            help="=\"-P 1E7 --index-in-memory --remove-duplicates 2E8 "
                                 "--min-quality-score -5 --max-ignore-percent 0\" "
                                 "You can overwrite the value of a specific option listed above by adding "
                                 "that option along with the \"--memory-unlimited\" flag. ")
    # group 3
    group_extending = OptionGroup(parser, "EXTENDING OPTIONS", "Options on the performance of extending process")
    group_extending.add_option("-w", dest="word_size", type=float,
                               help="Word size (W) for pre-grouping (if not assigned by '--pre-w') and extending "
                                    "process. This script would try to guess (auto-estimate) a proper W "
                                    "using an empirical function based on average read length, reads quality, "
                                    "target genome coverage, and other variables that might influence the extending "
                                    "process. You could assign the ratio (1>input>0) of W to "
                                    "read_length, based on which this script would estimate the W for you; "
                                    "or assign an absolute W value (read length>input>=35). Default: auto-estimated.")
    group_extending.add_option("--pre-w", dest="pregroup_word_size", type=float,
                               help="Word size (W) for pre-grouping. Used to reproduce result when word size is "
                                    "a certain value during pregrouping process and later changed during reads "
                                    "extending process. Similar to word size. Default: auto-estimated.")
    group_extending.add_option("-R", "--max-rounds", dest="max_rounds", type=int, default=100,
                               help="Maximum number of running rounds (>=2). Default: %default.")
    group_extending.add_option("-r", "--min-rounds", dest="min_rounds", type=int, default=5,
                               help="Minimum number of running rounds (>=1). If 'auto_word_size_step' is enabled "
                                    "(see '--auto-wss' for more) and extending stopped before finishing the designed "
                                    "minimum rounds, automatically restart extending with a smaller word size. "
                                    "Default: %default")
    group_extending.add_option("--max-n-words", dest="maximum_n_words", type=float, default=4E8,
                               help="Maximum number of words to be used in total."
                                    "Default: 4E8 (-F plant_cp), 8E7 (-F plant_nr/animal_mt/fungus_mt), "
                                    "2E9 (-F plant_mt)")
    group_extending.add_option("-J", dest="jump_step", type=int, default=3,
                               help="The wide of steps of checking words in reads during extending process "
                                    "(integer >= 1). When you have reads of high quality, the larger the number is, "
                                    "the faster the extension will be, "
                                    "the more risk of missing reads in low coverage area. "
                                    "Choose 1 to choose the slowest but safest extension strategy. Default: %default")
    group_extending.add_option("-M", dest="mesh_size", type=int, default=2,
                               help="(Beta parameter) "
                                    "The wide of steps of building words from seeds during extending process "
                                    "(integer >= 1). When you have reads of high quality, the larger the number is, "
                                    "the faster the extension will be, "
                                    "the more risk of missing reads in low coverage area. "
                                    "Another usage of this mesh size is to choose a larger mesh size coupled with a "
                                    "smaller word size, which makes smaller word size feasible when memory is limited."
                                    "Choose 1 to choose the slowest but safest extension strategy. Default: %default")
    group_extending.add_option("--no-bowtie2", dest="utilize_mapping", action="store_false", default=True,
                               help="Choose to disable mapping process."
                                    "By default, this script would map original reads to pre-seed (or bowtie2 index) "
                                    "to acquire the initial seed. This requires bowtie2 to be installed "
                                    "(and thus Linux/Unix only). It is suggested to keep mapping as default "
                                    "when the seed is too diverse from target.")
    group_extending.add_option("--bowtie2-options", dest="bowtie2_options", default="--very-fast -t",
                               help="Bowtie2 options, such as '--ma 3 --mp 5,2 --very-fast -t'. Default: %default.")
    group_extending.add_option("--rough-pre-reading", dest="pre_reading", default=True, action="store_false",
                               help="Roughly pre-reading the read characteristics, in which case a user-defined "
                                    "word size should be given. Default: False.")
    group_extending.add_option("--auto-wss", dest="auto_word_size_step", default=0, type=int,
                               help="The step of word size adjustment during extending process."
                                    "Use 0 to disable this automatic adjustment (NOT suggested). Default: %default.")
    group_extending.add_option("--larger-auto-ws", dest="larger_auto_ws", default=False, action="store_true",
                               help="By using this flag, the empirical function for estimating W would tend to "
                                    "produce a relative larger W, which would speed up the matching in extending, "
                                    "reduce the memory cost in extending, but increase the risk of broken final "
                                    "graph. Suggested when the data is good with high and homogenous coverage.")
    group_extending.add_option("--soft-max-words", dest="soft_max_words", type=float, default=8E7,
                               help="Maximum number of words to be used to test the fitness of a word size value."
                                    "Default: 8E7 (-F plant_cp), 1.6E7 (-F plant_nr/animal_mt/fungus_mt), "
                                    "4E8 (-F plant_mt)")
    group_extending.add_option("--target-genome-size", dest="target_genome_size", default=130000, type=int,
                               help="Hypothetical value of target genome size. This is only used for estimating "
                                    "word size when no '-w word_size' is given. No need to be precise."
                                    "Default: 130000 (-F plant_cp) or 390000 (-F plant_mt/anonym) or "
                                    "65000 (-F fungus_mt) or 13000 (-F plant_nr/animal_mt) ")
    # group 4
    group_assembly = OptionGroup(parser, "ASSEMBLY OPTIONS", "These options are about the assembly and "
                                                             "graph disentangling")
    group_assembly.add_option("-k", dest="spades_kmer", default="75,85,95",
                              help="SPAdes kmer settings. Use the same format as in SPAdes. illegal kmer values "
                                   "would be automatically discarded by GetOrganelle. "
                                   "Default: %default")
    group_assembly.add_option("--spades-options", dest="other_spades_options", default="",
                              help="Other SPAdes options. Use double quotation marks to include all the arguments "
                                   "and parameters.")
    group_assembly.add_option("--no-spades", dest="run_spades", action="store_false", default=True,
                              help="Disable SPAdes.")
    group_assembly.add_option("--no-gradient-k", dest="auto_gradient_k", action="store_false", default=True,
                              help="Disable adding an extra set of kmer values according to word size.")
    group_assembly.add_option("--genes", dest="genes_fasta",
                              help="Customized database (fasta format) containing ONE set of protein coding genes "
                                   "and ribosomal RNAs from ONE reference genome. "
                                   "This database is only used for identifying/tagging target organelle contigs, "
                                   "while the gene order of output, meaning the scaffolding/disentangling process "
                                   "is purely decided by the topology of de Bruijn graph and the coverage info. "
                                   "Used when '-F anonym'.")
    group_assembly.add_option("--disentangle-df", dest="disentangle_depth_factor", default=10.0, type=float,
                              help="Depth factor for differentiate genome type of contigs. "
                                   "The genome type of contigs are determined by blast. "
                                   "You can also make the blast index by your self and add those index to '" +
                                   NOT_REF_PATH + "'. Default: %default")
    group_assembly.add_option("--contamination-depth", dest="contamination_depth", default=5., type=float,
                              help="Depth factor for confirming contamination in parallel contigs. Default: %default")
    group_assembly.add_option("--contamination-similarity", dest="contamination_similarity", default=0.9, type=float,
                              help="Similarity threshold for confirming contaminating contigs. Default: %default")
    group_assembly.add_option("--no-degenerate", dest="degenerate", default=True, action="store_false",
                              help="Disable making consensus from parallel contig based on nucleotide degenerate table.")
    group_assembly.add_option("--degenerate-depth", dest="degenerate_depth", default=1.5, type=float,
                              help="Depth factor for confirming parallel contigs. Default: %default")
    group_assembly.add_option("--degenerate-similarity", dest="degenerate_similarity", default=0.98, type=float,
                              help="Similarity threshold for confirming parallel contigs. Default: %default")
    group_assembly.add_option("--disentangle-time-limit", dest="disentangle_time_limit", default=600, type=int,
                              help="Time limit (second) for each try of disentangling a graph file as a circular "
                                   "genome. Disentangling a graph as contigs is not limited. Default: %default")
    # group 5
    group_computational = OptionGroup(parser, "ADDITIONAL OPTIONS", "")
    group_computational.add_option("-t", dest="threads", type=int, default=1,
                                   help="Maximum threads to use.")
    group_computational.add_option("-P", dest="pre_grouped", type=float, default=2E5,
                                   help="The maximum number (integer) of high-covered reads to be pre-grouped "
                                        "before extending process. pre_grouping is suggested when the whole genome "
                                        "coverage is shallow but the organ genome coverage is deep. "
                                        "The default value is 2E5. "
                                        "For personal computer with 8G memory, we suggest no more than 3E5. "
                                        "A larger number (ex. 6E5) would run faster but exhaust memory "
                                        "in the first few minutes. Choose 0 to disable this process.")
    group_computational.add_option("--continue", dest="script_resume", default=False, action="store_true",
                                   help="Several check points based on files produced, rather than log, "
                                        "so keep in mind that this script will not detect the difference "
                                        "between this input parameters and the previous ones.")
    group_computational.add_option("--index-in-memory", dest="index_in_memory", action="store_true", default=False,
                                   help="Keep index in memory. Choose save index in memory than in disk.")
    group_computational.add_option("--remove-duplicates", dest="rm_duplicates", default=1E7, type=float,
                                   help="By default this script use unique reads to extend. Choose the number of "
                                        "duplicates (integer) to be saved in memory. A larger number (ex. 2E7) would "
                                        "run faster but exhaust memory in the first few minutes. "
                                        "Choose 0 to disable this process. "
                                        "Note that whether choose or not will not disable "
                                        "the calling of replicate reads. Default: %default.")
    group_computational.add_option("--flush-frequency", dest="echo_frequency", default=54321, type=int,
                                   help="Flush frequency for presenting progress. Default: %default")
    group_computational.add_option("--verbose", dest="verbose_log", action="store_true", default=False,
                                   help="Verbose output. Choose to enable verbose running log.")
    parser.add_option("-h", dest="simple_help", default=False, action="store_true",
                      help="print brief introduction for frequently-used options.")
    parser.add_option("--help", dest="verbose_help", default=False, action="store_true",
                      help="print verbose introduction for all options.")
    if "--help" in sys.argv:
        # Verbose help: show every group, then exit.
        parser.add_option_group(group_inout)
        parser.add_option_group(group_scheme)
        parser.add_option_group(group_extending)
        parser.add_option_group(group_assembly)
        parser.add_option_group(group_computational)
        parser.print_help()
        exit()
    elif "-h" in sys.argv:
        # Brief help: drop rarely-used options and re-add the common ones
        # with shortened help texts, then exit.
        for not_often_used in ("-a", "--max-reads", "--max-ignore-percent",
                               "--min-quality-score", "--prefix", "--out-per-round", "--keep-temp",
                               "--memory-save", "--memory-unlimited", "--pre-w", "-r", "--max-n-words",
                               "-J", "-M", "--no-bowtie2", "--bowtie2-options", "--rough-pre-reading",
                               "--auto-wss", "--larger-auto-ws",
                               "--soft-max-words", "--target-genome-size", "--spades-options", "--no-spades",
                               "--no-gradient-k", "--genes", "--disentangle-df", "--contamination-depth",
                               "--contamination-similarity", "--no-degenerate",
                               "--degenerate-depth", "--degenerate-similarity", "--disentangle-time-limit",
                               "--continue", "--index-in-memory",
                               "--remove-duplicates", "--flush-frequency", "--verbose"):
            parser.remove_option(not_often_used)
        parser.remove_option("-1")
        parser.add_option("-1", dest="fq_file_1", help="Input file with forward paired-end reads.")
        parser.remove_option("-2")
        parser.add_option("-2", dest="fq_file_2", help="Input file with reverse paired-end reads.")
        parser.remove_option("-u")
        parser.add_option("-u", dest="unpaired_fq_files", help="Input file(s) with unpaired (single-end) reads. ")
        parser.remove_option("-o")
        parser.add_option("-o", dest="output_base", help="Output directory.")
        parser.remove_option("-s")
        parser.add_option("-s", dest="seed_file", help="Reference. Input fasta format file as initial seed. "
                                                       "Default: " + os.path.join(SEQ_REF_PATH, "*.fasta"))
        parser.remove_option("-w")
        parser.add_option("-w", dest="word_size", help="Word size (W) for extension. Default: auto-estimated")
        parser.remove_option("-R")
        parser.add_option("-R", dest="max_rounds", help="Maximum running rounds (>=2). Default: unlimited")
        parser.remove_option("-F")
        parser.add_option("-F", dest="organelle_type", default="plant_cp",
                          help="Target organelle genome type: "
                               "plant_cp/plant_mt/plant_nr/animal_mt/fungus_mt/anonym. Default: %default")
        parser.remove_option("--safe")
        parser.add_option("--safe", dest="safe_strategy",
                          help="=\"-R 200 --max-reads 2E8 -J 1 -M 1 --min-quality-score -5 --max-ignore-percent 0 "
                               "--max-n-words 4E9 -k 55,65,75,85,95,105,115,125 "
                               "--disentangle-time-limit 3600\"")
        parser.remove_option("--fast")
        parser.add_option("--fast", dest="fast_strategy",
                          help="=\"-R 10 --max-reads 5E6 -t 4 -J 5 -M 7 --max-n-words 3E7 --larger-auto-ws "
                               "--disentangle-time-limit 180 --no-gradient-k\"")
        parser.remove_option("-k")
        parser.add_option("-k", dest="spades_kmer", default="75,85,95", help="SPAdes kmer settings. Default: %default")
        parser.remove_option("-t")
        parser.add_option("-t", dest="threads", type=int, default=1, help="Maximum threads to use. Default: %default")
        parser.remove_option("-P")
        parser.add_option("-P", dest="pre_grouped", default=int(2E5), help="Pre-grouping value. Default: %default")
        parser.print_help()
        sys.stdout.write("\n")
        exit()
    else:
        parser.add_option_group(group_inout)
        parser.add_option_group(group_scheme)
        parser.add_option_group(group_extending)
        parser.add_option_group(group_assembly)
        parser.add_option_group(group_computational)
        try:
            (options, args) = parser.parse_args()
        except Exception as e:
            sys.stdout.write("\n############################################################################" + str(e))
            sys.stdout.write("\n\"-h\" for more usage")
            exit()
        else:
            # --- input combination checks ---
            if not (options.seed_file  # or options.bowtie2_seed)
                    and ((options.fq_file_1 and options.fq_file_2) or options.unpaired_fq_files)
                    and options.output_base and options.organelle_type):
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: Insufficient arguments!\nUsage:")
                sys.stdout.write(usage + "\n\n")
                exit()
            if int(bool(options.fq_file_1)) + int(bool(options.fq_file_2)) == 1:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: unbalanced paired reads!\n\n")
                exit()
            if options.organelle_type not in {"plant_cp", "plant_mt", "plant_nr", "animal_mt", "fungus_mt", "anonym"}:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: \"-F\" MUST be one of 'plant_cp', 'plant_mt', 'plant_nr', "
                                 "'animal_mt', 'fungus_mt', 'anonym'!\n\n")
                # bug fix: previously fell through and continued with an invalid '-F' value
                exit()
            if "*" in options.seed_file:
                # '-s' still holds the templated default: substitute the
                # organelle type, except for the types that require explicit input.
                if options.organelle_type == "anonym":
                    sys.stdout.write("\n############################################################################"
                                     "\nERROR: \"-s\" and \"--genes\" must be specified when \"-F anonym\"!\n\n")
                    exit()
                elif options.organelle_type == "animal_mt":
                    sys.stdout.write("\n############################################################################"
                                     "\nERROR: Currently \"-s\" and \"--genes\" must be specified when "
                                     "\"-F animal_mt\"!\n\n")
                    exit()
                else:
                    options.seed_file = options.seed_file.replace("*", options.organelle_type)
            if options.organelle_type in {"anonym", "animal_mt"} and not options.genes_fasta:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: \"-s\" and \"--genes\" must be specified when \"-F anonym\"!\n\n")
                exit()
            for check_file in (options.fq_file_1, options.fq_file_2, options.seed_file, options.anti_seed):
                if check_file:
                    if not os.path.exists(check_file):
                        sys.stdout.write("\n############################################################################"
                                         "\nERROR: " + check_file + " not found!\n\n")
                        exit()
            if options.unpaired_fq_files:
                options.unpaired_fq_files = options.unpaired_fq_files.split(",")
                for fastq_file in options.unpaired_fq_files:
                    if not os.path.exists(fastq_file):
                        sys.stdout.write("\n############################################################################"
                                         "\nERROR: " + fastq_file + " not found!\n\n")
                        exit()
            else:
                options.unpaired_fq_files = []
            if options.jump_step < 1:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: Jump step MUST be an integer that >= 1")
                exit()
            if options.mesh_size < 1:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: Mesh size MUST be an integer that >= 1")
                exit()
            if options.fq_file_1 == options.fq_file_2 and options.fq_file_1:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: 1st fastq file is the same with 2nd fastq file!")
                exit()
            if options.safe_strategy and options.fast_strategy:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: \"--safe\" and \"--fast\" are not compatible!")
                # bug fix: previously continued with both presets applied
                exit()
            if options.memory_save and options.memory_unlimited:
                sys.stdout.write("\n############################################################################"
                                 "\nERROR: \"--memory-save\" and \"--memory-unlimited\" are not compatible!")
                # bug fix: previously continued with both presets applied
                exit()
            options.prefix = os.path.basename(options.prefix)
            # --- logging & resume setup ---
            if options.script_resume and os.path.isdir(options.output_base):
                previous_attributes = LogInfo(options.output_base, options.prefix)
            else:
                previous_attributes = None
                options.script_resume = False
            if not os.path.isdir(options.output_base):
                os.mkdir(options.output_base)
                options.script_resume = False
            log = simple_log(logging.getLogger(), options.output_base, options.prefix + "get_org.")
            log.info("")
            log.info(descriptions)
            log.info("Python " + sys.version)
            log.info(" ".join(sys.argv) + "\n")
            log = timed_log(log, options.output_base, options.prefix + "get_org.")
            # --- word size validation: None (auto), ratio in (0, 1), or absolute >= GLOBAL_MIN_WS ---
            if options.word_size is None:
                pass  # auto-estimated later
            elif 0 < options.word_size < 1:
                pass  # ratio of read length
            elif options.word_size >= GLOBAL_MIN_WS:
                options.word_size = int(options.word_size)
            else:
                log.error("Illegal word size (\"-w\") value!")
                exit()
            if options.pregroup_word_size:
                if 0 < options.pregroup_word_size < 1:
                    pass
                elif options.pregroup_word_size >= GLOBAL_MIN_WS:
                    options.pregroup_word_size = int(options.pregroup_word_size)
                else:
                    log.error("Illegal word size (\"--pre-w\") value!")
                    exit()
            # --- scheme presets: each value applies only when not given explicitly ---
            if options.safe_strategy:
                if "-R" not in sys.argv and "--max-rounds" not in sys.argv:
                    options.max_rounds = 200
                if "--max-reads" not in sys.argv:
                    options.maximum_n_reads = 2E8
                if "-J" not in sys.argv:
                    options.jump_step = 1
                if "-M" not in sys.argv:
                    options.mesh_size = 1
                if "--min-quality-score" not in sys.argv:
                    options.min_quality_score = -5
                if "--max-ignore-percent" not in sys.argv:
                    options.maximum_ignore_percent = 0
                if "--max-n-words" not in sys.argv:
                    options.maximum_n_words = 4E9
                if "-k" not in sys.argv:
                    options.spades_kmer = "55,65,75,85,95,105,115,125"
                if "--disentangle-time-limit" not in sys.argv:
                    options.disentangle_time_limit = 3600
            if options.fast_strategy:
                if "-R" not in sys.argv and "--max-rounds" not in sys.argv:
                    options.max_rounds = 10
                if "--max-reads" not in sys.argv:
                    options.maximum_n_reads = 5E6
                if "-t" not in sys.argv:
                    options.threads = 4
                if "-J" not in sys.argv:
                    options.jump_step = 5
                if "-M" not in sys.argv:
                    options.mesh_size = 7
                if "--max-n-words" not in sys.argv:
                    options.maximum_n_words = 3E7
                options.larger_auto_ws = True
                options.auto_gradient_k = False
                if "--disentangle-time-limit" not in sys.argv:
                    options.disentangle_time_limit = 180
            if options.memory_save:
                if "-P" not in sys.argv:
                    options.pre_grouped = 0
                if "--remove-duplicates" not in sys.argv:
                    options.rm_duplicates = 0
            if options.memory_unlimited:
                if "-P" not in sys.argv:
                    options.pre_grouped = 1E7
                if "--remove-duplicates" not in sys.argv:
                    options.rm_duplicates = 2E8
                if "--min-quality-score" not in sys.argv:
                    options.min_quality_score = -5
                if "--max-ignore-percent" not in sys.argv:
                    options.maximum_ignore_percent = 0
            options.auto_word_size_step = abs(options.auto_word_size_step)
            if options.organelle_type in ("animal_mt", "fungus_mt"):
                log.warning("Currently GetOrganelle had been tested on limited animal & fungus samples!")
                log.warning("The default references for animal & fungus are for temporary usage!")
                log.warning("No guarantee for success rate in animal & fungus samples!\n")
                if options.organelle_type == "fungus_mt":
                    log.warning("To improve this, a customized close-related reference (-s reference.fasta) with its "
                                "protein coding & ribosomal genes extracted (--genes reference.genes.fasta) is "
                                "highly suggested for your own animal & fungus samples!\n")
            # --- organelle-type dependent defaults (only when not given explicitly) ---
            if "--max-reads" not in sys.argv:
                # bug fix: was '== ("plant_mt", "anonym")' — a string compared to a
                # tuple is always False, so the documented 7.5E7 default never applied
                if options.organelle_type in ("plant_mt", "anonym"):
                    options.maximum_n_reads *= 5
            if "--max-n-words" not in sys.argv:
                if options.organelle_type in ("plant_mt", "anonym"):
                    options.maximum_n_words *= 5
                elif options.organelle_type in ("plant_nr", "animal_mt", "fungus_mt"):
                    options.maximum_n_words /= 5
            if "--soft-max-words" not in sys.argv:
                if options.organelle_type in ("plant_mt", "anonym"):
                    options.soft_max_words *= 5
                elif options.organelle_type in ("plant_nr", "animal_mt", "fungus_mt"):
                    options.soft_max_words /= 5
            if "--target-genome-size" not in sys.argv:
                if options.organelle_type == "plant_mt":
                    options.target_genome_size *= 3
                elif options.organelle_type == "fungus_mt":
                    options.target_genome_size /= 2.
                elif options.organelle_type in ("plant_nr", "animal_mt"):
                    options.target_genome_size /= 10.
                elif options.organelle_type == "anonym":
                    # size the target as twice the total length of the customized gene set
                    ref_seqs = read_fasta(options.genes_fasta)[1]
                    options.target_genome_size = 2 * sum([len(this_seq) for this_seq in ref_seqs])
                    log.info("Setting '--target-genome-size " + str(options.target_genome_size) +
                             "' for estimating the word size value.")
            if options.organelle_type in ("fungus_mt", "anonym"):
                global MAX_RATIO_RL_WS
                MAX_RATIO_RL_WS = 0.8
            if not options.pre_reading and not options.word_size:
                log.error("When '--rough-pre-reading' was chosen, a user defined word size value ('-w') must be given!")
                exit()
            # --- external executables ---
            if options.utilize_mapping:
                # All three bowtie2 tools are required; the fallback/abort
                # logic is identical for each (deduplicated from three stanzas).
                for bt2_tool in ("bowtie2", "bowtie2-build", "bowtie2-build-l"):
                    if not executable(bt2_tool):
                        options.utilize_mapping = False
                        if options.seed_file:
                            log.warning(bt2_tool + ' not in the path! Take the seed file as initial seed.')
                        else:
                            log.error(bt2_tool + ' not in the path!')
                            exit()
                        if options.anti_seed:
                            log.warning(bt2_tool + ' not in the path! Take the anti-seed file as initial anti-seed.')
                        else:
                            log.warning(bt2_tool + ' not in the path! Anti-seed disabled!')
            if options.run_spades:
                if not executable("spades.py -h"):
                    log.warning("spades.py not found in the path. Only get the reads and skip assembly.")
                    options.run_spades = False
            if not executable("blastn"):
                log.warning("blastn not found in the path!")
            if options.organelle_type == "anonym" and not executable("makeblastdb"):
                log.warning("makeblastdb not found in the path!")
            # --- final normalization ---
            options.rm_duplicates = int(options.rm_duplicates)
            options.pre_grouped = int(options.pre_grouped)
            if not options.rm_duplicates and options.pre_grouped:
                log.warning("removing duplicates was inactive, so that the pre-grouping was disabled.")
                options.pre_grouped = False
            if options.min_rounds < 1:
                log.warning("illegal minimum rounds! Set to default: 1.")
                options.min_rounds = 1
            if options.max_rounds and options.max_rounds < 2:
                log.warning("illegal maximum rounds! Set to minimum: 2")
                options.max_rounds = 2
            return options, log, previous_attributes
def combination_res(all_choices_num, chosen_num):
    """Compute the binomial coefficient C(all_choices_num, chosen_num).

    Multiplies in numerator factors and divides out denominator factors
    incrementally, so the result is a float (or the int 1 when
    chosen_num <= 0, since the loop body never runs).
    """
    result = 1
    remaining = chosen_num
    while remaining > 0:
        # same operation order as an incremental n*(n-1)*.../k! expansion
        result = result * (all_choices_num - remaining + 1) / remaining
        remaining -= 1
    return result
def trans_word_cov(word_cov, base_cov, mean_base_error_rate, read_length):
    """Inflate a word coverage to compensate for words affected by base errors.

    Sums, over possible per-read error-site counts, the binomial probability of
    that many errors weighted by (1 - 2**-k), then raises word_cov accordingly.
    An empirical damping exponent (0.34) limits the increase once the word
    coverage approaches half of base_cov.
    """
    err_num_cap = int(min(read_length * mean_base_error_rate * 10, read_length))
    # expected fraction of words rendered wrong by sequencing errors
    wrong_words_percent = sum(
        (1 - 2 ** (-err_sites)) * (combination_res(read_length, err_sites)
                                   * (mean_base_error_rate ** err_sites)
                                   * ((1 - mean_base_error_rate) ** (read_length - err_sites)))
        for err_sites in range(1, err_num_cap))
    # if word size < read_len/2, wrong words percent decreases
    extra_cov = word_cov / (1 - wrong_words_percent) - word_cov
    half_base_cov = 0.5 * base_cov
    if word_cov > half_base_cov:
        return word_cov + extra_cov ** 0.34
    if word_cov + extra_cov > half_base_cov:
        return half_base_cov + (word_cov + extra_cov - half_base_cov) ** 0.34
    return word_cov + extra_cov
def estimate_word_size(base_cov_values, read_length, target_size, mean_error_rate=0.015, log=None,
                       max_discontinuous_prob=0.01, min_word_size=GLOBAL_MIN_WS, max_effective_word_cov=60,
                       wc_bc_ratio_constant=0.35):
    """Empirically estimate extension word sizes from base-coverage estimates.

    One word size is produced per value in base_cov_values (typically a
    low/mid/high triple), in the same order.

    :param base_cov_values: iterable of base-coverage estimates.
    :param read_length: mean read length.
    :param target_size: expected target genome size (bp), used to bound the
        probability of discontinuous word coverage.
    :param mean_error_rate: mean per-base sequencing error rate.
    :param log: optional logger; warnings fall back to sys.stdout when None.
    :param max_discontinuous_prob: maximum tolerated probability that some
        interval of the target is not covered by any word.
    :param min_word_size: lower bound on the returned word sizes.
    :param max_effective_word_cov: cap on the word coverage used for estimation.
    :param wc_bc_ratio_constant: empirical word-coverage/base-coverage ratio.
    :return: list of int word sizes, parallel to base_cov_values.
    """
    echo_problem = False
    all_word_sizes = []
    for base_cov in base_cov_values:
        # G: genome size, N: Number of reads from data, L: read length,
        # ## Poisson distribution
        # mean read cov = N/(G-L+1)
        # expected # reads starting within any specific interval of C consecutive nucleotides = (N/(G-L+1))*C
        # P(no read starts in the interval) = e^(-C*N/(G-L+1))
        # P(>=1 reads start in the interval) = 1-e^(-C*N/(G-L+1))
        # P(the interval is not continuous) = 1-(1-e^(-N/(G-L+1)))^C
        #
        # 1. The higher the base coverage is, the larger the word size should be. # to exclude unnecessary contigs.
        # 2. The longer the read length is, the larger the word size should be
        # 3. The higher the error rate is, the smaller the word size should be
        # empirical functions:
        word_cov = min(max_effective_word_cov, base_cov * wc_bc_ratio_constant)
        min_word_cov = 5
        # raise the minimum word coverage until the discontinuity probability
        # (Poisson model above) drops below max_discontinuous_prob
        while 1 - (1 - math.e ** (-min_word_cov)) ** target_size > max_discontinuous_prob:
            min_word_cov += 0.05
        # largest word-cov/base-cov ratio achievable given min_word_size
        wc_bc_ratio_max = 1 - (min_word_size - 1) / read_length
        if base_cov * wc_bc_ratio_max < min_word_cov:
            # data too shallow to reach the required word coverage: warn below
            min_word_cov = base_cov * wc_bc_ratio_max
            echo_problem = True
        word_cov = max(min_word_cov, word_cov)
        # compensate for words lost to sequencing errors
        word_cov = trans_word_cov(word_cov, base_cov, mean_error_rate / 2., read_length)
        # 1. relationship between kmer coverage and base coverage, k_cov = base_cov * (read_len - k_len + 1) / read_len
        estimated_word_size = int(read_length * (1 - word_cov / base_cov)) + 1
        estimated_word_size = min(int(read_length * MAX_RATIO_RL_WS), max(min_word_size, estimated_word_size))
        all_word_sizes.append(int(round(estimated_word_size, 0)))
    if echo_problem:
        if log:
            log.warning("Guessing that you are using too few data for assembly!")
            log.warning("GetOrganelle is still trying ...")
        else:
            sys.stdout.write("Guessing that you are using too few data for assembly!\n")
            sys.stdout.write("GetOrganelle is still trying ...\n")
    return all_word_sizes
def calculate_word_size_according_to_ratio(word_size_ratio, mean_read_len, log):
    """Translate a '-w' value given as a ratio (<1) or absolute size (>=1).

    :param word_size_ratio: ratio of mean read length (when < 1) or an
        absolute word size (when >= 1).
    :param mean_read_len: mean read length of the input data.
    :param log: logger used to report the resulting setting.
    :return: the final word size (int).
    """
    if word_size_ratio < 1:
        new_word_size = int(round(word_size_ratio * mean_read_len, 0))
        if new_word_size < GLOBAL_MIN_WS:
            # Bug fix: report the offending ratio itself; previously the
            # already-clamped word size was logged, making the message useless.
            log.warning("Too small ratio " + str(word_size_ratio) + ", setting '-w " + str(GLOBAL_MIN_WS) + "'")
            new_word_size = GLOBAL_MIN_WS
        else:
            log.info("Setting '-w " + str(new_word_size) + "'")
        return new_word_size
    else:
        # cap absolute word sizes at 90% of the mean read length
        max_ws = int(round(mean_read_len * 0.9))
        if word_size_ratio > max_ws:
            word_size_ratio = max_ws
            log.warning("Too large word size for mean read length " + str(mean_read_len) +
                        ", setting '-w " + str(word_size_ratio) + "'")
        return word_size_ratio
def check_parameters(word_size, out_base, utilize_mapping, maximum_n_reads, original_fq_files,
                     organelle_type, all_bases, auto_word_size_step, mean_error_rate,
                     target_genome_size, mean_read_len, max_read_len, low_quality_pattern,
                     log, wc_bc_ratio_constant=0.35, larger_auto_ws=False):
    """Estimate base coverage and settle the word size plus its search range.

    Coverage is taken from the bowtie2 mapping result when available, otherwise
    guessed from the total data amount; the word size is then either estimated
    automatically (word_size is None), interpreted as a ratio of the mean read
    length (< 1), or taken as-is.

    :return: (min_word_size, word_size, max_word_size, keep_seq_parts)
    """
    if utilize_mapping:
        coverage_info = get_coverage_from_sam(os.path.join(out_base, "seed_bowtie.sam"))
        # positions with coverage > 2 are treated as reliably mapped
        all_coverages = [coverage_info[ref][pos] for ref in coverage_info for pos in coverage_info[ref]
                         if coverage_info[ref][pos] > 2]
        if not all_coverages:
            # fall back to every mapped position, however shallow
            all_coverages = [coverage_info[ref][pos] for ref in coverage_info for pos in coverage_info[ref]]
        if not all_coverages:
            if log:
                log.error("No seed reads found!")
                log.error("Please check your raw data or change your reference!")
            exit()
        base_cov_values = get_cover_range(all_coverages, guessing_percent=0.07)  # top 0.07 from mapped reads
        log.info("Estimated " + organelle_type + " base-coverage = " + str(base_cov_values[1]))
        if base_cov_values[0] < 200 and organelle_type in {"animal_mt", "fungus_mt", "anonym"}:
            # low-coverage non-plastid targets: mapping-based coverage is noisy,
            # so re-estimate it by counting word (kmer) frequencies instead
            log.info("Re-estimating " + organelle_type + " base-coverage using word frequency counting ...")
            counting_word_size = min(49, 2 * int(mean_read_len * 0.35) - 1)
            smp_percent = min(1., max(0.2, 2E7/all_bases))  # at least 20M raw data should be used
            words_dict = chop_seqs_as_empty_dict(
                fq_seq_simple_generator(os.path.join(out_base, "Initial.mapped.fq")), word_size=counting_word_size)
            counting_words(fq_seq_simple_generator(original_fq_files,
                                                   max_n_reads=all_bases/mean_read_len * smp_percent),
                           words_initial_dict=words_dict, word_size=counting_word_size)
            all_word_coverages = []
            for this_word in list(words_dict):
                # each word may already have been popped as a reverse complement
                if this_word in words_dict:
                    reverse_word = complementary_seq(this_word)
                    if reverse_word == this_word:
                        all_word_coverages.append(words_dict.pop(this_word))
                    else:
                        all_word_coverages.append(words_dict.pop(this_word) + words_dict.pop(reverse_word))
            word_cov_values = get_cover_range(all_word_coverages, guessing_percent=0.5)  # mean from kmer counting
            # convert word coverage back to base coverage and undo the sampling
            base_cov_values = [round(this_word_cov * mean_read_len / (mean_read_len - counting_word_size + 1)
                                     / smp_percent, 2)
                               for this_word_cov in word_cov_values]
            log.info("Estimated " + organelle_type + " base-coverage = " + str(base_cov_values[1]))
    else:
        # no mapping: assume the organelle makes up ~5% of the raw data
        organelle_base_percent = 0.05
        guessing_base_cov = organelle_base_percent * all_bases / target_genome_size
        base_cov_values = [guessing_base_cov * 0.8, guessing_base_cov, guessing_base_cov * 1.2]
        log.info("Guessing " + organelle_type + " base-coverage = " + str(base_cov_values[1]))
    if word_size is None:
        # automatic estimation: one word size per coverage estimate (low/mid/high)
        if larger_auto_ws:
            min_word_size, word_size, max_word_size = estimate_word_size(
                base_cov_values=base_cov_values, read_length=mean_read_len, target_size=target_genome_size,
                max_discontinuous_prob=0.05, min_word_size=70,
                mean_error_rate=mean_error_rate, log=log, wc_bc_ratio_constant=wc_bc_ratio_constant - 0.03)
        else:
            # standard dev
            min_word_size, word_size, max_word_size = estimate_word_size(
                base_cov_values=base_cov_values, read_length=mean_read_len, target_size=target_genome_size,
                max_discontinuous_prob=0.01, min_word_size=GLOBAL_MIN_WS,
                mean_error_rate=mean_error_rate, log=log, wc_bc_ratio_constant=wc_bc_ratio_constant)
        log.info("Setting '-w " + str(word_size) + "'")
        log.info("The automatically-estimated word size does not ensure the best choice.")
        log.info("If the result graph is not a circular organelle genome, ")
        log.info("you could adjust the word size for another new run.")
    elif word_size < 1:
        # word size given as a ratio of the mean read length
        new_word_size = int(round(word_size * mean_read_len, 0))
        if new_word_size < GLOBAL_MIN_WS:
            word_size = GLOBAL_MIN_WS
            log.warning("Too small ratio " + str(word_size) + ", setting '-w " + str(GLOBAL_MIN_WS) + "'")
        else:
            word_size = new_word_size
            log.info("Setting '-w " + str(word_size) + "'")
        min_word_size = max(GLOBAL_MIN_WS, word_size - auto_word_size_step * 4)
        max_word_size = min(word_size + auto_word_size_step * 4, int(mean_read_len * MAX_RATIO_RL_WS))
    else:
        min_word_size = max(GLOBAL_MIN_WS, word_size - auto_word_size_step * 4)
        max_word_size = min(word_size + auto_word_size_step * 4, int(mean_read_len * MAX_RATIO_RL_WS))
    # arbitrarily adjust word size range
    if not auto_word_size_step:
        min_word_size = max_word_size = word_size
    else:
        max_word_size = max(int(mean_read_len * (1 - wc_bc_ratio_constant)), max_word_size)
    # keep split (quality-trimmed) read parts only when the word size is small
    # enough relative to the reads and low-quality trimming is active
    if float(word_size) / max_read_len <= 0.5 and len(low_quality_pattern) > 2:
        keep_seq_parts = True
    else:
        keep_seq_parts = False
    return min_word_size, word_size, max_word_size, keep_seq_parts
def check_kmers(kmer_str, auto_gradient_k, word_s, max_r_len, log):
    """Validate the SPAdes kmer list and optionally extend it around word_s.

    :param kmer_str: comma-separated kmer sizes; falsy disables kmer setting.
    :param auto_gradient_k: when True, insert extra x5-valued kmers so the
        gradient covers the tens digit of the chosen word size.
    :param word_s: the word size whose neighborhood should be covered.
    :param max_r_len: maximum read length (upper bound for legal kmers).
    :param log: logger used to report the final '-k' setting.
    :return: sorted, comma-joined kmer string, or None when kmer_str is empty.
    """
    if kmer_str:
        kmer_values = [int(kmer_v) for kmer_v in kmer_str.split(",")]
        if auto_gradient_k:
            # add kmer around estimated word_size
            k_tens_digits = [int(kmer_v / 10) for kmer_v in kmer_values]
            w_tens_digit = int(word_s / 10)
            if w_tens_digit not in k_tens_digits:
                avail_tens_digits = k_tens_digits + [w_tens_digit]
                # Bug fix: include the upper bound (max + 1) so that a word size
                # larger than all listed kmers still gets a kmer added next to
                # it; previously range() excluded the max tens digit entirely.
                for add_tens_digit in range(min(avail_tens_digits), max(avail_tens_digits) + 1):
                    if add_tens_digit not in k_tens_digits:
                        kmer_values.append(int(add_tens_digit * 10 + 5))
        # delete illegal kmers: keep sizes within [21, min(read_len, 127)]
        kmer_values = [kmer_v for kmer_v in kmer_values if 21 <= kmer_v <= min(max_r_len, 127)]
        spades_kmer = ",".join([str(kmer_v) for kmer_v in sorted(kmer_values)])
        log.info("Setting '-k " + spades_kmer + "'")
        return spades_kmer
    else:
        return None
# test whether an external binary is executable
def executable(test_this):
    """Return True when test_this is an executable path or runs as a command."""
    # Idiom fix: the former 'True if ... else False' wrapper was redundant.
    # os.access short-circuits the slower trial invocation via getstatusoutput.
    return os.access(test_this, os.X_OK) or getstatusoutput(test_this)[0] != DEAD_CODE
# psutil is optional: when importable, this_process enables memory-usage
# reporting for the current process; otherwise it stays None and memory
# reporting is silently disabled.
try:
    import psutil
except ImportError:
    this_process = None
else:
    this_process = psutil.Process(os.getpid())
def get_read_len_mean_max_count(fq_files, maximum_n_reads, sampling_percent=1.):
    """Scan fastq files and gather read-length statistics.

    :param fq_files: list of fastq file paths.
    :param maximum_n_reads: per-file cap on the number of reads scanned.
    :param sampling_percent: fraction of reads whose length is recorded
        (1. records every read).
    :return: (mean read length, max read length, total reads counted);
        lengths are measured after stripping terminal 'N's.
    """
    # record every read when not sampling, otherwise every int(1/percent)-th
    record_every = 1 if sampling_percent == 1 else int(1 / sampling_percent)
    read_lengths = []
    all_count = 0
    for fq_f in fq_files:
        count_r = 0
        for seq in fq_seq_simple_generator(fq_f):
            count_r += 1
            if count_r % record_every == 0:
                read_lengths.append(len(seq.strip("N")))
            if count_r >= maximum_n_reads:
                break
        all_count += count_r
    return sum(read_lengths)/len(read_lengths), max(read_lengths), all_count
def get_read_quality_info(fq_files, maximum_n_reads, min_quality_score, log,
                          maximum_ignore_percent=0.05, sampling_percent=0.1):
    """Detect the fastq quality encoding and derive trimming / error-rate info.

    Samples quality strings from the inputs, identifies the closest-matching
    Phred encoding, possibly lowers the requested minimum quality score so no
    more than maximum_ignore_percent of bases would be trimmed, and computes
    the mean base error rate.

    :param fq_files: list of fastq file paths.
    :param maximum_n_reads: per-file cap on reads inspected.
    :param min_quality_score: requested minimum quality score (Phred value).
    :param log: logger for progress and warnings.
    :param maximum_ignore_percent: largest fraction of bases allowed trimmed.
    :param sampling_percent: fraction of reads whose qualities are sampled.
    :return: (character class string of qualities to trim, mean error rate,
        list of booleans flagging files that exceeded maximum_n_reads)
    """
    sampling_percent = int(1 / sampling_percent)
    all_quality_chars_list = []
    record_fq_beyond_read_num_limit = []
    for fq_f in fq_files:
        count_r = 0
        record_fq_beyond_read_num_limit.append(False)
        # go_to_line=3 yields the quality line of each 4-line fastq record
        this_fq_generator = fq_seq_simple_generator(fq_f, go_to_line=3)
        for quality_str in this_fq_generator:
            if count_r % sampling_percent == 0:
                all_quality_chars_list.append(quality_str)
            count_r += 1
            if count_r >= maximum_n_reads:
                break
        # keep consuming the same generator: a remaining non-empty quality line
        # means this file holds more than maximum_n_reads reads
        for quality_str in this_fq_generator:
            if quality_str:
                log.info("Number of reads exceeded " + str(int(maximum_n_reads)) + " in " + os.path.basename(fq_f)
                         + ", only top " + str(int(maximum_n_reads))
                         + " reads are used in downstream analysis.")
                record_fq_beyond_read_num_limit[-1] = True
                break
    all_quality_chars = "".join(all_quality_chars_list)
    len_quality_chars_total = float(len(all_quality_chars))
    max_quality = max(all_quality_chars)
    min_quality = min(all_quality_chars)
    max_quality = ord(max_quality)
    min_quality = ord(min_quality)
    decision_making = []
    # candidate encodings: (name, char min, char max, score min, score max)
    for type_name, char_min, char_max, score_min, score_max in [("Sanger", 33, 73, 0, 40),
                                                                ("Solexa", 59, 104, -5, 40),
                                                                ("Illumina 1.3+", 64, 104, 0, 40),
                                                                ("Illumina 1.5+", 67, 105, 3, 41),
                                                                ("Illumina 1.8+", 33, 74, 0, 41)]:
        # squared distance of the observed char range to this encoding's range
        decision_making.append((type_name, char_min, char_max, score_min, score_max,
                                (max_quality - char_max) ** 2 + (min_quality - char_min) ** 2))
    # choose the encoding with the smallest deviation
    the_form, the_c_min, the_c_max, the_s_min, the_s_max, deviation = sorted(decision_making, key=lambda x: x[-1])[0]
    log.info("Identified quality encoding format = " + the_form)
    if max_quality > the_c_max:
        log.warning("Max quality score " + repr(chr(max_quality)) +
                    "(" + str(max_quality) + ":" + str(max_quality - the_c_min + the_s_min) +
                    ") in your fastq file exceeds the usual boundary " + str((the_c_min, the_c_max)))
    if min_quality < the_c_min:
        log.warning("Min quality score " + repr(chr(min_quality)) +
                    "(" + str(min_quality) + ":" + str(min_quality - the_c_min + the_s_min) +
                    ") in your fastq file is under the usual lower boundary " + str((the_c_min, the_c_max)))
    # increase --min-quality-score if ignoring too much data.
    low_quality_score = min(the_c_min, min_quality)
    ignore_percent = 0
    trimmed_quality_chars = []
    while low_quality_score < the_c_min + min_quality_score - the_s_min:
        ignore_this_time = all_quality_chars.count(chr(low_quality_score)) / len_quality_chars_total
        ignore_percent += ignore_this_time
        if ignore_percent > maximum_ignore_percent:
            # would discard too much data: stop raising the threshold here
            ignore_percent -= ignore_this_time
            break
        else:
            trimmed_quality_chars.append(chr(low_quality_score))
        low_quality_score += 1
    if low_quality_score < the_c_min + min_quality_score - the_s_min:
        log.info("Resetting '--min-quality-score " + str(low_quality_score + the_s_min - the_c_min) + "'")
    if trimmed_quality_chars:
        log.info("Trimming bases with qualities (" + "%.2f" % (ignore_percent * 100) + "%): " +
                 str(ord(min("".join(trimmed_quality_chars)))) + ".." + str(ord(max("".join(trimmed_quality_chars)))) +
                 " " + "".join(trimmed_quality_chars))
    trimmed_quality_chars = "".join(trimmed_quality_chars)
    # calculate mean error rate
    all_quality_char_dict = {in_quality_char: all_quality_chars.count(in_quality_char)
                             for in_quality_char in set(all_quality_chars)}
    # chose_error_prob_func (defined elsewhere) presumably maps the encoding
    # name to a per-character error-probability function -- confirm at its def
    error_prob_func = chose_error_prob_func[the_form]
    mean_error_rate = sum([error_prob_func(in_quality_char) * all_quality_char_dict[in_quality_char]
                           for in_quality_char in all_quality_char_dict]) / len_quality_chars_total
    log.info("Mean error rate = " + str(round(mean_error_rate, 4)))
    return "[" + trimmed_quality_chars + "]", mean_error_rate, record_fq_beyond_read_num_limit  # , post_trimming_mean
def write_fq_results(original_fq_files, accepted_contig_id, out_file_name, temp2_clusters_dir, fq_info_in_memory,
                     maximum_n_reads, verbose, index_in_memory, log, extra_accepted_lines=set()):
    """Copy the accepted reads out of the original fastq files.

    Collects the fastq line indices belonging to the accepted clusters (from
    memory or from the on-disk index), adds extra_accepted_lines, then writes
    each matching 4-line record to out_file_name_<i>.fq (one output per input).

    NOTE(review): extra_accepted_lines is a mutable default argument; it is
    only iterated here, but callers must never mutate the shared default.

    :param original_fq_files: input fastq paths.
    :param accepted_contig_id: set of accepted cluster indices.
    :param out_file_name: output path prefix ('_<i>.fq' is appended).
    :param temp2_clusters_dir: on-disk cluster index path (index_in_memory False).
    :param fq_info_in_memory: in-memory index; fq_info_in_memory[1] maps
        cluster id -> list of line indices (index_in_memory True).
    :param maximum_n_reads: per-file cap on reads scanned.
    :param verbose: emit extra progress logging.
    :param index_in_memory: whether the cluster index lives in memory.
    :param log: logger.
    :param extra_accepted_lines: extra line indices to always keep.
    """
    if verbose:
        sys.stdout.write(' ' * 100 + '\b' * 100)
        sys.stdout.flush()
        log.info("Producing output ...")
        log.info("reading indices ...")
    accepted_lines = []
    if index_in_memory:
        # read cluster indices
        for this_index in accepted_contig_id:
            accepted_lines += fq_info_in_memory[1][this_index]
        # produce the pair-end output
        accepted_lines = set(accepted_lines)
    else:
        # read cluster indices
        # NOTE(review): mode 'rU' was removed in Python 3.11; needs 'r' there
        temp2_indices_file_in = open(temp2_clusters_dir, 'rU')
        this_index = 0
        for line in temp2_indices_file_in:
            if this_index in accepted_contig_id:
                accepted_lines += [int(x) for x in line.strip().split('\t')]
            this_index += 1
        accepted_lines = set(accepted_lines)
    # add initial mapped read ids
    for line_id in extra_accepted_lines:
        accepted_lines.add(line_id)
    # write by line
    if verbose:
        log.info("writing fastq lines ...")
    post_reading = [open(fq_file, 'rU') for fq_file in original_fq_files]
    files_out = [open(out_file_name + '_' + str(i + 1) + '.temp', 'w') for i in range(len(original_fq_files))]
    line_count = 0
    for i in range(len(original_fq_files)):
        count_r = 0
        line = post_reading[i].readline()
        while line:
            count_r += 1
            # NOTE(review): accepted records advance line_count by 2 while
            # rejected ones advance it by 4 -- presumably mirrors how indices
            # are generated in make_read_index; confirm before touching this.
            if line_count in accepted_lines:
                # copy the whole 4-line fastq record
                files_out[i].write(line)
                for j in range(3):
                    files_out[i].write(post_reading[i].readline())
                line_count += 1
                line = post_reading[i].readline()
                line_count += 1
            else:
                # skip this record
                for j in range(4):
                    line = post_reading[i].readline()
                    line_count += 1
            if count_r >= maximum_n_reads:
                break
        files_out[i].close()
        post_reading[i].close()
    del accepted_lines
    for i in range(len(original_fq_files)):
        # finalize: rename .temp to .fq only after the file is complete
        os.rename(out_file_name + '_' + str(i + 1) + '.temp', out_file_name + '_' + str(i + 1) + '.fq')
    if verbose:
        log.info("writing fastq lines finished.")
def make_read_index(original_fq_files, direction_according_to_user_input, maximum_n_reads, rm_duplicates, output_base,
word_size, anti_lines, pre_grouped, index_in_memory, anti_seed, keep_seq_parts,
low_quality, echo_frequency, resume, log):
# read original reads
# line_cluster (list) ~ forward_reverse_reads
line_clusters = []
seq_duplicates = {}
forward_reverse_reads = []
line_count = 0
this_index = 0
do_split_low_quality = len(low_quality) > 2
#
name_to_line = {}
#
temp1_contig_dir = [os.path.join(output_base, k + 'temp.indices.1') for k in ("_", "")]
temp2_clusters_dir = [os.path.join(output_base, k + 'temp.indices.2') for k in ("_", "")]
cancel_seq_parts = True
if resume and os.path.exists(temp1_contig_dir[1]) and os.path.exists(temp2_clusters_dir[1]):
if pre_grouped or index_in_memory:
log.info("Reading existed indices for fastq ...")
#
if keep_seq_parts:
forward_reverse_reads = [x.strip().split("\t") for x in open(temp1_contig_dir[1], 'rU')]
cancel_seq_parts = True if max([len(x) for x in forward_reverse_reads]) == 1 else False