import os
import itertools
import subprocess
from dbApp.models import (symportal_framework, data_set, reference_sequence, data_set_sample_sequence,
                          analysis_type, analysis_group, data_set_sample, data_analysis, clade_collection,
                          clade_collection_type)
from multiprocessing import Queue, Process, Manager
from django import db
import pickle
import csv
import numpy as np
from collections import defaultdict
import shutil
import re
import json
import glob
from datetime import datetime
import sys
import pandas as pd
from output import div_output_pre_analysis_new_meta_and_new_dss_structure
from general import *
from distance import generate_within_clade_UniFrac_distances_samples, generate_within_clade_BrayCurtis_distances_samples
from plotting import generate_stacked_bar_data_submission, plot_between_sample_distance_scatter
def logQCErrorAndContinue(datasetsampleinstanceinq, samplename, errorreason):
print('Error in processing sample: {}'.format(samplename))
datasetsampleinstanceinq.finalUniqueSeqNum = 0
datasetsampleinstanceinq.finalTotSeqNum = 0
datasetsampleinstanceinq.initialProcessingComplete = True
datasetsampleinstanceinq.errorInProcessing = True
datasetsampleinstanceinq.errorReason = errorreason
datasetsampleinstanceinq.save()
return
def worker_initial_mothur(input_q, error_sample_list, wkd, dataSubID, debug):
    # We do all of the screening of the below-evalue-cutoff seqs inline with the submission,
    # rather than spitting the sequences out at the end and having to re-do submissions.
    # We will constantly update the symClade.fa and make a backup at each stage we update it.
    # This can simply be a time-stamped fasta.
    # This worker function is split into several stages; the first is the 'initial mothur' stage.
'''This worker performs the pre-MED processing'''
dataSubInQ = data_set.objects.get(id=dataSubID)
for contigPair in iter(input_q.get, 'STOP'):
sampleName = contigPair.split('\t')[0].replace('[dS]', '-')
dataSetSampleInstanceInQ = data_set_sample.objects.get(name=sampleName, dataSubmissionFrom=dataSubInQ)
        # Only process samples that have not already had this done.
        # This is handy in the event of a crash mid-processing.
        if not dataSetSampleInstanceInQ.initialProcessingComplete:
###### NB We will always crop with the SYMVAR primers as they produce the shortest product
primerFwdSeq = 'GAATTGCAGAACTCCGTGAACC' # Written 5'-->3'
primerRevSeq = 'CGGGTTCWCTTGTYTGACTTCATGC' # Written 5'-->3'
oligoFile = [
r'#SYM_VAR_5.8S2',
'forward\t{0}'.format(primerFwdSeq),
r'#SYM_VAR_REV',
'reverse\t{0}'.format(primerRevSeq)
]
# Initial Mothur QC, making contigs, screening for ambiguous calls and homopolymers
# Uniqueing, discarding <2 abundance seqs, removing primers and adapters
sys.stdout.write('{0}: QC started\n'.format(sampleName))
currentDir = r'{0}/{1}/'.format(wkd, sampleName)
# Make the sample by sample directory that we will be working in
# this will be inside the wkd directory (the temp data directory for the data_set submission)
os.makedirs(currentDir, exist_ok=True)
# We also need to make the same sample by sample directories for the pre MED sequence dump
os.makedirs(currentDir.replace('tempData', 'pre_MED_seqs'), exist_ok=True)
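            # NB an illustrative stability.files line (tab-separated; the actual paths
            # come from the contigPair that this worker receives on the input queue):
            # sampleName<TAB>/path/to/sample_R1.fastq.gz<TAB>/path/to/sample_R2.fastq.gz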
stabilityFile = [contigPair]
stabilityFileName = r'{0}{1}'.format(sampleName, 'stability.files')
rootName = r'{0}stability'.format(sampleName)
stabilityFilePath = r'{0}{1}'.format(currentDir, stabilityFileName)
# write out the stability file. This will be a single pair of contigs with a sample name
writeListToDestination(stabilityFilePath, stabilityFile)
# Write oligos file to directory. This file contains the primer sequences used for PCR cropping
writeListToDestination('{0}{1}'.format(currentDir, 'primers.oligos'), oligoFile)
            # NB mothur behaves strangely with the python subprocess command: for some
            # reason it prepends an extra 'mothur' to the filename in the input directory.
            # As such we have to pass all file paths as absolute paths.
            # The mothur batch file that will be run by mothur.
mBatchFile = [
r'set.dir(input={0})'.format(currentDir),
r'set.dir(output={0})'.format(currentDir),
r'make.contigs(file={}{})'.format(currentDir, stabilityFileName),
r'summary.seqs(fasta={}{}.trim.contigs.fasta)'.format(currentDir, rootName),
r'screen.seqs(fasta={0}{1}.trim.contigs.fasta, group={0}{1}.contigs.groups, maxambig=0, maxhomop=5)'.format(
currentDir, rootName),
r'summary.seqs(fasta={0}{1}.trim.contigs.good.fasta)'.format(currentDir, rootName),
r'unique.seqs(fasta={0}{1}.trim.contigs.good.fasta)'.format(currentDir, rootName),
r'summary.seqs(fasta={0}{1}.trim.contigs.good.unique.fasta, name={0}{1}.trim.contigs.good.names)'.format(
currentDir, rootName),
r'split.abund(cutoff=2, fasta={0}{1}.trim.contigs.good.unique.fasta, name={0}{1}.trim.contigs.good.names, group={0}{1}.contigs.good.groups)'.format(
currentDir, rootName),
r'summary.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.fasta, name={0}{1}.trim.contigs.good.abund.names)'.format(
currentDir, rootName),
r'summary.seqs(fasta={0}{1}.trim.contigs.good.unique.rare.fasta, name={0}{1}.trim.contigs.good.rare.names)'.format(
currentDir, rootName),
r'pcr.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.fasta, group={0}{1}.contigs.good.abund.groups, name={0}{1}.trim.contigs.good.abund.names, oligos={0}primers.oligos, pdiffs=2, rdiffs=2)'.format(
currentDir, rootName)
]
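            # For reference, the batch above runs the standard pre-MED QC chain:
            # make.contigs -> screen.seqs (no ambiguous bases, homopolymers <= 5) ->
            # unique.seqs -> split.abund (separate low-abundance seqs, cutoff=2) ->
            # pcr.seqs (primer cropping, allowing 2 mismatches per primer)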
# Write out the batch file
mBatchFilePath = r'{0}{1}{2}'.format(currentDir, 'mBatchFile', sampleName)
writeListToDestination(mBatchFilePath, mBatchFile)
error = False
            # NB the mothur return code doesn't seem to work; we just get NoneType.
            # Apparently this is fixed in the newest mothur, but we have not upgraded to that yet,
            # so for the time being we check for errors by hand in the stdout.
with subprocess.Popen(['mothur', '{0}'.format(mBatchFilePath)], stdout=subprocess.PIPE, bufsize=1,
universal_newlines=True) as p:
                # Here we look for the specific blank fasta name warning (which should be interpreted
                # as an error) and any other error that may arise.
                # If found, log the error.
for line in p.stdout:
if debug:
print(line)
if '[WARNING]: Blank fasta name, ignoring read.' in line:
p.terminate()
errorReason = 'Blank fasta name'
logQCErrorAndContinue(dataSetSampleInstanceInQ, sampleName, errorReason)
error = True
error_sample_list.append(sampleName)
break
if 'ERROR' in line:
p.terminate()
                        errorReason = 'error in initial QC'
logQCErrorAndContinue(dataSetSampleInstanceInQ, sampleName, errorReason)
error = True
error_sample_list.append(sampleName)
break
if error:
continue
            # Here we check whether the reads are reverse complemented by running pcr.seqs and
            # inspecting the results.
            # Check to see if there are sequences in the PCR output file
lastSummary = readDefinedFileToList(
'{}{}.trim.contigs.good.unique.abund.pcr.fasta'.format(currentDir, rootName))
            # If this file is empty,
            # then these sequences may well be reverse complement, so we need to try reversing them first
if len(lastSummary) == 0:
# RC batch file
mBatchRev = [
r'reverse.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.fasta)'.format(currentDir, rootName),
r'pcr.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.rc.fasta, group={0}{1}.contigs.good.abund.groups, name={0}{1}.trim.contigs.good.abund.names, oligos={0}primers.oligos, pdiffs=2, rdiffs=2)'.format(
currentDir, rootName)
]
mBatchFilePath = r'{0}{1}{2}'.format(currentDir, 'mBatchFile', sampleName)
# write out RC batch file
writeListToDestination(mBatchFilePath, mBatchRev)
if not debug:
completedProcess = subprocess.run(
['mothur', r'{0}'.format(mBatchFilePath)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                # At this point the sequences will be reversed and renamed, so we
                # can simply rename the .rc file to the original .fasta name that we input with.
                # This way we don't need to change the rest of the mothur pipe.
subprocess.run(
[r'mv', r'{0}{1}.trim.contigs.good.unique.abund.rc.pcr.fasta'.format(currentDir, rootName),
r'{0}{1}.trim.contigs.good.unique.abund.pcr.fasta'.format(currentDir, rootName)],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
elif debug:
completedProcess = subprocess.run(
['mothur', r'{0}'.format(mBatchFilePath)])
subprocess.run(
[r'mv', r'{0}{1}.trim.contigs.good.unique.abund.rc.pcr.fasta'.format(currentDir, rootName),
r'{0}{1}.trim.contigs.good.unique.abund.pcr.fasta'.format(currentDir, rootName)])
# Check again to see if the RC has fixed the problem of having an empty fasta
# If this file is still empty, then the problem was not solved by reverse complementing
lastSummary = readDefinedFileToList(
'{}{}.trim.contigs.good.unique.abund.pcr.fasta'.format(currentDir, rootName))
if len(lastSummary) == 0:
                    errorReason = 'error in initial QC'
logQCErrorAndContinue(dataSetSampleInstanceInQ, sampleName, errorReason)
error_sample_list.append(sampleName)
continue
            # After having completed the RC checks, redo the unique.seqs step.
mBatchFileContinued = [
r'summary.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.pcr.fasta, name={0}{1}.trim.contigs.good.abund.pcr.names)'.format(
currentDir, rootName),
r'unique.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.pcr.fasta, name={0}{1}.trim.contigs.good.abund.pcr.names)'.format(
currentDir, rootName),
r'summary.seqs(fasta={0}{1}.trim.contigs.good.unique.abund.pcr.unique.fasta, name={0}{1}.trim.contigs.good.unique.abund.pcr.names)'.format(
currentDir, rootName)
]
mBatchFilePath = r'{0}{1}{2}'.format(currentDir, 'mBatchFile', sampleName)
writeListToDestination(mBatchFilePath, mBatchFileContinued)
completedProcess = subprocess.run(
['mothur', r'{0}'.format(mBatchFilePath)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if completedProcess.returncode == 1 or 'ERROR' in completedProcess.stdout.decode('utf-8'):
if debug:
print(completedProcess.stdout.decode('utf-8'))
                errorReason = 'error in initial QC'
logQCErrorAndContinue(dataSetSampleInstanceInQ, sampleName, errorReason)
error_sample_list.append(sampleName)
continue
# Check to see if there are sequences in the PCR output file
try:
lastSummary = readDefinedFileToList(
'{}{}.trim.contigs.good.unique.abund.pcr.unique.fasta'.format(currentDir, rootName))
if len(lastSummary) == 0: # If this file is empty
                    errorReason = 'error in initial QC'
logQCErrorAndContinue(dataSetSampleInstanceInQ, sampleName, errorReason)
error_sample_list.append(sampleName)
continue
            except:  # If there is no file then we can assume the sample has a problem
                logQCErrorAndContinue(dataSetSampleInstanceInQ, sampleName, 'error in initial QC')
                continue
# Get number of sequences after make.contig
lastSummary = readDefinedFileToList('{}{}.trim.contigs.summary'.format(currentDir, rootName))
number_of_seqs_contig_absolute = len(lastSummary) - 1
dataSetSampleInstanceInQ.initialTotSeqNum = number_of_seqs_contig_absolute
sys.stdout.write('{}: dataSetSampleInstanceInQ.initialTotSeqNum = {}\n'.format(sampleName,
number_of_seqs_contig_absolute))
# Get number of sequences after unique
lastSummary = readDefinedFileToList(
'{}{}.trim.contigs.good.unique.abund.pcr.unique.summary'.format(currentDir, rootName))
number_of_seqs_contig_unique = len(lastSummary) - 1
dataSetSampleInstanceInQ.initialUniqueSeqNum = number_of_seqs_contig_unique
sys.stdout.write('{}: dataSetSampleInstanceInQ.initialUniqueSeqNum = {}\n'.format(sampleName,
number_of_seqs_contig_unique))
            # Get absolute number of sequences after sequence QC
last_summary = readDefinedFileToList(
'{}{}.trim.contigs.good.unique.abund.pcr.unique.summary'.format(currentDir, rootName))
absolute_count = 0
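            # Column 7 (index 6) of mothur's summary output should be the numSeqs count for each
            # unique sequence (taken from the name file), so summing it gives the absolute total.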
for line in last_summary[1:]:
absolute_count += int(line.split('\t')[6])
dataSetSampleInstanceInQ.post_seq_qc_absolute_num_seqs = absolute_count
dataSetSampleInstanceInQ.save()
sys.stdout.write('{}: dataSetSampleInstanceInQ.post_seq_qc_absolute_num_seqs = {}\n'.format(sampleName,
absolute_count))
sys.stdout.write('{}: Initial mothur complete\n'.format(sampleName))
# Each sampleDataDir should contain a set of .fasta, .name and .group files that we can use to do local blasts with
return
def perform_MED(wkd, ID, numProc, debug):
# Create mothur batch for each .fasta .name pair to be deuniqued
# Put in directory list, run via multiprocessing
samplesCollection = data_set_sample.objects.filter(dataSubmissionFrom=data_set.objects.get(id=ID))
mBatchFilePathList = []
for dataSetSampleInstance in samplesCollection: # For each samples directory
sampleName = dataSetSampleInstance.name
fullPath = '{}/{}'.format(wkd, sampleName)
        # http://stackoverflow.com/questions/3207219/how-to-list-all-files-of-a-directory
listOfDirs = []
for (dirpath, dirnames, filenames) in os.walk(fullPath):
listOfDirs.extend(dirnames)
break
        for directory in listOfDirs:  # for each cladal directory
fastaFilePath = ''
nameFilePath = ''
pathToDir = '{0}/{1}'.format(fullPath, directory)
cladeName = directory
# For each of the files in each of the Cladal directories
listOfFiles = []
for (dirpath, dirnames, filenames) in os.walk(pathToDir):
listOfFiles.extend(filenames)
break
for files in listOfFiles:
if '.fasta' in files and '.redundant' not in files:
fastaFilePath = '{0}/{1}'.format(pathToDir, files)
elif '.names' in files:
nameFilePath = '{0}/{1}'.format(pathToDir, files)
# Build a quick mBatchFile
mBatchFile = [
r'set.dir(input={0}/)'.format(pathToDir),
r'set.dir(output={0}/)'.format(pathToDir),
r'deunique.seqs(fasta={0}, name={1})'.format(fastaFilePath, nameFilePath)
]
mBatchFilePath = '{0}/{1}'.format(pathToDir, '{0}.{1}.{2}'.format(sampleName, cladeName, 'mBatchFile'))
writeListToDestination(mBatchFilePath, mBatchFile)
mBatchFilePathList.append(mBatchFilePath)
# Create the queues that will hold the mBatchFile paths
taskQueue = Queue()
doneQueue = Queue()
for mBatchFilePath in mBatchFilePathList:
taskQueue.put(mBatchFilePath)
for n in range(numProc):
taskQueue.put('STOP')
allProcesses = []
# http://stackoverflow.com/questions/8242837/django-multiprocessing-and-database-connections
for n in range(numProc):
p = Process(target=deuniqueWorker, args=(taskQueue, doneQueue, debug))
allProcesses.append(p)
p.start()
# Collect the list of deuniqued directories to use for MED analyses
listOfDeuniquedFastaPaths = []
for i in range(len(mBatchFilePathList)):
listOfDeuniquedFastaPaths.append(doneQueue.get())
for p in allProcesses:
p.join()
return listOfDeuniquedFastaPaths
def deuniqueWorker(input, output, debug):
    # This works through a list of paths to mothur batch files to be deuniqued.
    # At each of these locations, once the modified deuniqued file has been written, we can perform
    # the MED analysis on the file in each of the directories.
    # We also want to read in the results of the MED, but we will not be able to do that in
    # multiprocessing, so we save the list of directories and go through them one by one afterwards
    # to create the sequences
for mBatchFilePath in iter(input.get, 'STOP'):
cwd = os.path.dirname(mBatchFilePath)
sampleName = cwd.split('/')[-2]
sys.stdout.write('{}: deuniqueing QCed seqs\n'.format(sampleName))
        # Run the deunique
if not debug:
completedProcess = subprocess.run(['mothur', r'{0}'.format(mBatchFilePath)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
elif debug:
subprocess.run(['mothur', r'{0}'.format(mBatchFilePath)])
# Modify the deuniqued fasta to append sample name to all of the sequences
# Get list of files in directory
deuniquedFasta = []
        # Replace '_' in the name, as MED uses text up to the first underscore as the sample name.
        # This shouldn't be necessary here:
        # sampleName = sampleName.replace('_', '-')
listOfFiles = []
for (dirpath, dirnames, filenames) in os.walk(cwd):
listOfFiles.extend(filenames)
break
pathToFile = None
for file in listOfFiles:
if '.redundant' in file: # Then this is the deuniqued fasta
pathToFile = '{0}/{1}'.format(cwd, file)
break
deuniquedFasta = readDefinedFileToList(pathToFile)
deuniquedFasta = ['{0}{1}_{2}'.format(a[0],sampleName,a[1:].replace('_','-')) if a[0] == '>' else a for a in deuniquedFasta]
#write the modified deuniquedFasta to list
writeListToDestination(pathToFile, deuniquedFasta)
if debug:
if deuniquedFasta:
if len(deuniquedFasta) < 100:
                    print('WARNING the deuniqued fasta for {} is only {} lines'.format(sampleName, len(deuniquedFasta)))
else:
print('deuniqued fasta for {} is empty'.format(sampleName))
# Put the path to the deuniqued fasta into the output list for use in MED analyses
output.put('{}/{}/'.format(os.path.dirname(pathToFile), 'MEDOUT'))
# The fasta that we want to pad and MED is the 'file'
sys.stdout.write('{}: padding alignment\n'.format(sampleName))
path_to_med_padding = os.path.join(os.path.dirname(__file__), 'lib/med_decompose/o_pad_with_gaps.py')
subprocess.run([path_to_med_padding, r'{}'.format(pathToFile)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# Now run MED
listOfFiles = []
for (dirpath, dirnames, filenames) in os.walk(cwd):
listOfFiles.extend(filenames)
break
for file in listOfFiles:
if 'PADDED' in file:
pathToFile = '{0}/{1}'.format(cwd, file)
break
MEDOutDir = '{}/{}/'.format(cwd, 'MEDOUT')
os.makedirs(MEDOutDir, exist_ok=True)
sys.stdout.write('{}: running MED\n'.format(sampleName))
        # Here we need to make sure that the M value is defined dynamically.
        # The M value is a cutoff on the abundance of the most abundant unique sequence in a node:
        # if that abundance is lower than M then the node is discarded.
        # We have recently been working with an M that equates to 0.4%, i.e. 0.004. This was
        # calculated when working with a modelling project where we were subsampling to 1000 sequences;
        # in that scenario M was set to 4.
        # We should also take care that M doesn't go below 4, so we take a max when choosing M.
M_value = max(4, int(0.004 * (len(deuniquedFasta)/2)))
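        # Worked example: a deuniqued fasta of 10,000 lines holds 5,000 sequences
        # (one def line plus one sequence line each), so M = max(4, int(0.004 * 5000)) = 20.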
path_to_med_decompose = os.path.join(os.path.dirname(__file__), 'lib/med_decompose/decompose.py')
if not debug:
subprocess.run(
[path_to_med_decompose, '-M', str(M_value), '--skip-gexf-files', '--skip-gen-figures', '--skip-gen-html', '--skip-check-input', '-o',
MEDOutDir, pathToFile], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
elif debug:
subprocess.run(
[path_to_med_decompose, '-M', str(M_value), '--skip-gexf-files', '--skip-gen-figures', '--skip-gen-html',
'--skip-check-input', '-o',
MEDOutDir, pathToFile])
sys.stdout.write('{}: MED complete\n'.format(sampleName))
def checkIfSeqInQHadRefSeqMatch(seqInQ, nodeName, refSeqIdDict, nodeToRefDict, refSeqIDNameDict):
# seqInQ = the MED node sequence in question
# refSeqIdDict = dictionary of all current ref_sequences sequences (KEY) to their ID (VALUE).
# We use this to look to see if there is an equivalent refSeq Sequence for the sequence in question
    # This takes into account whether the seqInQ could be a subset or superset of one of the
    # refSeq.sequences.
    # Will return False if no refSeq match is found.
    # First check to see if the exact seq is found
if seqInQ in refSeqIdDict: # Found actual seq in dict
# assign the MED node name to the reference_sequence ID that it matches
nodeToRefDict[nodeName] = refSeqIdDict[seqInQ]
sys.stdout.write('\rAssigning MED node {} to existing reference sequence {}'.format(nodeName,
refSeqIDNameDict[refSeqIdDict[seqInQ]]))
return True
elif 'A' + seqInQ in refSeqIdDict: # This was a seq shorter than refseq but we can associate it to this ref seq
# assign the MED node name to the reference_sequence ID that it matches
nodeToRefDict[nodeName] = refSeqIdDict['A' + seqInQ]
sys.stdout.write('\rAssigning MED node {} to existing reference sequence {}'.format(nodeName, refSeqIDNameDict[
refSeqIdDict['A' + seqInQ]]))
return True
    else:  # This checks whether the seq in question is found within the sequence of a reference_sequence,
        # or whether the seq in question is bigger than a refSeq sequence and is a superset of it.
        # In either of these cases we should consider this a match and use the refSeq matched to.
        # This might be very computationally expensive but let's give it a go
for ref_seq_key in refSeqIdDict.keys():
if seqInQ in ref_seq_key or ref_seq_key in seqInQ:
# Then this is a match
nodeToRefDict[nodeName] = refSeqIdDict[ref_seq_key]
sys.stdout.write('\rAssigning MED node {} to existing reference sequence {}'.format(
nodeName, refSeqIDNameDict[refSeqIdDict[ref_seq_key]]))
return True
return False
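# A minimal sketch of the subset/superset matching above, using hypothetical
# sequences, IDs and node names (not part of the pipeline itself):
#
#   ref_seq_id_dict = {'AATTGGCC': 1}     # reference_sequence.sequence -> id
#   ref_seq_id_name_dict = {1: 'C3'}      # reference_sequence.id -> name
#   node_to_ref = {}
#   # 'ATTGGC' is a substring of 'AATTGGCC', so the node is matched to refSeq 1
#   # and node_to_ref becomes {'node000001': 1}:
#   checkIfSeqInQHadRefSeqMatch(seqInQ='ATTGGC', nodeName='node000001',
#                               refSeqIdDict=ref_seq_id_dict,
#                               nodeToRefDict=node_to_ref,
#                               refSeqIDNameDict=ref_seq_id_name_dict)  # -> True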
def collateFilesForMed(listofdeuniquedfastapaths, wkd):
# To avoid a memory crash we append each deuniqued fasta directly to the master fasta on disk
# This way we don't hold the master fasta in memory as a list
cladeList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
# wkd = '/'.join(listofdeuniquedfastapaths[0].split('/')[:5])
for clade in cladeList:
createNewFile('{}/deuniquedFastaForMED/redundant.{}.fasta'.format(wkd, clade))
# list will hold all of the fastas
#cladeSepFastaList = [[] for a in cladeList]
for deUFastaPath in listofdeuniquedfastapaths:
deuniquedFasta = readDefinedFileToList(deUFastaPath)
clade = deUFastaPath.split('/')[-2]
writeLinesToFile('{}/deuniquedFastaForMED/redundant.{}.fasta'.format(wkd, clade), deuniquedFasta)
# Count number of seqs in each file
# If empty, then delete
for clade in cladeList:
fastaPath = '{}/deuniquedFastaForMED/redundant.{}.fasta'.format(wkd, clade)
if checkIfFileEmpty(fastaPath): # Delete the MED input clade files that are empty
os.remove(fastaPath)
return
def create_data_set_sample_sequences_from_MED_nodes(wkd, ID, MEDDirs, debug):
''' Here we have modified the original method processMEDDataDirectCCDefinition'''
    # We go to each of the MEDDirs, which represent the clades within samples
    # that have had MED analyses run in them, and we use the below code to populate sequences to
    # the CCs and samples.
    # In the checkIfSeqInQHadRefSeqMatch method we currently do lots of database look-ups to
    # get the names of reference_sequences.
    # This is likely quite expensive, so it is easier to make a dict for this purpose, which is
    # reference_sequence.id (KEY) to reference_sequence.name (VALUE)
reference_sequence_ID_to_name_dict = {refSeq.id: refSeq.name for refSeq in reference_sequence.objects.all()}
# This is a dict of key = reference_sequence.sequence value = reference_sequence.id for all refseqs
# currently held in the database
    # We will use this to see if the sequence in question has a match, or is found in (this is key,
    # as some of the seqs are one bp smaller than the reference seqs) the reference sequences
reference_sequence_sequence_to_ID_dict = {refSeq.sequence: refSeq.id for refSeq in reference_sequence.objects.all()}
cladeList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
for dir in MEDDirs: # For each of the directories where we did MED
os.chdir(dir)
# Get the sample
sampleName = dir.split('/')[-4]
# Get the clade
clade = dir.split('/')[-3]
sys.stdout.write('\n\nPopulating {} with clade {} sequences\n'.format(sampleName, clade))
# Read in the node file
try:
nodeFile = readDefinedFileToList('NODE-REPRESENTATIVES.fasta')
except:
# if no node file found move on to the next directory
if debug:
print('Could not locate NODE-REP file for {}'.format(dir))
continue
        # MED pads its node representatives with gap characters; strip the gaps before matching
        nodeFile = [line.replace('-', '') if line[0] != '>' else line for line in nodeFile]
# Create nodeToRefDict that will be populated
nodeToRefDict = {}
        ############ ASSOCIATE MED NODES TO EXISTING REFSEQS OR CREATE NEW REFSEQS #########
# Look up seq of each of the MED nodes with reference_sequence table
# See if the seq in Q matches a reference_sequence, if so, associate
if debug:
if len(nodeFile) < 10:
print('WARNING node file for {} is only {} lines'.format(dir, len(nodeFile)))
listOfRefSeqs = []
for i in range(len(nodeFile)):
            # We were having a problem here where some of the seqs were 1 bp shorter than the reference
            # seqs. As such they weren't matching the reference_sequence object (e.g. C3), but when we do
            # the blast they come up with C3 as their closest and perfect match.
            # To fix this we run checkIfSeqInQHadRefSeqMatch
if nodeFile[i][0] == '>': # Then this is a def line
sequenceInQ = nodeFile[i + 1]
nodeNameInQ = nodeFile[i][1:].split('|')[0]
# If True then node name will already have been associated to nodeToRefDict
# and no need to do anything else
found = checkIfSeqInQHadRefSeqMatch(seqInQ=sequenceInQ,
refSeqIdDict=reference_sequence_sequence_to_ID_dict,
nodeName=nodeNameInQ,
nodeToRefDict=nodeToRefDict,
refSeqIDNameDict=reference_sequence_ID_to_name_dict)
                if not found:
# If there is no current match for the MED node in our current reference_sequences
# create a new reference_sequence object and add this to the refSeqDict
# Then assign the MED node to this new reference_sequence using the nodeToRefDict
newreferenceSequence = reference_sequence(clade=clade, sequence=sequenceInQ)
newreferenceSequence.save()
newreferenceSequence.name = str(newreferenceSequence.id)
newreferenceSequence.save()
listOfRefSeqs.append(newreferenceSequence)
reference_sequence_sequence_to_ID_dict[newreferenceSequence.sequence] = newreferenceSequence.id
nodeToRefDict[nodeNameInQ] = newreferenceSequence.id
reference_sequence_ID_to_name_dict[newreferenceSequence.id] = newreferenceSequence.name
sys.stdout.write('\rAssigning MED node {} to new reference sequence {}'.format(nodeFile[i][1:].split('|')[0],
newreferenceSequence.name))
########################################################################################
        # Here we have a refSeq associated to each of the seqs found, and we can now create
        # dataSetSampleSequences that have associated referenceSequences.
# So at this point we have a reference_sequence associated with each of the nodes
# Now it is time to define clade collections
# Open the MED node count table as list of lists
countArray = []
nodes = []
samples = []
# this creates countArray which is a 2D list
with open('MATRIX-COUNT.txt') as f:
reader = csv.reader(f, delimiter='\t')
countArray = list(reader)
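        # Illustrative MATRIX-COUNT.txt layout (tab-separated; hypothetical node names and counts):
        #   samples     node000001    node000002
        #   sample_1    120           34
        # First row = node names, first column = sample names, remainder = the raw counts.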
# get Nodes from first list
nodes = countArray[0][1:]
        # del the header row (node names) now that we have stored it
        del countArray[0]
        # get the sample names from the first item of each row,
        # then del them to leave the raw numerical data
for i in range(len(countArray)):
samples.append(countArray[i][0])
del countArray[i][0]
# convert to np array
countArray = np.array(countArray)
        countArray = countArray.astype(int)
# for each node in each sample create data_set_sample_sequence with foreign key to referenceSeq and data_set_sample
# give it a foreign key to the reference Seq by looking up the seq in the dictionary made earlier and using the value to search for the referenceSeq
for i in range(len(samples)): # For each sample # There should only be one sample
data_set_sample_object = data_set_sample.objects.get(dataSubmissionFrom=data_set.objects.get(id=ID),
name=samples[i])
# Add the metadata to the data_set_sample
data_set_sample_object.post_med_absolute += sum(countArray[i])
data_set_sample_object.post_med_unique += len(countArray[i])
data_set_sample_object.save()
cladalSeqAbundanceCounter = [int(a) for a in json.loads(data_set_sample_object.cladalSeqTotals)]
# This is where we need to tackle the issue of making sure we keep track of sequences in samples that
# were not above the 200 threshold to be made into cladeCollections
# We will simply add a list to the sampleObject that will be a sequence total for each of the clades
# in order of cladeList
# Here we modify the cladalSeqTotals string of the sample object to add the sequence totals
# for the given clade
cladeIndex = cladeList.index(clade)
tempInt = cladalSeqAbundanceCounter[cladeIndex]
tempInt += sum(countArray[i])
cladalSeqAbundanceCounter[cladeIndex] = tempInt
data_set_sample_object.cladalSeqTotals = json.dumps([str(a) for a in cladalSeqAbundanceCounter])
data_set_sample_object.save()
dssList = []
if sum(countArray[i]) > 200:
                sys.stdout.write('\n{} clade {} sequences in {}. Creating clade_collection object\n'.format(
                    sum(countArray[i]), clade, sampleName))
newCC = clade_collection(clade=clade, dataSetSampleFrom=data_set_sample_object)
newCC.save()
else:
sys.stdout.write(
'\n{} clade {} sequences in {}. Insufficient sequence to create a clade_collection object\n'.format(
sum(countArray[i]), clade, sampleName))
            # One issue to address here: now that we have thorough checks to associate very similar
            # sequences (with indels near the primers) to the same reference seq, multiple sequences
            # within the same sample can have the same referenceSeq.
            # Because we will in effect use the sequence of the reference seq rather than the dsss seq,
            # we should consolidate all dsss seqs with the same reference seq.
            # So we create a counter that keeps track of the cumulative abundance associated with each
            # reference_sequence and then create a dsss for each refSeq from this.
refSeqAbundanceCounter = defaultdict(int)
for j in range(len(nodes)):
abundance = countArray[i][j]
if abundance > 0:
refSeqAbundanceCounter[reference_sequence.objects.get(id=nodeToRefDict[nodes[j]])] += abundance
            # If the clade total is > 200, we associated a CC to the data_set_sample above; else we didn't.
            # Irrespective, associate data_set_sample_sequences to the data_set_sample
sys.stdout.write(
'\nAssociating clade {} data_set_sample_sequences directly to data_set_sample {}\n'.format(clade, sampleName))
if sum(countArray[i]) > 200:
for refSeq in refSeqAbundanceCounter.keys():
dss = data_set_sample_sequence(referenceSequenceOf=refSeq,
cladeCollectionTwoFoundIn=newCC,
abundance=refSeqAbundanceCounter[refSeq],
data_set_sample_from=data_set_sample_object)
dssList.append(dss)
# Save all of the newly created dss
data_set_sample_sequence.objects.bulk_create(dssList)
                # Get the IDs of the reference sequences, join them into a comma-separated string and
                # store it as cc.footPrint. This way we can quickly get the footprint of the CC.
                # Sadly we can't get the dss IDs from the bulk_create list, so rather than re-querying
                # we add the ID of each refSeq in the refSeqAbundanceCounter.keys() list
newCC.footPrint = ','.join([str(refSeq.id) for refSeq in refSeqAbundanceCounter.keys()])
newCC.save()
else:
for refSeq in refSeqAbundanceCounter.keys():
dss = data_set_sample_sequence(referenceSequenceOf=refSeq,
abundance=refSeqAbundanceCounter[refSeq],
data_set_sample_from=data_set_sample_object)
dssList.append(dss)
# Save all of the newly created dss
data_set_sample_sequence.objects.bulk_create(dssList)
return
def main(pathToInputFile, dSID, numProc, screen_sub_evalue=False,
full_path_to_nt_database_directory='/home/humebc/phylogeneticSoftware/ncbi-blast-2.6.0+/ntdbdownload',
data_sheet_path=None, noFig=False, noOrd=False, distance_method='braycurtis', debug=False):
############### UNZIP FILE, CREATE LIST OF SAMPLES AND WRITE stability.files FILE ##################
dataSubmissionInQ = data_set.objects.get(id=dSID)
cladeList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
    # We create the output directory early on so that we can use it to write out the sample by sample
    # thrown-away seqs.
outputDir = os.path.join(os.path.dirname(__file__), 'outputs/data_set_submissions/{}'.format(dSID))
os.makedirs(outputDir, exist_ok=True)
    if not dataSubmissionInQ.initialDataProcessed:
# Identify sample names and generate new stability file, generate data_set_sample objects in bulk
wkd, num_samples = generate_new_stability_file_and_data_set_sample_objects(cladeList, dSID, dataSubmissionInQ,
data_sheet_path, pathToInputFile)
################### PERFORM pre-MED QC #################
if screen_sub_evalue:
new_seqs_added_count, discarded_seqs_fasta = preMED_QC(dSID, dataSubmissionInQ, numProc, wkd, screen_sub_evalue, output_dir=outputDir, debug=debug)
else:
fasta_of_sig_sub_e_seqs, fasta_of_sig_sub_e_seqs_path, discarded_seqs_fasta = preMED_QC(dSID, dataSubmissionInQ, numProc, wkd, screen_sub_evalue, output_dir=outputDir, debug=debug)
    # This function now performs the MEDs sample by sample, clade by clade.
    # The list of output paths leads to the MED directories where node info etc. can be found
sys.stdout.write('\n\nStarting MED analysis\n')
MEDDirs = perform_MED(dataSubmissionInQ.workingDirectory, dataSubmissionInQ.id, numProc, debug)
if debug:
print('MED dirs:')
for dir in MEDDirs:
print(dir)
create_data_set_sample_sequences_from_MED_nodes(dataSubmissionInQ.workingDirectory, dataSubmissionInQ.id, MEDDirs, debug)
# dataSubmissionInQ.dataProcessed = True
dataSubmissionInQ.currentlyBeingProcessed = False
dataSubmissionInQ.save()
### WRITE OUT REPORT OF HOW MANY SAMPLES WERE SUCCESSFULLY PROCESSED
processed_samples_status(dataSubmissionInQ, pathToInputFile)
    # Here we also, by default, output a sequence drop: a dump of the named sequences and their associated
    # sequences, so that we maintain a link between the sequences and their names
perform_sequence_drop()
##### CLEAN UP tempData FOLDER #####
    # Get rid of the entire temp folder rather than just the individual wkd for this data submission,
    # just in case multiple stale submission directories are present
print('Cleaning up temp folders')
dir_to_del = os.path.abspath('{}/tempData'.format(pathToInputFile))
if os.path.exists(dir_to_del):
shutil.rmtree(dir_to_del)
# also make sure that we get rid of .logfile files in the symbiodiniumDB directory
symbiodiniumdb_dir = os.path.join(os.path.dirname(__file__), 'symbiodiniumDB')
log_file_list = [f for f in os.listdir(symbiodiniumdb_dir) if f.endswith(".logfile")]
for f in log_file_list:
os.remove('{}/{}'.format(symbiodiniumdb_dir, f))
####### COUNT TABLE OUTPUT ########
# We are going to make the sequence count table output as part of the dataSubmission
sys.stdout.write('\nGenerating count tables\n')
# the below method will create the tab delimited output table and print out the output file paths
# it will also return these paths so that we can use them to grab the data for figure plotting
output_path_list, date_time_str, num_samples = div_output_pre_analysis_new_meta_and_new_dss_structure(datasubstooutput=str(dSID),
numProcessors=numProc,
output_dir=outputDir, call_type='submission')
# also write out the fasta of the sequences that were discarded
discarded_seqs_fasta_path = '{}/discarded_seqs_{}.fasta'.format(outputDir, dSID)
writeListToDestination(discarded_seqs_fasta_path, discarded_seqs_fasta)
    print('A fasta containing discarded sequences ({}) is output here:\n{}'.format(len(discarded_seqs_fasta) // 2, discarded_seqs_fasta_path))
###################################
####### Stacked bar output fig #####
# here we will create a stacked bar
# I think it is easiest if we directly pass in the path of the above count table output
if not noFig:
if num_samples > 1000:
print('Too many samples ({}) to generate plots'.format(num_samples))
else:
sys.stdout.write('\nGenerating sequence count table figures\n')
for path in output_path_list:
if 'relative' in path:
path_to_rel_abund_data = path
svg_path, png_path = generate_stacked_bar_data_submission(path_to_rel_abund_data, outputDir, time_date_str=date_time_str)
sys.stdout.write('\nFigure generation complete')
sys.stdout.write('\nFigures output to:')
sys.stdout.write('\n{}'.format(svg_path))
sys.stdout.write('\n{}\n'.format(png_path))
####### between sample distances ######
if not noOrd:
print('Calculating between sample pairwise distances')
if distance_method == 'unifrac':
PCoA_paths_list = generate_within_clade_UniFrac_distances_samples(dataSubmission_str=dSID, num_processors=numProc,
method='mothur', call_type='submission', output_dir=outputDir)
elif distance_method == 'braycurtis':
PCoA_paths_list = generate_within_clade_BrayCurtis_distances_samples(dataSubmission_str=dSID, call_type='submission', output_dir=outputDir)
####### distance plotting #############
if not noFig:
if num_samples > 1000:
print('Too many samples ({}) to generate plots'.format(num_samples))
else:
for pcoa_path in PCoA_paths_list:
if 'PCoA_coords' in pcoa_path:
# then this is a full path to one of the .csv files that contains the coordinates that we can plot
# we will get the output directory from the passed in pcoa_path
sys.stdout.write('\n\nGenerating between sample distance plot clade {}\n'.format(os.path.dirname(pcoa_path).split('/')[-1]))
plot_between_sample_distance_scatter(pcoa_path)
####################################
#######################################
# write out whether there were below e value sequences outputted.
if screen_sub_evalue:
sys.stdout.write('{} sequences were added to the symClade.fa database as part of this data submission\n'.format(new_seqs_added_count))
else:
        if fasta_of_sig_sub_e_seqs:
            sys.stdout.write('{} distinct sequences from your submission were of questionable taxonomic origin.'
                             '\nSymPortal can\'t be sure that they are of Symbiodinium/Symbiodiniaceae origin despite them showing some degree of similarity to the reference sequences.'
                             '\nA .fasta has been output which contains these sequences here:'
                             '\n{}\n'.format(len(fasta_of_sig_sub_e_seqs) // 2, fasta_of_sig_sub_e_seqs_path))
else:
sys.stdout.write('There were no sub evalue sequences returned - hooray!\n')
print('data_set ID is: {}'.format(dataSubmissionInQ.id))
return dataSubmissionInQ.id
def generate_and_write_below_evalue_fasta_for_screening(dSID, dataSubmissionInQ, e_value_multiP_dict, wkd, debug):
# make fasta from the dict
fasta_out = make_evalue_screening_fasta_no_clade(dSID, e_value_multiP_dict, wkd)
    # we need to know what clade each of the sequences is from
    # the fastest way to do this is likely to run another blast against the symbiodinium clade reference db
if fasta_out:
fasta_out_with_clade = make_evalue_screening_fasta_with_clade(dataSubmissionInQ, fasta_out, wkd, debug)
# this will return a new fasta containing only the sequences that were 'Symbiodinium' matches
# we can then output this dictionary
path_to_fasta_out_with_clade = wkd + '/below_e_cutoff_seqs_{}.fasta'.format(dSID)
writeListToDestination(path_to_fasta_out_with_clade, fasta_out_with_clade)
return fasta_out_with_clade, path_to_fasta_out_with_clade
else:
return fasta_out, None
def make_evalue_screening_fasta_with_clade(dataSubmissionInQ, fasta_out, wkd, debug):
ncbircFile = []
db_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'symbiodiniumDB'))
ncbircFile.extend(["[BLAST]", "BLASTDB={}".format(db_path)])
# write the .ncbirc file that gives the location of the db
writeListToDestination("{0}/.ncbirc".format(wkd), ncbircFile)
blastOutputPath = r'{}/blast.out'.format(wkd)
outputFmt = "6 qseqid sseqid staxids evalue"
inputPath = r'{}/blastInputFasta.fa'.format(wkd)
os.chdir(wkd)
# Run local blast
# completedProcess = subprocess.run([blastnPath, '-out', blastOutputPath, '-outfmt', outputFmt, '-query', inputPath, '-db', 'symbiodinium.fa', '-max_target_seqs', '1', '-num_threads', '1'])
if not debug:
completedProcess = subprocess.run(
['blastn', '-out', blastOutputPath, '-outfmt', outputFmt, '-query', inputPath, '-db',
dataSubmissionInQ.reference_fasta_database_used, '-max_target_seqs', '1', '-num_threads', '1'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
elif debug:
subprocess.run(
['blastn', '-out', blastOutputPath, '-outfmt', outputFmt, '-query', inputPath, '-db',
dataSubmissionInQ.reference_fasta_database_used, '-max_target_seqs', '1', '-num_threads', '1'])
# Read in blast output
blast_output_file = readDefinedFileToList(r'{}/blast.out'.format(wkd))
if debug:
if not blast_output_file:
print('WARNING blast output file is empty for evalue screening')
else:
if len(blast_output_file) < 10:
print('WARNING blast output file for evalue screening has only {} lines'.format(len(blast_output_file)))
# now create a below_e_cutoff_seq to clade dictionary
sub_e_seq_to_clade_dict = {a.split('\t')[0]: a.split('\t')[1][-1] for a in blast_output_file}
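    # NB this assumes the subject sequence names in the reference db end in the clade letter,
    # so sseqid[-1] yields the clade; e.g. a hypothetical hit line
    # 'sub_e_seq_count_0_1_3<TAB>C3_cladeC<TAB>...' maps 'sub_e_seq_count_0_1_3' to clade 'C'.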
# print out the fasta with the clades appended to the end
fasta_out_with_clade = []
for line in fasta_out:
if line[0] == '>':
fasta_out_with_clade.append(line + '_clade' + sub_e_seq_to_clade_dict[line[1:]])
else:
fasta_out_with_clade.append(line)
return fasta_out_with_clade
def make_evalue_screening_fasta_no_clade(dSID, e_value_multiP_dict, wkd):
below_e_cutoff_dict = dict(e_value_multiP_dict)
temp_count = 0
fasta_out = []
for key, value in below_e_cutoff_dict.items():
        if value > 2:
            # then this is a sequence that was found in three or more samples
fasta_out.extend(['>sub_e_seq_count_{}_{}_{}'.format(temp_count, dSID, value), key])
temp_count += 1
if fasta_out:
writeListToDestination(wkd + '/blastInputFasta.fa', fasta_out)
return fasta_out
def perform_sequence_drop():
sequence_drop_file = generate_sequence_drop_file()
    sequence_drop_path = os.path.dirname(__file__) + '/dbBackUp/seq_dumps/seq_dump_' + str(datetime.now()).replace(' ', '_').replace(':', '-')
sys.stdout.write('\n\nBackup of named reference_sequences output to {}\n'.format(sequence_drop_path))
writeListToDestination(sequence_drop_path, sequence_drop_file)
def processed_samples_status(dataSubmissionInQ, pathToInputFile):
sampleList = data_set_sample.objects.filter(dataSubmissionFrom=dataSubmissionInQ)
failedList = []
for sample in sampleList:
if sample.errorInProcessing:
failedList.append(sample.name)
readMeList = []
    sumMessage = '\n\n{0} out of {1} samples successfully passed QC.\n' \
                 '{2} samples produced errors\n'.format((len(sampleList) - len(failedList)), len(sampleList),
                                                        len(failedList))
print(sumMessage)
readMeList.append(sumMessage)
for sample in sampleList:
if sample.name not in failedList:
            print('Sample {} processed successfully'.format(sample.name))
            readMeList.append('Sample {} processed successfully'.format(sample.name))
else:
print('Sample {} : {}'.format(sample.name, sample.errorReason))
for sampleName in failedList:
readMeList.append('Sample {} : ERROR in sequencing reads. Unable to process'.format(sampleName))
writeListToDestination(pathToInputFile + '/readMe.txt', readMeList)
def taxonomic_screening(wkd, dSID, numProc, dataSubmissionInQ, error_sample_list, screen_sub_e, output_dir, debug):
sampleFastQPairs = readDefinedFileToList(r'{0}/stability.files'.format(wkd))
    # If we will be screening the sequences,
    # at this point we should create a backup of the current symClade db.
    # If not screening, there is no need to back up.
if screen_sub_e:
        # We should be able to make this much faster by having a list that keeps track of which samples have
        # already reported having 0 sequences thrown out due to being too divergent from reference sequences.
        # We populate this list when checking a sample in execute_worker_taxa_screening.
checked_samples = []
new_seqs_add_to_symClade_db_count = 0
create_symClade_backup(dSID)
        # Here we continue to do screening until we add no more sub_evalue seqs to the symClade.fa ref db,
        # or until we find no sub_e_seqs in the first place.
        # A sub_e seq is one that did return a match when run against the symClade database (blast) but was
        # below the e value cutoff required. We screen these to make sure that we are not throwing away
        # Symbiodinium sequences just because they don't have representatives in the reference symClade db.
found = True
        while found:
            # every time the execute_worker is run it pickles out the files needed for the next worker
e_value_multiP_dict, checked_samples = execute_worker_taxa_screening(dataSubmissionInQ, error_sample_list, numProc,
sampleFastQPairs, wkd, debug=debug, checked_samples=checked_samples)
if len(e_value_multiP_dict) == 0:
# then there are no sub_e_seqs to screen and we can exit
break
# this is where we will do the screening.
# the outcome of this should be an updated symClade.fa that we should then make a blastdb from
# if we indeed find that some of the sequences that were below the evalue cut off are symbiodinium
# then this should return True. else False.
            # From the e_value_multiP_dict generate a fasta of the sequences that were found in three or more samples
# pass this to the screen_sub_e_seqs
# The number of samples a sequence must have been found in for us to consider it for screening is taken into
# account when the fasta is written out.
            # This function will return False if we end up with no sequences to screen.
            # It will otherwise return the list that is the fasta that was written out.
fasta_out, fasta_out_path = generate_and_write_below_evalue_fasta_for_screening(dSID, dataSubmissionInQ,
e_value_multiP_dict, wkd, debug)
# we only need to screen if we have a fasta to screen
if fasta_out:
                # Here found represents whether we found any of the seqs to be Symbiodinium.
                # If False is returned then no change has been made to the symClade db.
found, new_seqs = screen_sub_e_seqs(data_set_id=dSID, wkd=wkd)
new_seqs_add_to_symClade_db_count += new_seqs
else:
found = False
else:
        # If not doing the screening we can simply run execute_worker_taxa_screening once.
        # During its run it will have output all of the files we need to run the following workers.
        # We can also run the generate_and_write_below... function to write out a fasta of significant sequences
        # that we can then use to report to the user.
e_value_multiP_dict = execute_worker_taxa_screening(dataSubmissionInQ, error_sample_list, numProc,
sampleFastQPairs, wkd, debug=debug)
fasta_out, fasta_out_path = generate_and_write_below_evalue_fasta_for_screening(dSID, dataSubmissionInQ,
e_value_multiP_dict, wkd, debug)
# input_q, wkd, dataSubID
# worker_taxonomy_write_out
# Create the queues that will hold the sample information
input_q = Queue()
# The list that contains the names of the samples that returned errors during the initial mothur